UB TP comm overlap config update #333

Open · wants to merge 4 commits into base: main
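This change removes the per-model tp_comm_atomic_ag and tp_comm_atomic_rs flags from the PEFT and training launcher configs, and reworks the userbuffers (UB) tensor-parallel communication-overlap settings in launcher_scripts/conf/training/gpt3/mlperf.yaml (per-GEMM method, num_sm, and buffer options). As a minimal sketch of the resulting shape, with the key layout assumed rather than copied from any single file:

# Sketch only: layout assumed; only the keys named here appear in this diff.
model:
  ub_tp_comm_overlap: False   # top-level switch for UB TP-comm overlap
  # tp_comm_atomic_ag / tp_comm_atomic_rs are removed by this PR;
  # per-GEMM overlap behavior is set in per-GEMM sections instead
  # (method, num_sm, atomic_gemm, ...), as in the mlperf.yaml hunks below.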
2 changes: 0 additions & 2 deletions launcher_scripts/conf/peft/llama/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   megatron_amp_O2: True
   mcore_gpt: True
2 changes: 0 additions & 2 deletions launcher_scripts/conf/peft/nemotron/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   megatron_amp_O2: True
   mcore_gpt: True
2 changes: 0 additions & 2 deletions launcher_scripts/conf/peft/qwen2/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   megatron_amp_O2: True
   mcore_gpt: True
2 changes: 0 additions & 2 deletions launcher_scripts/conf/peft/starcoder2/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   megatron_amp_O2: True
   mcore_gpt: True
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/126m.yaml
@@ -144,8 +144,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/175b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/175b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/175b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/175b_fp8.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/1b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   optim:
     name: distributed_fused_adam
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/20b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/400m_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   optim:
     name: distributed_fused_adam
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/40b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/40b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/40b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/40b_64k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/40b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   optim:
     name: distributed_fused_adam
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/5b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/5b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/5b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/5b_64k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   # miscellaneous
   seed: 1234
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/gpt3/7b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   optim:
     name: distributed_fused_adam
14 changes: 5 additions & 9 deletions launcher_scripts/conf/training/gpt3/mlperf.yaml
@@ -189,18 +189,18 @@ model:
       aggregate: 0
     fc2_dgrad:
       method: ring_exchange
-      aggregate: 1
+      aggregate: 0
     proj_fprop:
       method: pipeline
       num_sm: 24
       cga_size: 2
       num_splits: 4
       set_sm_margin: 1
+      atomic_gemm: 1
+      fp8_buf: 1
     fc2_fprop:
-      method: pipeline
-      num_sm: 4
-      cga_size: 2
-      num_splits: 4
+      method: ring_exchange
+      num_sm: 1
       set_sm_margin: 1
   use_flash_attention: false
   cpu_offloading: false
@@ -268,7 +268,3 @@ model:
   gc_interval: 100
   name: megatron_gpt_full_te_layer_autocast
   fp8_params: true
-  tp_comm_split_ag: true
-  tp_comm_split_rs: false
-  tp_comm_atomic_ag: false
-  tp_comm_atomic_rs: true
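For context on the mlperf.yaml hunks above: each per-GEMM entry (fc2_dgrad, proj_fprop, fc2_fprop, ...) selects an overlap method and its tuning knobs. The annotated restatement below uses the values from this diff; the comments are one reading of the Transformer Engine userbuffers options, not text from this PR.

# Values copied from the hunk above; comments are interpretive.
proj_fprop:
  method: pipeline        # chunk the GEMM and pipeline it against communication
  num_sm: 24              # SMs assigned to the communication kernel
  cga_size: 2             # cooperative grid array (CGA) size for the comm kernel
  num_splits: 4           # number of GEMM chunks in the pipeline
  set_sm_margin: 1        # reserve an SM margin for communication
  atomic_gemm: 1          # added here: atomic-counter GEMM synchronization
  fp8_buf: 1              # added here: keep the communication buffer in FP8
fc2_fprop:
  method: ring_exchange   # was pipeline; ring exchange needs far fewer SMs
  num_sm: 1
  set_sm_margin: 1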
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/llama/llama2_13b.yaml
@@ -136,8 +136,6 @@ model:
   ub_tp_comm_overlap: false
   overlap_p2p_comm: true
   batch_p2p_comm: false
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -134,8 +134,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   overlap_p2p_comm: true
   batch_p2p_comm: false
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/llama/llama2_7b.yaml
@@ -136,8 +136,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/nemotron/nemotron_15b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   nsys_profile:
     enabled: False
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/nemotron/nemotron_22b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   gc_interval: 100

2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/nemotron/nemotron_8b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False

   nsys_profile:
     enabled: False
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/qwen2/qwen2_14b.yaml
@@ -138,8 +138,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/qwen2/qwen2_4b.yaml
@@ -140,8 +140,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/qwen2/qwen2_72b.yaml
@@ -137,8 +137,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   overlap_p2p_comm: true
   batch_p2p_comm: false
2 changes: 0 additions & 2 deletions launcher_scripts/conf/training/qwen2/qwen2_7b.yaml
@@ -140,8 +140,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
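For downstream configs that still set the removed flags, the matching cleanup is simply to drop them, as these hunks do throughout (a sketch; the PR itself only touches the bundled configs):

# Before: pattern removed throughout this PR.
#   ub_tp_comm_overlap: False
#   tp_comm_atomic_ag: False
#   tp_comm_atomic_rs: False
# After: keep only the top-level switch.
ub_tp_comm_overlap: False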