From e234dcef3061691293faa65c2dcb07a38622dc7c Mon Sep 17 00:00:00 2001
From: Yan Gao <yangao07@hit.edu.cn>
Date: Fri, 5 Apr 2024 16:20:11 -0400
Subject: [PATCH] fix instruction-dependent result difference and seg fault due
 to large memory usage (#66)

---
 Makefile               |  18 +-
 TODO.md                |   9 +
 include/abpoa.h        |   5 +
 src/abpoa.c            |   2 +
 src/abpoa.h            |   5 +
 src/abpoa_align.c      |  19 +-
 src/abpoa_output.c     |  49 ++---
 src/abpoa_seed.c       |  60 +++---
 src/simd_abpoa_align.c | 451 +++++++++++++++++++++++------------------
 9 files changed, 350 insertions(+), 268 deletions(-)
 create mode 100644 TODO.md

diff --git a/Makefile b/Makefile
index 0b6df3f..2c5f8f0 100644
--- a/Makefile
+++ b/Makefile
@@ -4,20 +4,28 @@ default: abpoa
 #CC          = gcc
 OS          := $(shell uname)
 ARCH        := $(shell arch)
-EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
-CFLAGS      = -Wall -O3 $(EXTRA_FLAGS)
+# add -fno-tree-vectorize to avoid certain vectorization errors in O3 optimization
+# right now, we are using -O3 for the best performance, and no vectorization errors were found
+EXTRA_FLAGS = -Wall -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES # -fno-tree-vectorize
 
+DFLAGS	  =
 # for debug
 ifneq ($(debug),)
-	DFLAGS   =   -D __DEBUG__
+	DFLAGS  +=   -D __DEBUG__
 endif
+ifneq ($(sdebug),)
+	DFLAGS  +=   -D __SIMD_DEBUG__
+endif
+
 # for gdb
 ifneq ($(gdb),)
-	CFLAGS   = -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
+	OPT_FLAGS = -g
 else
-	CFLAGS   = -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
+	OPT_FLAGS = -O3
 endif
 
+CFLAGS        = OPT_FLAGS ${DFLAGS} $(EXTRA_FLAGS)
+
 # for gprof
 ifneq ($(pg),)
 	PG_FLAG  =   -pg
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..3c7e96d
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,9 @@
+# To test after commit
+1. run through
+   1. SSE/AVX2/AVX512
+   2. 16/32bits
+   3. global/local
+   4. lg/ag/cg
+2. same result
+   1. SSE vs AVX2 vs AVX512
+   2. 16bits vs 32 bits
\ No newline at end of file
diff --git a/include/abpoa.h b/include/abpoa.h
index 87bbdcf..ae9a6b7 100644
--- a/include/abpoa.h
+++ b/include/abpoa.h
@@ -38,6 +38,11 @@
 #define ABPOA_HB 0
 #define ABPOA_HC 1
 
+#define ABPOA_NONE_VERBOSE 0
+#define ABPOA_INFO_VERBOSE 1
+#define ABPOA_DEBUG_VERBOSE 2
+#define ABPOA_LONG_DEBUG_VERBOSE 3
+
 // NOTE: upper boundary of in_edge_n is pow(2,30)
 // for MATCH/MISMATCH: node_id << 34  | query_id << 4 | op
 // for INSERTION:      query_id << 34 | op_len << 4   | op
diff --git a/src/abpoa.c b/src/abpoa.c
index 64105c1..031ee4b 100644
--- a/src/abpoa.c
+++ b/src/abpoa.c
@@ -54,6 +54,7 @@ const struct option abpoa_long_opt [] = {
 
     { "help", 0, NULL, 'h' },
     { "version", 0, NULL, 'v' },
+    { "verbose", 1, NULL, 'V'},
 
     { 0, 0, 0, 0}
 };
@@ -122,6 +123,7 @@ int abpoa_usage(void)
 
     err_printf("    -h --help               print this help usage information\n");
     err_printf("    -v --version            show version number\n");
+    err_printf("    -V --verbose     INT    verbose level (0-2). 0: none, 1: information, 2: debug [0]\n");
 
 
     err_printf("\n");
diff --git a/src/abpoa.h b/src/abpoa.h
index 87bbdcf..ae9a6b7 100644
--- a/src/abpoa.h
+++ b/src/abpoa.h
@@ -38,6 +38,11 @@
 #define ABPOA_HB 0
 #define ABPOA_HC 1
 
+#define ABPOA_NONE_VERBOSE 0
+#define ABPOA_INFO_VERBOSE 1
+#define ABPOA_DEBUG_VERBOSE 2
+#define ABPOA_LONG_DEBUG_VERBOSE 3
+
 // NOTE: upper boundary of in_edge_n is pow(2,30)
 // for MATCH/MISMATCH: node_id << 34  | query_id << 4 | op
 // for INSERTION:      query_id << 34 | op_len << 4   | op
diff --git a/src/abpoa_align.c b/src/abpoa_align.c
index 7eca502..3829392 100644
--- a/src/abpoa_align.c
+++ b/src/abpoa_align.c
@@ -133,7 +133,7 @@ abpoa_para_t *abpoa_init_para(void) {
     abpt->min_w = ABPOA_MIN_POA_WIN;
     abpt->progressive_poa = 0; // progressive partial order alignment
 
-    abpt->verbose = 0;
+    abpt->verbose = ABPOA_NONE_VERBOSE;
 
     // abpt->simd_flag = simd_check();
 
@@ -199,9 +199,7 @@ int abpoa_anchor_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weig
     // uint8_t *seq1;
     for (_i = 0; _i < n_seq; ++_i) {
         i = read_id_map[_i]; read_id = exist_n_seq + i; qlen = seq_lens[i]; whole_res.n_cigar = 0, whole_res.m_cigar = 0, whole_res.graph_cigar = 0;
-#ifdef __DEBUG__
-        fprintf(stderr, "seq: # %d\n", i);
-#endif
+        if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "seq: # %d\n", i);
         // seq-to-graph alignment and add alignment within each split window
         if (_i == 0) ai = 0; else ai = par_c[_i-1];
 
@@ -252,10 +250,7 @@ int abpoa_anchor_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weig
         for (; ai < par_c[_i]; ++ai) {
             end_tpos = ((par_anchors.a[ai] >> 32) & 0x7fffffff) - k + 1; end_id = tpos_to_node_id[end_tpos];
             end_qpos = (uint32_t)par_anchors.a[ai] - k + 1;
-
-#ifdef __DEBUG__
-            fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos);
-#endif
+            if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos);
 
             res.graph_cigar = 0; res.n_cigar = 0;
             abpoa_align_sequence_to_subgraph(ab, abpt, beg_id, end_id, qseq+beg_qpos, end_qpos-beg_qpos, &res);
@@ -277,9 +272,7 @@ int abpoa_anchor_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weig
         }
         end_id = ABPOA_SINK_NODE_ID; end_qpos = seq_lens[i];
 
-#ifdef __DEBUG__
-            fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos);
-#endif
+        if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos);
         res.graph_cigar = 0; res.n_cigar = 0;
         abpoa_align_sequence_to_subgraph(ab, abpt, beg_id, end_id, qseq+beg_qpos, end_qpos-beg_qpos, &res);
         abpoa_push_whole_cigar(&whole_res.n_cigar, &whole_res.m_cigar, &whole_res.graph_cigar, res.n_cigar, res.graph_cigar);
@@ -307,9 +300,7 @@ int abpoa_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weights, in
     // uint8_t *seq1;
     for (i = 0; i < n_seq; ++i) {
         qlen = seq_lens[i]; qseq = seqs[i]; weight = weights[i]; read_id = exist_n_seq + i;
-#ifdef __DEBUG__
-        fprintf(stderr, "seq: # %d\n", i);
-#endif
+        if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "seq: # %d\n", i);
         res.graph_cigar = 0; res.n_cigar = 0;
         if (abpoa_align_sequence_to_graph(ab, abpt, qseq, qlen, &res) >= 0) {
             if (abpt->amb_strand && (res.best_score < MIN_OF_TWO(qlen, ab->abg->node_n-2) * abpt->max_mat * .3333)) { // TODO .3333
diff --git a/src/abpoa_output.c b/src/abpoa_output.c
index 697962d..72c1192 100644
--- a/src/abpoa_output.c
+++ b/src/abpoa_output.c
@@ -728,7 +728,7 @@ int reassign_hap(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int rea
 
 // read_weight is NOT used here, no matter use_qv is set or not.
 // collect minimized set of het bases
-int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids, int *het_poss, int **rc_weight, int msa_l, int n_seq, int m, int min_w, int read_ids_n) {
+int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids, int *het_poss, int **rc_weight, int msa_l, int n_seq, int m, int min_w, int read_ids_n, int verbose) {
     int i, j, k, n, rank;
     uint64_t b, one = 1, *whole_read_ids = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t));
     for (i = 0; i < n_seq; ++i) {
@@ -782,12 +782,12 @@ int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids
             if (iden == 1) continue;
 
             het_poss[n_het_pos++] = rank;
-#ifdef __DEBUG__
-            fprintf(stderr, "%d\t", rank);
-            for (j = 0; j < m; ++j) {
-                fprintf(stderr, "%c: %d\t", "ACGT-"[j], rc_weight[rank][j]);
-            } fprintf(stderr, "\n");
-#endif
+            if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) {
+                fprintf(stderr, "%d\t", rank);
+                for (j = 0; j < m; ++j) {
+                    fprintf(stderr, "%c: %d\t", "ACGT-"[j], rc_weight[rank][j]);
+                } fprintf(stderr, "\n");
+            }
         }
     }
     free(n_branch); free(node_map);
@@ -796,7 +796,8 @@ int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids
 
 // group read into clusters based on all het bases
 // initial cluster size could be > max_n_cons
-int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***read_ids, int read_ids_n, int n_seq, int m, int min_w, int max_n_cons, uint64_t ***clu_read_ids, int *_m_clu) {
+int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***read_ids, int read_ids_n, 
+                                   int n_seq, int m, int min_w, int max_n_cons, uint64_t ***clu_read_ids, int *_m_clu, int verbose) {
     if (n_het_pos == 0) return 1;
     int i, j, k, n_clu = 0, m_clu = 2;
     int **clu_haps = (int**)_err_malloc(2 * sizeof(int*));
@@ -832,25 +833,25 @@ int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***rea
         }
     }
     if (n_clu < 2) err_fatal(__func__, "# haplotypes: %d\n", n_clu);
-#ifdef __DEBUG__
-    fprintf(stderr, "n_clu: %d\n", n_clu);
-    for (i = 0; i < n_clu; ++i) {
-        for (j = 0; j < n_het_pos; ++j) {
-            fprintf(stderr, "%d\t", clu_haps[i][j]);
+    if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) {
+        fprintf(stderr, "n_clu: %d\n", n_clu);
+        for (i = 0; i < n_clu; ++i) {
+            for (j = 0; j < n_het_pos; ++j) {
+                fprintf(stderr, "%d\t", clu_haps[i][j]);
+            }
+            fprintf(stderr, "\tsize: %d\n", clu_size[i]);
         }
-        fprintf(stderr, "\tsize: %d\n", clu_size[i]);
     }
-#endif
 
     // assign haplotype with reads < min_w to haplotype with reads >= min_w
     // keep at most _max_n_cons_ haps and read ids, weight need to >= min_w
     n_clu = reassign_hap(clu_haps, clu_size, *clu_read_ids, read_ids_n, n_clu, min_w, max_n_cons, n_het_pos);
-#ifdef __DEBUG__
-    fprintf(stderr, "After re-assign: n_clu: %d\n", n_clu);
-    for (i = 0; i < n_clu; ++i) {
-        fprintf(stderr, "%d:\tsize: %d\n", i, clu_size[i]);
+    if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) {
+        fprintf(stderr, "After re-assign: n_clu: %d\n", n_clu);
+        for (i = 0; i < n_clu; ++i) {
+            fprintf(stderr, "%d:\tsize: %d\n", i, clu_size[i]);
+        }
     }
-#endif
     for (i = 0; i < m_clu; ++i) free(clu_haps[i]); free(clu_haps); free(clu_size);
     *_m_clu = m_clu;
     return n_clu;
@@ -858,7 +859,7 @@ int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***rea
 
 // read_weight is NOT used here
 // cluster reads into _n_clu_ groups based on heterogeneous bases
-int abpoa_multip_read_clu(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq, int m, int max_n_cons, double min_freq, uint64_t ***clu_read_ids, int *_m_clu) {
+int abpoa_multip_read_clu(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq, int m, int max_n_cons, double min_freq, uint64_t ***clu_read_ids, int *_m_clu, int verbose) {
     abpoa_set_msa_rank(abg, src_id, sink_id);
     int i, j, n_clu, m_clu, read_ids_n = (n_seq-1)/64+1;
     int msa_l = abg->node_id_to_msa_rank[sink_id]-1, min_w = MAX_OF_TWO(1, n_seq * min_freq); // TODO fastq-qual weight
@@ -878,11 +879,11 @@ int abpoa_multip_read_clu(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq
     } 
     // find min set of het nodes
     int *het_poss = (int*)_err_calloc(msa_l, sizeof(int));
-    int n_het_pos = abpoa_set_het_row_column_ids_weight(abg, read_ids, het_poss, rc_weight, msa_l, n_seq, m, min_w, read_ids_n);
+    int n_het_pos = abpoa_set_het_row_column_ids_weight(abg, read_ids, het_poss, rc_weight, msa_l, n_seq, m, min_w, read_ids_n, verbose);
     
     if (n_het_pos < 1) n_clu = 1;
     // collect at most _max_n_cons_ haplotypes and corresponding read ids
-    else n_clu = abpoa_collect_clu_hap_read_ids(het_poss, n_het_pos, read_ids, read_ids_n, n_seq, m, min_w, max_n_cons, clu_read_ids, &m_clu);
+    else n_clu = abpoa_collect_clu_hap_read_ids(het_poss, n_het_pos, read_ids, read_ids_n, n_seq, m, min_w, max_n_cons, clu_read_ids, &m_clu, verbose);
 
     for (i = 0; i < msa_l; ++i) {
         for (j = 0; j < m; ++j) free(read_ids[i][j]);
@@ -906,7 +907,7 @@ void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt) {
     int n_clu, m_clu, n_seq = ab->abs->n_seq; uint64_t **clu_read_ids;
     int read_ids_n = (n_seq-1)/64+1;
 
-    if (abpt->max_n_cons > 1) n_clu = abpoa_multip_read_clu(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, n_seq, abpt->m, abpt->max_n_cons, abpt->min_freq, &clu_read_ids, &m_clu);
+    if (abpt->max_n_cons > 1) n_clu = abpoa_multip_read_clu(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, n_seq, abpt->m, abpt->max_n_cons, abpt->min_freq, &clu_read_ids, &m_clu, abpt->verbose);
     else n_clu = 1;
 
     abpoa_cons_t *abc = ab->abc;
diff --git a/src/abpoa_seed.c b/src/abpoa_seed.c
index 13cbb05..277413e 100644
--- a/src/abpoa_seed.c
+++ b/src/abpoa_seed.c
@@ -231,7 +231,7 @@ void mm_aa_sketch(void *km, const uint8_t *str, int len, int w, int k, uint32_t
 int abpoa_build_guide_tree(abpoa_para_t *abpt, int n_seq, ab_u128_v *mm, int *tree_id_map) {
     if (mm->n == 0) return 0;
 
-    if (abpt->verbose > 0) fprintf(stderr, "[%s] Building progressive guide tree ... ", __func__);
+    if (abpt->verbose >= ABPOA_INFO_VERBOSE) fprintf(stderr, "[%s] Building progressive guide tree ... ", __func__);
     size_t i, _i, j; int rid1, rid2;                                          // mm_hit_n: mimizer hits between each two sequences
                                                                               // 0: 0
                                                                               // 1: 0 1
@@ -319,7 +319,7 @@ int abpoa_build_guide_tree(abpoa_para_t *abpt, int n_seq, ab_u128_v *mm, int *tr
     }
 
     free(mm_hit_n); free(jac_sim);
-    if (abpt->verbose > 0) fprintf(stderr, "done!\n");
+    if (abpt->verbose >= ABPOA_INFO_VERBOSE) fprintf(stderr, "done!\n");
     return 0;
 }
 
@@ -382,7 +382,9 @@ int get_local_chain_score(int j_end_tpos, int j_end_qpos, int i_end_anchor_i, ab
 // local chains:
 //   x: strand | end_tpos | end_qpos
 //   y: end_anchor_i | start_anchor_i
-int abpoa_dp_chaining_of_local_chains(void *km, ab_u128_t *local_chains, int n_local_chains, ab_u64_v *anchors, int *score, int *pre_id, ab_u64_v *par_anchors, int min_w, int tlen, int qlen) {
+int abpoa_dp_chaining_of_local_chains(void *km, ab_u128_t *local_chains, int n_local_chains, 
+                                      ab_u64_v *anchors, int *score, int *pre_id, ab_u64_v *par_anchors, 
+                                      int min_w, int tlen, int qlen, int verbose) {
     int i, j, st, score1, global_max_score=INT32_MIN, global_max_i=-1;
     int *chain_score = (int*)kmalloc(km, n_local_chains * 4), *pre_chain_id = (int*)kmalloc(km, n_local_chains * 4);
     size_t _n = par_anchors->n;
@@ -452,13 +454,13 @@ int abpoa_dp_chaining_of_local_chains(void *km, ab_u128_t *local_chains, int n_l
         par_anchors->a[par_anchors->n-i-1] = tmp;
     }
 
-#ifdef __DEBUG__
-    for (i = _n; i < par_anchors->n; ++i) {
-        uint64_t ia = par_anchors->a[i];
-        // strand, rpos, qpos
-        fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia));
+    if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) {
+        for (i = _n; i < par_anchors->n; ++i) {
+            uint64_t ia = par_anchors->a[i];
+            // strand, rpos, qpos
+            fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia));
+        }
     }
-#endif
     kfree(km, chain_score), kfree(km, pre_chain_id);
     return 0;
 }
@@ -482,7 +484,7 @@ static int get_chain_score(int max_bw, int *score, int i_qpos, int i_tpos, int j
 // Dynamic Programming-based Chaining for global alignment mode
 // anchors:
 //          strand<<63 | tpos<<32 | qpos
-int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_para_t *abpt, int tlen, int qlen) {
+int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_para_t *abpt, int tlen, int qlen, int verbose) {
     int i, j, st, n_a = anchors->n;
     int *score = (int32_t*)kmalloc(km, n_a * 4), *pre_id = (int32_t*)kmalloc(km, n_a * 4), *end_pos = (int32_t*)kmalloc(km, n_a * 4);
     memset(end_pos, 0, n_a * 4);
@@ -516,9 +518,7 @@ int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_
 
             if (pre_id[j] >= 0) end_pos[pre_id[j]] = i;
         }
-#ifdef __DEBUG__
-        fprintf(stderr, "%d pre_id: %d, score: %d, tpos: %d, qpos: %d\n", i, max_j, max_score, i_tpos, i_qpos);
-#endif
+        if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) fprintf(stderr, "%d pre_id: %d, score: %d, tpos: %d, qpos: %d\n", i, max_j, max_score, i_tpos, i_qpos);
         score[i] = max_score, pre_id[i] = max_j;
     }
 
@@ -570,7 +570,8 @@ int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_
     }
 
     radix_sort_ab_128x(local_chains+tot_chain_i+1, local_chains + n_local_chains);
-    abpoa_dp_chaining_of_local_chains(km, local_chains+tot_chain_i+1, n_local_chains-1-tot_chain_i, anchors, score, pre_id, par_anchors, min_w, tlen, qlen);
+    abpoa_dp_chaining_of_local_chains(km, local_chains+tot_chain_i+1, n_local_chains-1-tot_chain_i, 
+                                      anchors, score, pre_id, par_anchors, min_w, tlen, qlen, abpt->verbose);
 
     kfree(km, score); kfree(km, pre_id); kfree(km, end_pos); kfree(km, local_chains);
     return 0;
@@ -630,7 +631,7 @@ int LIS(void *km, int tot_n, uint64_t *rank, int n) {
 // output:
 //   anchor list size: n
 //   list of anchors: anchors
-int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w) {
+int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w, int verbose) {
     size_t i, j, n_a = anchors->n, n_for, n_rev;
     uint64_t *for_rank = (uint64_t*)kmalloc(km, sizeof(uint64_t) * n_a);
     uint64_t *rev_rank = (uint64_t*)kmalloc(km, sizeof(uint64_t) * n_a);
@@ -663,10 +664,8 @@ int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w)
         n = n_rev; rank = rev_rank; kfree(km, for_rank);
     }
     // filter anchors
-    int last_tpos = -1, last_qpos = -1, cur_tpos, cur_qpos;
-#ifdef __DEBUG__
-    size_t _n = par_anchors->n;
-#endif
+    int last_tpos = -1, last_qpos = -1, cur_tpos, cur_qpos; size_t _n = 0;
+    if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) _n = par_anchors->n;
     for (i = 0; i < n; ++i) {
         j = (int)rank[i]-1;
         cur_tpos = (anchors->a[j] >> 32) & 0x7fffffff;
@@ -677,18 +676,19 @@ int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w)
         kv_push(uint64_t, 0, *par_anchors, anchors->a[j]); // store LIS anchors into par_anchors
         last_tpos = cur_tpos; last_qpos = cur_qpos;
     }
-#ifdef __DEBUG__
-    for (i = _n; i < par_anchors->n; ++i) {
-        uint64_t ia = par_anchors->a[i];
-        // strand, rpos, qpos
-        fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia));
+
+    if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) {
+        for (i = _n; i < par_anchors->n; ++i) {
+            uint64_t ia = par_anchors->a[i];
+            // strand, rpos, qpos
+            fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia));
+        }
     }
-#endif
     return 0;
 }
 
 int abpoa_collect_mm(void *km, uint8_t **seqs, int *seq_lens, int n_seq, abpoa_para_t *abpt, ab_u128_v *mm, int *mm_c) {
-    if (abpt->verbose > 0) fprintf(stderr, "[%s] Collecting minimizers ... ", __func__);
+    if (abpt->verbose >= ABPOA_INFO_VERBOSE) fprintf(stderr, "[%s] Collecting minimizers ... ", __func__);
     int i;
     mm_c[0] = 0;
     for (i = 0; i < n_seq; ++i) { // collect minimizers
@@ -696,7 +696,7 @@ int abpoa_collect_mm(void *km, uint8_t **seqs, int *seq_lens, int n_seq, abpoa_p
         else mm_sketch(km, seqs[i], seq_lens[i], abpt->w, abpt->k, i, 0, abpt->amb_strand, mm);
         mm_c[i+1] = mm->n;
     }
-    if (abpt->verbose > 0) fprintf(stderr, "done!\n");
+    if (abpt->verbose >= ABPOA_INFO_VERBOSE) fprintf(stderr, "done!\n");
     return mm->n;
 }
 
@@ -731,11 +731,9 @@ int abpoa_build_guide_tree_partition(uint8_t **seqs, int *seq_lens, int n_seq, a
         // collect minimizer hit anchors between t and q
         collect_anchors1(km, &anchors, mm1, mm_c, tid, qid, seq_lens[qid], abpt->k);
         // filtering and only keep LIS anchors
-#ifdef __DEBUG__
-        fprintf(stderr, "%d vs %d (tot_n: %ld)\n", tid, qid, anchors.n);
-#endif
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) fprintf(stderr, "%d vs %d (tot_n: %ld)\n", tid, qid, anchors.n);
         // alignment mode: different chaining result for global/local/extend
-        abpoa_dp_chaining(km, &anchors, par_anchors, abpt, seq_lens[tid], seq_lens[qid]);
+        abpoa_dp_chaining(km, &anchors, par_anchors, abpt, seq_lens[tid], seq_lens[qid], abpt->verbose);
         par_c[i] = par_anchors->n;
         kfree(km, anchors.a);
     }
diff --git a/src/simd_abpoa_align.c b/src/simd_abpoa_align.c
index 9c2363c..6362ec1 100644
--- a/src/simd_abpoa_align.c
+++ b/src/simd_abpoa_align.c
@@ -35,6 +35,16 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
 #define SIMDTotalBytes 16
 #endif
 
+void simd_output_pre_nodes(int *pre_index, int pre_n, int dp_i, int dp_j, int cur_op, int verbose) {
+    if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) {
+        int k;
+        fprintf(stderr, "%d, %d, %d\t", dp_i, dp_j, cur_op);
+        for (k = 0; k < pre_n; ++k) { /* print all pre node for node i*/
+            fprintf(stderr, "%d, ", pre_index[k]);
+        } fprintf(stderr, "\n");
+    }
+}
+
 #define print_simd(s, str, score_t) {                                \
     int _i; score_t *_a = (score_t*)(s);                             \
     fprintf(stderr, "%s\t", str);                                    \
@@ -43,7 +53,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     } fprintf(stderr, "\n");                                         \
 }
 
-
 #define simd_abpoa_print_lg_matrix(score_t, beg_index, end_index) { \
     for (j = 0; j < end_index-beg_index; ++j) {                     \
         fprintf(stderr, "index: %d\t", j);                          \
@@ -56,7 +65,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
 }
 
 #define simd_abpoa_print_ag_matrix(score_t, beg_index, end_index) {  \
-    for (j = beg_index; j < end_index; ++j) {                        \
+    for (j = 0; j < end_index-beg_index; ++j) {                      \
         fprintf(stderr, "index: %d\t", j);                           \
         dp_h = DP_HEF + j * 3 * dp_sn; dp_e1 = dp_h + dp_sn;         \
         _dp_h = (score_t*)dp_h, _dp_e1 = (score_t*)dp_e1;            \
@@ -70,7 +79,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     score_t *_dp_h = (score_t*)dp_h, *_dp_e1 = (score_t*)dp_e1;                                          \
     score_t *_dp_e2 = (score_t*)dp_e2, *_dp_f1 = (score_t*)dp_f1, *_dp_f2 = (score_t*)dp_f2;             \
     fprintf(stderr, "%s\tindex: %d\t", str, index_i);                                                    \
-    for (i = dp_beg[index_i]; i <= (dp_end[index_i]/16+1)*16-1; ++i) {                                   \
+    for (i = dp_beg[index_i]; i <= dp_end[index_i]; ++i) {                                               \
         fprintf(stderr, "%d:(%d,%d,%d,%d,%d)\t", i, _dp_h[i], _dp_e1[i],_dp_e2[i], _dp_f1[i],_dp_f2[i]); \
     } fprintf(stderr, "\n");                                                                             \
 }
@@ -169,7 +178,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
             }                                                                                               \
         }                                                                                                   \
         if (hit == 0) err_fatal_simple("Error in lg_backtrack.");                                           \
-     /* fprintf(stderr, "%d, %d, (%d)\n", i, j, indel_first); */                                            \
+        simd_output_pre_nodes(pre_index[i], pre_n[i], i, j, 0, abpt->verbose);                              \
     }                                                                                                       \
     if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1);                         \
     /* reverse cigar */                                                                                     \
@@ -273,7 +282,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
             }                                                                                               \
         }                                                                                                   \
         if (hit == 0) err_fatal_simple("Error in ag_backtrack.");                                           \
-     /* fprintf(stderr, "%d, %d, %d (indel_first: %d)\n", i, j, cur_op, indel_first); */                    \
+        simd_output_pre_nodes(pre_index[i], pre_n[i], i, j, cur_op, abpt->verbose);                         \
     }                                                                                                       \
     if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1);                         \
     /* reverse cigar */                                                                                     \
@@ -290,7 +299,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     abpoa_cigar_t *cigar = 0;                                                                               \
     i = best_i, j = best_j, _start_i = best_i, _start_j = best_j;                                           \
     id = abpoa_graph_index_to_node_id(graph, i+beg_index);                                                  \
-    if (best_j < qlen) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, qlen-j, -1, qlen-1);         \
+    if (best_j < qlen) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, qlen-best_j, -1, qlen-1);    \
     SIMDi *dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                                         \
     int indel_first = 1; /* prefer to keep gaps at the end */                                               \
     while (i > 0 && j > 0) {                                                                                \
@@ -305,24 +314,25 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
                 if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue;                                   \
                 _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                                       \
                 if (_pre_dp_h[j-1] + s == _dp_h[j]) {                                                       \
-                    cur_op = ABPOA_ALL_OP; hit = 1;                                                         \
                     cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1);                  \
                     i = pre_i; --j; id = abpoa_graph_index_to_node_id(graph, i+beg_index); hit = 1;         \
                     dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                                \
+                    cur_op = ABPOA_ALL_OP;                                                                  \
                     ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;                           \
                     break;                                                                                  \
                 }                                                                                           \
             }                                                                                               \
         }                                                                                                   \
         if (hit == 0 && cur_op & ABPOA_E_OP) { /* deletion */                                               \
+            _dp_e1 = (score_t*)(dp_h + dp_sn); _dp_e2 = (score_t*)(dp_h + dp_sn * 2);                       \
             for (k = 0; k < pre_n[i]; ++k) {                                                                \
                 pre_i = pre_index_i[k];                                                                     \
                 if (j < dp_beg[pre_i] || j > dp_end[pre_i]) continue;                                       \
+                _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                                       \
                 if (cur_op & ABPOA_E1_OP) {                                                                 \
                     _pre_dp_e1 = (score_t*)(DP_H2E2F + dp_sn * (pre_i * 5 + 1));                            \
                     if (cur_op & ABPOA_M_OP) {                                                              \
                         if (_dp_h[j] == _pre_dp_e1[j]) {                                                    \
-                            _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                           \
                             if (_pre_dp_h[j] - gap_oe1 == _pre_dp_e1[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;  \
                             else cur_op = ABPOA_E1_OP;                                                      \
                             hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);   \
@@ -331,9 +341,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
                             break;                                                                          \
                         }                                                                                   \
                     } else {                                                                                \
-                        _dp_e1 = (score_t*)(dp_h + dp_sn);                                                  \
                         if (_dp_e1[j] == _pre_dp_e1[j] - gap_ext1) {                                        \
-                            _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                           \
                             if (_pre_dp_h[j] - gap_oe1 == _pre_dp_e1[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;  \
                             else cur_op = ABPOA_E1_OP;                                                      \
                             hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);   \
@@ -347,7 +355,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
                     _pre_dp_e2 = (score_t*)(DP_H2E2F + dp_sn * (pre_i * 5 + 2));                            \
                     if (cur_op & ABPOA_M_OP) {                                                              \
                         if (_dp_h[j] == _pre_dp_e2[j]) {                                                    \
-                            _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                           \
                             if (_pre_dp_h[j] - gap_oe2 == _pre_dp_e2[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;  \
                             else cur_op = ABPOA_E2_OP;                                                      \
                             hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);   \
@@ -356,9 +363,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
                             break;                                                                          \
                         }                                                                                   \
                     } else {                                                                                \
-                        _dp_e2 = (score_t*)(dp_h + dp_sn * 2);                                              \
                         if (_dp_e2[j] == _pre_dp_e2[j] - gap_ext2) {                                        \
-                            _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                           \
                             if (_pre_dp_h[j] - gap_oe2 == _pre_dp_e2[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;  \
                             else cur_op = ABPOA_E2_OP;                                                      \
                             hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);   \
@@ -400,16 +405,16 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
                 ++res->n_aln_bases;                                                                         \
             }                                                                                               \
         }                                                                                                   \
-        if (hit == 0 && cur_op & ABPOA_M_OP && indel_first == 1) { /* match/mismatch */                     \
+        if (hit == 0 && (cur_op & ABPOA_M_OP) && (indel_first == 1)) { /* match/mismatch */                 \
             for (k = 0; k < pre_n[i]; ++k) {                                                                \
                 pre_i = pre_index_i[k];                                                                     \
                 if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue;                                   \
                 _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                                       \
                 if (_pre_dp_h[j-1] + s == _dp_h[j]) {                                                       \
-                    cur_op = ABPOA_ALL_OP; hit = 1;                                                         \
                     cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1);                  \
                     i = pre_i; --j; id = abpoa_graph_index_to_node_id(graph, i+beg_index); hit = 1;         \
                     dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                                \
+                    cur_op = ABPOA_ALL_OP;                                                                  \
                     ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;                           \
                     indel_first = 0;                                                                        \
                     break;                                                                                  \
@@ -417,7 +422,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
             }                                                                                               \
         }                                                                                                   \
         if (hit == 0) err_fatal_simple("Error in cg_backtrack.");                                           \
-     /* fprintf(stderr, "%d, %d, %d\n", i, j, cur_op); */                                                   \
+        simd_output_pre_nodes(pre_index[i], pre_n[i], i, j, cur_op, abpt->verbose);                         \
     }                                                                                                       \
     if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1);                         \
     /* reverse cigar */                                                                                     \
@@ -434,16 +439,16 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
 #define simd_abpoa_var(score_t, sp, SIMDSetOne, SIMDShiftOneN)                                      \
     /* int tot_dp_sn = 0; */                                                                        \
     abpoa_graph_t *graph = ab->abg; abpoa_simd_matrix_t *abm = ab->abm;                             \
-    int matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1;                              \
+    int64_t matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1;                          \
     int **pre_index, *pre_n, _pre_index, _pre_n, pre_i;                                             \
     int i, j, k, *dp_beg, *dp_beg_sn, *dp_end, *dp_end_sn, node_id, index_i, dp_i;                  \
     int beg, end, beg_sn, end_sn, _beg_sn, _end_sn, pre_beg_sn, pre_end, sn_i;                      \
-    int pn, log_n, size, qp_sn, dp_sn; /* pn: value per SIMDi, qp_sn/dp_sn/d_sn: segmented length*/ \
+    int pn, log_n, size; int64_t qp_sn, dp_sn; /* pn: # value per SIMDi, qp_sn/dp_sn: segmented length*/ \
     SIMDi *dp_h, *pre_dp_h, *qp, *qi=NULL;                                                          \
     score_t *_dp_h=NULL, *_qi, best_score = sp.inf_min, inf_min = sp.inf_min;                       \
     int *mat = abpt->mat, m = abpt->m; score_t gap_ext1 = abpt->gap_ext1;                           \
     int w = abpt->wb < 0 ? qlen : abpt->wb+(int)(abpt->wf*qlen); /* when w < 0, do whole global */  \
-    int best_i = 0, best_j = 0, best_id = 0, max, max_i=-1;                                         \
+    int best_i = 0, best_j = 0, best_id = 0, max, left_max_i=-1, right_max_i=-1;                    \
     SIMDi zero = SIMDSetZeroi(), SIMD_INF_MIN = SIMDSetOne(inf_min);                                \
     pn = sp.num_of_value; qp_sn = dp_sn = (matrix_col_n + pn - 1) / pn;                             \
     log_n = sp.log_num, size = sp.size; qp = abm->s_mem;                                            \
@@ -564,7 +569,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
         dp_beg[0] = 0, dp_end[0] = qlen;                                                                \
     }                                                                                                   \
     dp_beg_sn[0] = (dp_beg[0])/pn; dp_end_sn[0] = (dp_end[0])/pn;                                       \
-    dp_beg[0] = dp_beg_sn[0] * pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1;                                   \
     dp_h = DP_H; _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1);                                         \
 }
 
@@ -582,7 +586,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
         dp_beg[0] = 0, dp_end[0] = qlen;                                                                \
     }                                                                                                   \
     dp_beg_sn[0] = (dp_beg[0])/pn; dp_end_sn[0] = (dp_end[0])/pn;                                       \
-    dp_beg[0] = dp_beg_sn[0] * pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1;                                   \
     dp_h = DP_HEF; dp_e1 = dp_h + dp_sn; dp_f1 = dp_e1 + dp_sn;                                         \
     _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1);                                                      \
 }
@@ -601,7 +604,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
         dp_beg[0] = 0, dp_end[0] = qlen;                                                                \
     }                                                                                                   \
     dp_beg_sn[0] = (dp_beg[0])/pn; dp_end_sn[0] = (dp_end[0])/pn;                                       \
-    dp_beg[0] = dp_beg_sn[0] * pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1;                                   \
     dp_h = DP_H2E2F; dp_e1 = dp_h+dp_sn; dp_e2 = dp_e1+dp_sn; dp_f1 = dp_e2+dp_sn; dp_f2 = dp_f1+dp_sn; \
     _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1);                                                      \
 }
@@ -609,7 +611,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
 #define simd_abpoa_lg_first_dp(score_t) {                                   \
     simd_abpoa_lg_first_row;                                                \
     if (abpt->align_mode == ABPOA_LOCAL_MODE) {                             \
-        for (i = 0; i <= _end_sn; ++i)                                       \
+        for (i = 0; i <= _end_sn; ++i)                                      \
             dp_h[i] = zero;                                                 \
     } else {                                                                \
         for (i = 0; i <= _end_sn; ++i) {                                    \
@@ -625,7 +627,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
 #define simd_abpoa_ag_first_dp(score_t) {                                   \
     simd_abpoa_ag_first_row;                                                \
     if (abpt->align_mode == ABPOA_LOCAL_MODE) {                             \
-        for (i = 0; i <= _end_sn; ++i)                                       \
+        for (i = 0; i <= _end_sn; ++i)                                      \
             dp_h[i] = dp_e1[i] = dp_f1[i] = zero;                           \
     } else {                                                                \
         for (i = 0; i <= _end_sn; ++i) {                                    \
@@ -643,7 +645,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
 #define simd_abpoa_cg_first_dp(score_t) {                                             \
     simd_abpoa_cg_first_row;                                                          \
     if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                       \
-        for (i = 0; i <= _end_sn; ++i)                                                 \
+        for (i = 0; i <= _end_sn; ++i)                                                \
             dp_h[i] = dp_e1[i] = dp_e2[i] = dp_f1[i] = dp_f2[i] = zero;               \
     } else {                                                                          \
         for (i = 0; i <= _end_sn; ++i) {                                              \
@@ -698,47 +700,48 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     }                                                                                                                                                              \
 }
 
-#define simd_abpoa_lg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub) {                       \
-    node_id = abpoa_graph_index_to_node_id(graph, index_i);                                         \
-    SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain;                               \
-    dp_h = &DP_H[dp_i * dp_sn]; _dp_h = (score_t*)dp_h;                                             \
-    int min_pre_beg_sn, max_pre_end_sn;                                                             \
-    if (abpt->wb < 0) {                                                                             \
-        beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;                                          \
-        beg_sn = dp_beg_sn[dp_i] = (dp_beg[dp_i])/pn; end_sn = dp_end_sn[dp_i] = (dp_end[dp_i])/pn; \
-        min_pre_beg_sn = 0, max_pre_end_sn = end_sn;                                                \
-    } else {                                                                                        \
-        beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen); \
-        beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;                         \
-        for (i = 0; i < pre_n[dp_i]; ++i) {                                                         \
-            pre_i = pre_index[dp_i][i];                                                             \
-            if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i];               \
-            if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];               \
-        } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn;                                     \
-        dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn;                        \
-        end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1;           \
-    }                                                                                               \
-    /* loop query */                                                                                                         \
-    /* first pre_node */                                                                                                     \
-    pre_i = pre_index[dp_i][0];                                                                                              \
-    pre_dp_h = DP_H + pre_i * dp_sn;                                                                                         \
-    pre_end = dp_end[pre_i];                                                                                                 \
-    pre_beg_sn = dp_beg_sn[pre_i];                                                                                           \
-    /* set M from (pre_i, q_i-1), E from (pre_i, q_i) */                                                                     \
-    if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                              \
-        _beg_sn = 0, _end_sn = end_sn; first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN);                           \
-    } else {                                                                                                                 \
-        if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN); \
-        else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN);                       \
-        _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);                                                             \
-        for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN;                                                           \
-        for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN;                                 \
-    }                                                                                                                        \
-    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */                                               \
-        remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN);                                                               \
-        dp_h[sn_i] = SIMDMax(SIMDAdd(SIMDOri(first, remain), q[sn_i]), SIMDSub(pre_dp_h[sn_i], GAP_E1));                     \
-        first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                \
-    }                                                                                                                        \
+#define simd_abpoa_lg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub) {                                                     \
+    node_id = abpoa_graph_index_to_node_id(graph, index_i);                                                                       \
+    SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain;                                                             \
+    dp_h = &DP_H[dp_i * dp_sn]; _dp_h = (score_t*)dp_h;                                                                           \
+    int min_pre_beg, min_pre_beg_sn, max_pre_end_sn;                                                                              \
+    if (abpt->wb < 0) {                                                                                                           \
+        beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;                                                                        \
+        beg_sn = dp_beg_sn[dp_i] = (dp_beg[dp_i])/pn; end_sn = dp_end_sn[dp_i] = (dp_end[dp_i])/pn;                               \
+        min_pre_beg_sn = 0, max_pre_end_sn = end_sn;                                                                              \
+    } else {                                                                                                                      \
+        beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen);   \
+        beg_sn = beg / pn; min_pre_beg = min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;                                         \
+        for (i = 0; i < pre_n[dp_i]; ++i) {                                                                                       \
+            pre_i = pre_index[dp_i][i];                                                                                           \
+            if (min_pre_beg > dp_beg[pre_i]) min_pre_beg = dp_beg[pre_i], min_pre_beg_sn = dp_beg_sn[pre_i];                      \
+            if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];                                             \
+        }                                                                                                                         \
+        if (beg_sn < min_pre_beg_sn) {                                                                                            \
+            beg = min_pre_beg; beg_sn = min_pre_beg_sn;                                                                           \
+        }                                                                                                                         \
+        dp_beg_sn[dp_i] = beg_sn; dp_beg[dp_i] = beg; end_sn = dp_end_sn[dp_i] = end/pn; dp_end[dp_i] = end;                      \
+    }                                                                                                                             \
+    /* loop query */                                                                                                              \
+    /* first pre_node */                                                                                                          \
+    pre_i = pre_index[dp_i][0];                                                                                                   \
+    pre_dp_h = DP_H + pre_i * dp_sn;                                                                                              \
+    pre_end = dp_end[pre_i]; pre_beg_sn = dp_beg_sn[pre_i];                                                                       \
+    /* set M from (pre_i, q_i-1), E from (pre_i, q_i) */                                                                          \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                                   \
+        _beg_sn = 0, _end_sn = end_sn; first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN);                                \
+    } else {                                                                                                                      \
+        if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN);      \
+        else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN);                            \
+        _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);                                                                  \
+        for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN;                                                                \
+        for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN;                                      \
+    }                                                                                                                             \
+    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */                                                    \
+        remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN);                                                                    \
+        dp_h[sn_i] = SIMDMax(SIMDAdd(SIMDOri(first, remain), q[sn_i]), SIMDSub(pre_dp_h[sn_i], GAP_E1));                          \
+        first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                     \
+    }                                                                                                                             \
     /* get max m and e */                                                                                                         \
     for (i = 1; i < pre_n[dp_i]; ++i) {                                                                                           \
         pre_i = pre_index[dp_i][i];                                                                                               \
@@ -759,23 +762,25 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
             first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                 \
         } /* now we have max(h,e) stored at dp_h */                                                                               \
     }                                                                                                                             \
-    /* new F start */                                                                                                                  \
-    first = SIMDOri(SIMDAndi(dp_h[beg_sn], PRE_MASK[0]), SUF_MIN[0]);                                                                  \
-    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) {                                                                                      \
-        if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                                    \
-            set_num = pn;                                                                                                              \
-        } else {                                                                                                                       \
-            if (sn_i < min_pre_beg_sn) {                                                                                               \
-                _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n");                                                                \
-            } else if (sn_i > max_pre_end_sn) {                                                                                        \
-                set_num = sn_i == max_pre_end_sn+1 ? 1 : 0;                                                                            \
-            } else set_num = pn;                                                                                                       \
-        }                                                                                                                              \
-        dp_h[sn_i] = SIMDMax(dp_h[sn_i], first);                                                                                       \
-        SIMD_SET_F(dp_h[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN);         \
+    /* new F start */                                                                                                             \
+    for (i = beg_sn * pn; i < beg; ++i) _dp_h[i] = inf_min;                                                                       \
+    for (i = end+1; i < (end_sn+1)*pn; ++i) _dp_h[i] = inf_min;                                                                   \
+    first = SIMDOri(SIMDAndi(dp_h[beg_sn], PRE_MASK[0]), SUF_MIN[0]);                                                             \
+    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) {                                                                                 \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                               \
+            set_num = pn;                                                                                                         \
+        } else {                                                                                                                  \
+            if (sn_i < min_pre_beg_sn) {                                                                                          \
+                _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n");                                                           \
+            } else if (sn_i > max_pre_end_sn) {                                                                                   \
+                set_num = sn_i == max_pre_end_sn+1 ? 1 : 0;                                                                       \
+            } else set_num = pn;                                                                                                  \
+        }                                                                                                                         \
+        dp_h[sn_i] = SIMDMax(dp_h[sn_i], first);                                                                                  \
+        SIMD_SET_F(dp_h[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN);    \
         first = SIMDOri(SIMDAndi(SIMDShiftRight(SIMDSub(dp_h[sn_i], GAP_E1), SIMDTotalBytes-SIMDShiftOneN), PRE_MASK[0]), SUF_MIN[0]); \
-    }                                                                                                                                  \
-    if (abpt->align_mode == ABPOA_LOCAL_MODE) for (sn_i = 0; sn_i <= end_sn; ++sn_i) dp_h[sn_i] = SIMDMax(zero, dp_h[sn_i]);           \
+    }                                                                                                                             \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) for (sn_i = 0; sn_i <= end_sn; ++sn_i) dp_h[sn_i] = SIMDMax(zero, dp_h[sn_i]);      \
 }
 
 #define simd_abpoa_ag_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual) { \
@@ -783,21 +788,23 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain;                                                             \
     dp_h = DP_HEF + dp_i * 3 * dp_sn; dp_e1 = dp_h + dp_sn; dp_f1 = dp_e1 + dp_sn;                                                \
     _dp_h = (score_t*)dp_h, _dp_e1 = (score_t*)dp_e1, _dp_f1 = (score_t*)dp_f1;                                                   \
-    int min_pre_beg_sn, max_pre_end_sn;                                                                                           \
+    int min_pre_beg, min_pre_beg_sn, max_pre_end_sn;                                                                              \
     if (abpt->wb < 0) {                                                                                                           \
         beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;                                                                        \
         beg_sn = dp_beg_sn[dp_i] = (dp_beg[dp_i])/pn; end_sn = dp_end_sn[dp_i] = (dp_end[dp_i])/pn;                               \
         min_pre_beg_sn = 0, max_pre_end_sn = end_sn;                                                                              \
     } else {                                                                                                                      \
         beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen);   \
-        beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;                                                       \
+        beg_sn = beg / pn; min_pre_beg = min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;                                         \
         for (i = 0; i < pre_n[dp_i]; ++i) {                                                                                       \
             pre_i = pre_index[dp_i][i];                                                                                           \
-            if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i];                                             \
+            if (min_pre_beg > dp_beg[pre_i]) min_pre_beg = dp_beg[pre_i], min_pre_beg_sn = dp_beg_sn[pre_i];                      \
             if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];                                             \
-        } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn;                                                                   \
-        dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn;                                                      \
-        end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1;                                         \
+        }                                                                                                                         \
+        if (beg_sn < min_pre_beg_sn) {                                                                                            \
+            beg_sn = min_pre_beg_sn; beg = min_pre_beg;                                                                           \
+        }                                                                                                                         \
+        dp_beg_sn[dp_i] = beg_sn; dp_beg[dp_i] = beg; end_sn = dp_end_sn[dp_i] = end/pn; dp_end[dp_i] = end;                      \
     }                                                                                                                             \
     /* loop query */                                                                                                              \
     /* first pre_node */                                                                                                          \
@@ -854,6 +861,8 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { /* SIMD parallelization */                                                      \
         dp_h[sn_i] = SIMDAdd(dp_h[sn_i], q[sn_i]);                                                                                \
     }                                                                                                                             \
+    for (i = beg_sn * pn; i < beg; ++i) _dp_h[i] = _dp_e1[i] = inf_min;                                                           \
+    for (i = end+1; i < (end_sn+1)*pn; ++i) _dp_h[i] = _dp_e1[i] = inf_min;                                                       \
     /* new F start */                                                                                                             \
     first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneN), SIMDTotalBytes-SIMDShiftOneN);              \
     for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) {                                                                                 \
@@ -889,25 +898,31 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain;                                                             \
     dp_h = DP_H2E2F+dp_i*5*dp_sn; dp_e1 = dp_h+dp_sn; dp_e2 = dp_e1+dp_sn; dp_f1 = dp_e2+dp_sn; dp_f2 = dp_f1+dp_sn;              \
     _dp_h=(score_t*)dp_h, _dp_e1=(score_t*)dp_e1, _dp_e2=(score_t*)dp_e2, _dp_f1=(score_t*)dp_f1, _dp_f2=(score_t*)dp_f2;         \
-    int min_pre_beg_sn, max_pre_end_sn;                                                                                           \
+    int min_pre_beg, min_pre_beg_sn, max_pre_end_sn;                                                                              \
     if (abpt->wb < 0) {                                                                                                           \
         beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;                                                                        \
         beg_sn = dp_beg_sn[dp_i] = beg/pn; end_sn = dp_end_sn[dp_i] = end/pn;                                                     \
         min_pre_beg_sn = 0, max_pre_end_sn = end_sn;                                                                              \
     } else {                                                                                                                      \
         beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen);   \
-        beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;                                                       \
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE)                                                                            \
+            fprintf(stderr, "index: %d (node: %d): raw beg: %d, end: %d\n", index_i-beg_index, node_id, beg, end);                \
+        beg_sn = beg / pn; min_pre_beg = min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;                                         \
         for (i = 0; i < pre_n[dp_i]; ++i) {                                                                                       \
             pre_i = pre_index[dp_i][i];                                                                                           \
-            if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i];                                             \
+            if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE)                                                                        \
+                fprintf(stderr, "\tpre_i: %d, pre_dp_beg: %d, pre_dp_beg_sn: %d\n", pre_i, dp_beg[pre_i], dp_beg_sn[pre_i]);      \
+            if (min_pre_beg > dp_beg[pre_i]) min_pre_beg = dp_beg[pre_i], min_pre_beg_sn = dp_beg_sn[pre_i];                      \
             if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];                                             \
-        } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn;                                                                   \
-        dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn;                                                      \
-        end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1;                                         \
-     /* fprintf(stderr, "index: %d, beg: %d, end: %d, beg_sn: %d, end_sn: %d\n", index_i, beg, end, beg_sn, end_sn); */           \
+        }                                                                                                                         \
+        if (beg_sn < min_pre_beg_sn) {                                                                                            \
+            assert(beg < min_pre_beg); beg = min_pre_beg; beg_sn = min_pre_beg_sn;                                                \
+        }                                                                                                                         \
+        dp_beg_sn[dp_i] = beg_sn; dp_beg[dp_i] = beg; end_sn = dp_end_sn[dp_i] = end/pn; dp_end[dp_i] = end;                      \
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) {                                                                          \
+            fprintf(stderr, "index: %d (node: %d): beg: %d, end: %d, beg_sn: %d, end_sn: %d, max_left: %d, max_right: %d\n", index_i-beg_index, node_id, beg, end, beg_sn, end_sn, abpoa_graph_node_id_to_max_pos_left(graph, node_id), abpoa_graph_node_id_to_max_pos_right(graph, node_id)); \
+        }                                                                                                                         \
     }                                                                                                                             \
- /* fprintf(stderr, "%d: beg, end: %d, %d\n", index_i, beg, end); */                                                              \
-    /* tot_dp_sn += (end_sn - beg_sn + 1); */                                                                                     \
     /* loop query */                                                                                                              \
     /* first pre_node */                                                                                                          \
     pre_i = pre_index[dp_i][0];                                                                                                   \
@@ -968,10 +983,12 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
         }                                                                                                                         \
     }                                                                                                                             \
     /* compare M, E, and F */                                                                                                     \
- /* fprintf(stderr, "5 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, _beg_sn, _end_sn); */                                     \
+ /* fprintf(stderr, "5 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, beg_sn, end_sn); */                                       \
     for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { /* SIMD parallelization */                                                      \
         dp_h[sn_i] = SIMDAdd(dp_h[sn_i], q[sn_i]);                                                                                \
     }                                                                                                                             \
+    for (i = beg_sn * pn; i < beg; ++i) _dp_h[i] = _dp_e1[i] = _dp_e2[i] = inf_min;                                               \
+    for (i = end+1; i < (end_sn+1)*pn; ++i) _dp_h[i] = _dp_e1[i] = _dp_e2[i] = inf_min;                                           \
     /* new F start */                                                                                                             \
     first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneN), SIMDTotalBytes-SIMDShiftOneN);              \
     SIMDi first2 = first;                                                                                                         \
@@ -1042,28 +1059,27 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
 
 #define simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater) { \
     /* select max dp_h */                                                    \
-    max = inf_min, max_i = -1;                                               \
-    SIMDi a = dp_h[end_sn], b = qi[end_sn];                                  \
-    if (end_sn == qlen / pn) SIMDSetIfGreater(a, zero, b, SIMD_INF_MIN, a);  \
-    for (i = beg_sn; i < end_sn; ++i) {                                      \
-        SIMDGetIfGreater(b, a, dp_h[i], a, qi[i], b);                        \
-    }                                                                        \
-    _dp_h = (score_t*)&a, _qi = (score_t*)&b;                                \
-    for (i = 0; i < pn; ++i) {                                               \
+    max = inf_min, left_max_i = right_max_i = -1;                            \
+    _dp_h = (score_t*)dp_h, _qi = (score_t*)qi;                              \
+    for (i = beg; i <= end; ++i) {                                           \
         if (_dp_h[i] > max) {                                                \
-            max = _dp_h[i]; max_i = _qi[i];                                  \
+            max = _dp_h[i];                                                  \
+            left_max_i = right_max_i = _qi[i];                               \
+        } else if (_dp_h[i] == max) {                                        \
+            right_max_i = _qi[i];                                            \
         }                                                                    \
     }                                                                        \
 }
 
-#define simd_abpoa_ada_max_i   {                                                                                        \
-    /* set max_pos_left/right for next nodes */                                                                         \
-    int out_i = max_i + 1;                                                                                              \
-    for (i = 0; i < graph->node[node_id].out_edge_n; ++i) {                                                             \
-        int out_node_id = graph->node[node_id].out_id[i];                                                               \
-        if (out_i > graph->node_id_to_max_pos_right[out_node_id]) graph->node_id_to_max_pos_right[out_node_id] = out_i; \
-        if (out_i < graph->node_id_to_max_pos_left[out_node_id]) graph->node_id_to_max_pos_left[out_node_id] = out_i;   \
-    }                                                                                                                   \
+#define simd_abpoa_ada_max_i   {                                             \
+    /* set max_pos_left/right for next nodes */                              \
+    for (i = 0; i < graph->node[node_id].out_edge_n; ++i) {                  \
+        int out_node_id = graph->node[node_id].out_id[i];                    \
+        if (right_max_i+1 > graph->node_id_to_max_pos_right[out_node_id])    \
+            graph->node_id_to_max_pos_right[out_node_id] = right_max_i+1;    \
+        if (left_max_i+1 < graph->node_id_to_max_pos_left[out_node_id])      \
+            graph->node_id_to_max_pos_left[out_node_id] = left_max_i+1;      \
+    }                                                                        \
 }
 
 // TODO end_bonus for extension
@@ -1077,11 +1093,11 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
         simd_abpoa_lg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub);                    \
         if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                             \
             simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                 \
-            set_global_max_score(max, dp_i, max_i);                                             \
+            set_global_max_score(max, dp_i, left_max_i);                                        \
         }                                                                                       \
         if (abpt->align_mode == ABPOA_EXTEND_MODE) {                                            \
             simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                 \
-            set_extend_max_score(max, dp_i, max_i);                                             \
+            set_extend_max_score(max, dp_i, right_max_i);                                       \
         }                                                                                       \
         if (abpt->wb >= 0) {                                                                    \
             if (abpt->align_mode == ABPOA_GLOBAL_MODE) {                                        \
@@ -1092,7 +1108,11 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     }                                                                                           \
     if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_H, dp_sn); \
     res->best_score = best_score;                                                               \
- /* simd_abpoa_print_lg_matrix(score_t, beg_index, end_index); printf("best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); */                          \
+    if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) {                                                 \
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE)                                          \
+            simd_abpoa_print_lg_matrix(score_t, beg_index, end_index);                          \
+        fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score);            \
+    }                                                                                           \
     if (abpt->ret_cigar) simd_abpoa_lg_backtrack(score_t);                                      \
     simd_abpoa_free_var; SIMDFree(GAP_E1S);                                                     \
 }
@@ -1107,10 +1127,10 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
         simd_abpoa_ag_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual); \
         if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                 \
             simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                     \
-            set_global_max_score(max, dp_i, max_i);                                                 \
+            set_global_max_score(max, dp_i, left_max_i);                                            \
         } else if (abpt->align_mode == ABPOA_EXTEND_MODE) {                                         \
             simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                     \
-            set_extend_max_score(max, dp_i, max_i);                                                 \
+            set_extend_max_score(max, dp_i, right_max_i);                                           \
         }                                                                                           \
         if (abpt->wb >= 0) {                                                                        \
             if (abpt->align_mode == ABPOA_GLOBAL_MODE) {                                            \
@@ -1121,7 +1141,11 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
     }                                                                                               \
     if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_HEF, 3*dp_sn); \
     res->best_score = best_score;                                                                   \
- /* simd_abpoa_print_ag_matrix(score_t, beg_index, end_index); fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); */ \
+    if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) {                                                     \
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE)                                              \
+            simd_abpoa_print_ag_matrix(score_t, beg_index, end_index);                              \
+        fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score);                \
+    }                                                                                               \
     if (abpt->ret_cigar) simd_abpoa_ag_backtrack(score_t);                                          \
     simd_abpoa_free_var; SIMDFree(GAP_E1S);                                                         \
 }
@@ -1136,10 +1160,10 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
         simd_abpoa_cg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual); \
         if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                   \
             simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                       \
-            set_global_max_score(max, dp_i, max_i);                                                   \
+            set_global_max_score(max, dp_i, left_max_i);                                              \
         } else if (abpt->align_mode == ABPOA_EXTEND_MODE) {                                           \
             simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                       \
-            set_extend_max_score(max, dp_i, max_i);                                                   \
+            set_extend_max_score(max, dp_i, right_max_i);                                             \
         }                                                                                             \
         if (abpt->wb >= 0) {                                                                          \
             if (abpt->align_mode == ABPOA_GLOBAL_MODE) {                                              \
@@ -1148,11 +1172,13 @@ SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
             simd_abpoa_ada_max_i;                                                                     \
         }                                                                                             \
     }                                                                                                 \
- /* printf("dp_sn: %d\n", tot_dp_sn); */                                                              \
     if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_H2E2F, 5*dp_sn); \
     res->best_score = best_score;                                                                     \
- /* simd_abpoa_print_cg_matrix(score_t, beg_index, end_index); */                                     \
- /* fprintf(stderr,"best_score: (%d, %d) -> %d\n",best_i,best_j,best_score); */                       \
+    if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) {                                                       \
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE)                                                \
+            simd_abpoa_print_cg_matrix(score_t, beg_index, end_index);                                \
+        fprintf(stderr,"best_score: (%d, %d) -> %d\n",best_i, best_j, best_score);                    \
+    }                                                                                                 \
     if (abpt->ret_cigar) simd_abpoa_cg_backtrack(score_t);                                            \
     simd_abpoa_free_var; SIMDFree(GAP_E1S); SIMDFree(GAP_E2S);                                        \
 }
@@ -1191,7 +1217,8 @@ int simd_abpoa_realloc(abpoa_t *ab, int gn, int qlen, abpoa_para_t *abpt, SIMD_p
         // err_func_format_printf(__func__, "Warning: Graph is too large or query is too long.\n");
         // return 1;
     // }
-    // fprintf(stderr, "%lld, %lld, %lld\n", (long long)gn, (long long)ab->abm->s_msize, (long long)s_msize);
+    if (abpt->verbose >= ABPOA_DEBUG_VERBOSE)
+        fprintf(stderr, "realloc: graph_node %lld, qlen: %d, (%lld, %lld)\n", (long long)gn, qlen, (long long)ab->abm->s_msize, (long long)s_msize);
     if (s_msize > ab->abm->s_msize) {
         if (ab->abm->s_mem) SIMDFree(ab->abm->s_mem);
         kroundup64(s_msize); ab->abm->s_msize = s_msize;
@@ -1208,7 +1235,7 @@ int simd_abpoa_realloc(abpoa_t *ab, int gn, int qlen, abpoa_para_t *abpt, SIMD_p
     return 0;
 }
 
-void abpoa_init_var(abpoa_para_t *abpt, uint8_t *query, int qlen, SIMDi *qp, SIMDi *qi, int *mat, int qp_sn, int pn, SIMDi SIMD_INF_MIN) {
+void abpoa_init_var(abpoa_para_t *abpt, uint8_t *query, int qlen, SIMDi *qp, SIMDi *qi, int *mat, int64_t qp_sn, int pn, SIMDi SIMD_INF_MIN) {
     int i, j, k; int32_t *_qi;
     /* generate the query profile */
     for (i = 0; i < qp_sn * abpt->m; ++i) qp[i] = SIMD_INF_MIN;
@@ -1225,7 +1252,10 @@ void abpoa_init_var(abpoa_para_t *abpt, uint8_t *query, int qlen, SIMDi *qp, SIM
     }
 }
 
-void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_map, int beg_node_id, int end_node_id, int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int pn, int qlen, int w, int dp_sn, SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, int32_t inf_min, int gap_open1, int gap_ext1, int gap_open2, int gap_ext2, int gap_oe1, int gap_oe2) {
+void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_map, int beg_node_id, int end_node_id, 
+                       int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int pn, int qlen, int w, int64_t dp_sn, 
+                       SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, int32_t inf_min, 
+                       int gap_open1, int gap_ext1, int gap_open2, int gap_ext2, int gap_oe1, int gap_oe2) {
     int i, _end_sn;
     if (abpt->wb >= 0) {
         graph->node_id_to_max_pos_left[beg_node_id] = graph->node_id_to_max_pos_right[beg_node_id] = 0;
@@ -1234,12 +1264,12 @@ void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_
             if (index_map[abpoa_graph_node_id_to_index(graph, out_id)])
                 graph->node_id_to_max_pos_left[out_id] = graph->node_id_to_max_pos_right[out_id] = 1;
         }
-        dp_beg[0] = GET_AD_DP_BEGIN(graph, w, beg_node_id, end_node_id, qlen), dp_end[0] = GET_AD_DP_END(graph, w, beg_node_id, end_node_id, qlen);
+        dp_beg[0] = 0; // GET_AD_DP_BEGIN(graph, w, beg_node_id, end_node_id, qlen)
+        dp_end[0] = GET_AD_DP_END(graph, w, beg_node_id, end_node_id, qlen);
     } else {
         dp_beg[0] = 0, dp_end[0] = qlen;
     }
     dp_beg_sn[0] = (dp_beg[0])/pn; dp_end_sn[0] = (dp_end[0])/pn;
-    dp_beg[0] = dp_beg_sn[0] * pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1;
     SIMDi *dp_h = DP_H2E2F; SIMDi *dp_e1 = dp_h + dp_sn; SIMDi *dp_e2 = dp_e1 + dp_sn, *dp_f1 = dp_e2 + dp_sn, *dp_f2 = dp_f1 + dp_sn;
     _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1);
 
@@ -1249,40 +1279,37 @@ void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_
     int32_t *_dp_h = (int32_t*)dp_h, *_dp_e1 = (int32_t*)dp_e1, *_dp_e2 = (int32_t*)dp_e2, *_dp_f1 = (int32_t*)dp_f1, *_dp_f2 = (int32_t*)dp_f2;
     _dp_h[0] = 0; _dp_e1[0] = -(gap_oe1); _dp_e2[0] = -(gap_oe2); _dp_f1[0] = _dp_f2[0] = inf_min;
     for (i = 1; i <= dp_end[0]; ++i) { /* no SIMD parallelization */
-        _dp_f1[i] = -(gap_open1 + gap_ext1 * i);
-        _dp_f2[i] = -(gap_open2 + gap_ext2 * i);
-        _dp_h[i] = MAX_OF_TWO(_dp_f1[i], _dp_f2[i]); // -MIN_OF_TWO(gap_open1+gap_ext1*i, gap_open2+gap_ext2*i);
+        _dp_f1[i] = -(gap_open1 + gap_ext1 * i); _dp_f2[i] = -(gap_open2 + gap_ext2 * i);
+        _dp_h[i] = MAX_OF_TWO(_dp_f1[i], _dp_f2[i]);
     }
 }
 
-int abpoa_max(SIMDi SIMD_INF_MIN, SIMDi zero, int inf_min, SIMDi *dp_h, SIMDi *qi, int qlen, int pn, int beg_sn, int end_sn) {
+void abpoa_max(SIMDi SIMD_INF_MIN, SIMDi zero, int inf_min, SIMDi *dp_h, SIMDi *qi, int qlen, int pn, int beg, int end, int *left_max_i, int *right_max_i) {
     /* select max dp_h */
-    int max = inf_min, max_i = -1, i;
-    SIMDi a = dp_h[end_sn], b = qi[end_sn];
-    if (end_sn == qlen / pn) SIMDSetIfGreateri32(a, zero, b, SIMD_INF_MIN, a);
-    for (i = beg_sn; i < end_sn; ++i) {
-        SIMDGetIfGreateri32(b, a, dp_h[i], a, qi[i], b);
-    }
-    int32_t *_dp_h = (int32_t*)&a, *_qi = (int32_t*)&b;
-    for (i = 0; i < pn; ++i) {
+    int max = inf_min, i; *left_max_i = *right_max_i = -1;
+    int32_t *_dp_h = (int32_t*)dp_h, *_qi = (int32_t*)qi;
+    for (i = beg; i <= end; ++i) {
         if (_dp_h[i] > max) {
-            max = _dp_h[i]; max_i = _qi[i];
+            max = _dp_h[i]; *left_max_i = *right_max_i = _qi[i];
+        } else if (_dp_h[i] == max) {
+            *right_max_i = _qi[i];
         }
     }
-    return max_i;
 }
 
-void abpoa_ada_max_i(int max_i, abpoa_graph_t *graph, int node_id) {
+void abpoa_ada_max_i(int left_max_i, int right_max_i, abpoa_graph_t *graph, int node_id) {
     /* set max_pos_left/right for next nodes */
-    int out_i = max_i + 1; int i;
+    int i;
     for (i = 0; i < graph->node[node_id].out_edge_n; ++i) {
         int out_node_id = graph->node[node_id].out_id[i];
-        if (out_i > graph->node_id_to_max_pos_right[out_node_id]) graph->node_id_to_max_pos_right[out_node_id] = out_i;
-        if (out_i < graph->node_id_to_max_pos_left[out_node_id]) graph->node_id_to_max_pos_left[out_node_id] = out_i;
+        if (right_max_i+1 > graph->node_id_to_max_pos_right[out_node_id])
+            graph->node_id_to_max_pos_right[out_node_id] = right_max_i+1;
+        if (left_max_i+1 < graph->node_id_to_max_pos_left[out_node_id])
+            graph->node_id_to_max_pos_left[out_node_id] = left_max_i+1;
     }
 }
 
-void abpoa_global_get_max(abpoa_graph_t *graph, int beg_index, int end_node_id, uint8_t *index_map, SIMDi *DP_H_HE, int dp_sn, int qlen, int *dp_end, int32_t *best_score, int *best_i, int *best_j) {
+void abpoa_global_get_max(abpoa_graph_t *graph, int beg_index, int end_node_id, uint8_t *index_map, SIMDi *DP_H_HE, int64_t dp_sn, int qlen, int *dp_end, int32_t *best_score, int *best_i, int *best_j) {
     int in_id, in_index, dp_i, i;
     for (i = 0; i < graph->node[end_node_id].in_edge_n; ++i) {
         in_id = graph->node[end_node_id].in_id[i];
@@ -1300,26 +1327,37 @@ void abpoa_global_get_max(abpoa_graph_t *graph, int beg_index, int end_node_id,
     }
 }
 
-int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, SIMDi *dp_f2, int **pre_index, int *pre_n, int index_i, int dp_i, abpoa_graph_t *graph, abpoa_para_t *abpt, int dp_sn, int pn, int qlen, int w, SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, SIMDi GAP_O1, SIMDi GAP_O2, SIMDi GAP_E1, SIMDi GAP_E2, SIMDi GAP_OE1, SIMDi GAP_OE2, SIMDi* GAP_E1S, SIMDi* GAP_E2S, SIMDi *PRE_MIN, SIMDi *PRE_MASK, SIMDi *SUF_MIN, int log_n, int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int end_node_id) {
+int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, SIMDi *dp_f2, int inf_min,
+                int **pre_index, int *pre_n, int beg_index, int index_i, int dp_i, abpoa_graph_t *graph, abpoa_para_t *abpt, 
+                int64_t dp_sn, int pn, int qlen, int w, SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, 
+                SIMDi GAP_O1, SIMDi GAP_O2, SIMDi GAP_E1, SIMDi GAP_E2, SIMDi GAP_OE1, SIMDi GAP_OE2, SIMDi* GAP_E1S, SIMDi* GAP_E2S, 
+                SIMDi *PRE_MIN, SIMDi *PRE_MASK, SIMDi *SUF_MIN, int log_n, 
+                int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int end_node_id) {
     int tot_dp_sn = 0, i, pre_i, node_id = abpoa_graph_index_to_node_id(graph, index_i);
-    int min_pre_beg_sn, max_pre_end_sn, beg, end, beg_sn, end_sn, pre_end, pre_end_sn, pre_beg_sn, sn_i;
+    int min_pre_beg, min_pre_beg_sn, max_pre_end_sn, beg, end, beg_sn, end_sn, pre_end, pre_end_sn, pre_beg_sn, sn_i;
     if (abpt->wb < 0) {
         beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;
         beg_sn = dp_beg_sn[dp_i] = beg/pn; end_sn = dp_end_sn[dp_i] = end/pn;
         min_pre_beg_sn = 0, max_pre_end_sn = end_sn;
     } else {
         beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen);
-        beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE)
+            fprintf(stderr, "index: %d (node: %d): raw beg: %d, end: %d\n", index_i-beg_index, node_id, beg, end);
+        beg_sn = beg / pn; min_pre_beg = min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;
         for (i = 0; i < pre_n[dp_i]; ++i) {
             pre_i = pre_index[dp_i][i];
-            if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i];
+            if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE)
+                fprintf(stderr, "\tpre_i: %d, pre_dp_beg: %d, pre_dp_beg_sn: %d\n", pre_i, dp_beg[pre_i], dp_beg_sn[pre_i]);
+            if (min_pre_beg > dp_beg[pre_i]) min_pre_beg = dp_beg[pre_i], min_pre_beg_sn = dp_beg_sn[pre_i];
             if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];
-        } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn;
-        dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn;
-        end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1;
-#ifdef __DEBUG__
-    fprintf(stderr, "index: %d (node: %d): beg: %d, end: %d, beg_sn: %d, end_sn: %d\n", index_i, node_id, beg, end, beg_sn, end_sn);
-#endif
+        } 
+        if (beg_sn < min_pre_beg_sn) {
+            assert(beg < min_pre_beg); beg_sn = min_pre_beg_sn; beg = min_pre_beg;
+        }
+        // NOT extend dp_beg/end based on dp_beg/end_sn
+        dp_beg_sn[dp_i] = beg_sn; dp_beg[dp_i] = beg; end_sn = dp_end_sn[dp_i] = end/pn; dp_end[dp_i] = end;
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE)
+            fprintf(stderr, "index: %d (node: %d): beg: %d, end: %d, beg_sn: %d, end_sn: %d, max_left: %d, max_right: %d\n", index_i-beg_index, node_id, beg, end, beg_sn, end_sn, graph->node_id_to_max_pos_left[node_id], graph->node_id_to_max_pos_right[node_id]);
     }
     tot_dp_sn += (end_sn - beg_sn + 1);
     /* loop query */
@@ -1333,7 +1371,7 @@ int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1,
     /* set M from (pre_i, q_i-1) */
     if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneNi32);
     else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneNi32);
-    _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);
+    _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1); // pre_end+1: match/mismatch operations
     for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN;
     for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN;
     for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */
@@ -1349,7 +1387,7 @@ int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1,
         dp_e1[sn_i] = pre_dp_e1[sn_i];
         dp_e2[sn_i] = pre_dp_e2[sn_i];
     }
-    // if (index_i == 13095) debug_simd_abpoa_print_cg_matrix_row("1", int32_t, index_i);
+    // debug_simd_abpoa_print_cg_matrix_row("1", int32_t, index_i);
     // new init end
     /* get max m and e */
     for (i = 1; i < pre_n[dp_i]; ++i) {
@@ -1381,6 +1419,10 @@ int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1,
     /* new F start */
     first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneNi32), SIMDTotalBytes-SIMDShiftOneNi32);
     int set_num; SIMDi first2 = first;//, tmp;
+    int32_t *_dp_h = (int32_t*)dp_h, *_dp_e1 = (int32_t*)dp_e1, *_dp_e2 = (int32_t*)dp_e2;
+    // set h/e as inf_min for cells out of range: < dp_beg or > dp_end
+    for (i = beg_sn * pn; i < beg; ++i) _dp_h[i] = _dp_e1[i] = _dp_e2[i] = inf_min;
+    for (i = end+1; i < (end_sn+1)*pn; ++i) _dp_h[i] = _dp_e1[i] = _dp_e2[i] = inf_min;
     for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) {
         if (sn_i < min_pre_beg_sn) {
             _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n");
@@ -1410,10 +1452,14 @@ int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1,
         // SIMDSetIfEquali32(dp_e2[sn_i], dp_h[sn_i], tmp, SIMDMaxi32(SIMDSubi32(dp_e2[sn_i], GAP_E2), SIMDSubi32(dp_h[sn_i], GAP_OE2)), SIMD_INF_MIN);
         dp_e2[sn_i] = SIMDMaxi32(SIMDSubi32(dp_e2[sn_i], GAP_E2), SIMDSubi32(dp_h[sn_i], GAP_OE2));
     }
+    // debug_simd_abpoa_print_cg_matrix_row("0", int32_t, index_i);
     return tot_dp_sn;
 }
 
-void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_beg, int *dp_end, int dp_sn, int m, int *mat, int gap_ext1, int gap_ext2, int gap_oe1, int gap_oe2, int beg_index, int best_dp_i, int best_dp_j, int qlen, abpoa_graph_t *graph, abpoa_para_t *abpt, uint8_t *query, abpoa_res_t *res) {
+void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_beg, int *dp_end, int64_t dp_sn, 
+                        int m, int *mat, int gap_ext1, int gap_ext2, int gap_oe1, int gap_oe2, 
+                        int beg_index, int best_dp_i, int best_dp_j, int qlen, 
+                        abpoa_graph_t *graph, abpoa_para_t *abpt, uint8_t *query, abpoa_res_t *res) {
     int dp_i, dp_j, k, pre_i, n_c = 0, m_c = 0, id, hit, cur_op = ABPOA_ALL_OP, _start_i, _start_j;
     SIMDi *dp_h;
     int32_t s, is_match, *_dp_h=NULL, *_dp_e1, *_dp_e2, *_pre_dp_h, *_pre_dp_e1, *_pre_dp_e2, *_dp_f1, *_dp_f2; abpoa_cigar_t *cigar = 0;
@@ -1540,10 +1586,8 @@ void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_be
                 }
             }
         }
-        if (hit == 0) exit(1);
-#ifdef __DEBUG__
-        fprintf(stderr, "%d, %d, %d\n", dp_i, dp_j, cur_op);
-#endif
+        if (hit == 0) err_fatal_simple("Error in cg_backtrack.");
+        simd_output_pre_nodes(pre_index[dp_i], pre_n[dp_i], dp_i, dp_j, cur_op, abpt->verbose);
     }
     if (dp_j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, dp_j, -1, dp_j-1);
     /* reverse cigar */
@@ -1554,15 +1598,23 @@ void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_be
     /*abpoa_print_cigar(n_c, *graph_cigar, graph);*/
 }
 
-int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, int beg_index, int end_node_id, int end_index, uint8_t *index_map, int qlen, uint8_t *query, abpoa_para_t *abpt, SIMD_para_t sp, abpoa_res_t *res) {
-    int tot_dp_sn = 0;
+int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, 
+                                                 int beg_node_id, int beg_index, int end_node_id, int end_index, 
+                                                 uint8_t *index_map, int qlen, uint8_t *query, 
+                                                 abpoa_para_t *abpt, SIMD_para_t sp, abpoa_res_t *res) {
+    int i, j;
     abpoa_graph_t *graph = ab->abg; abpoa_simd_matrix_t *abm = ab->abm;
-    int matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1;
+    int64_t matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1;
     int **pre_index, *pre_n, _pre_index, _pre_n;
-    int i, j, *dp_beg, *dp_beg_sn, *dp_end, *dp_end_sn, node_id, index_i, dp_i;
-    int beg_sn, end_sn;
-    int pn, log_n, size, qp_sn, dp_sn; /* pn: value per SIMDi, qp_sn/dp_sn/d_sn: segmented length*/
+    int node_id, index_i, dp_i;
+    int pn, log_n, size; int64_t qp_sn, dp_sn; /* pn: value per SIMDi, qp_sn/dp_sn/d_sn: segmented length*/
     SIMDi *dp_h, *qp, *qi;
+    // dp_beg/end: beg/end position of dp band in query (column) for each node (row)
+    // dp_beg/end_sn: beg/end VECTOR position of dp band in query (segmented column) for each node (row)
+    // XXX to keep consistency between different VECTOR size (SSE41/AVX2/AVX/512):
+    // .   dp cells within dp_beg/end_sn VECTORs but not within dp_beg/end need to be set to -inf
+    int *dp_beg, *dp_beg_sn, *dp_end, *dp_end_sn; 
+
     int32_t best_score = sp.inf_min, inf_min = sp.inf_min;
     int *mat = abpt->mat, best_i = 0, best_j = 0; int32_t gap_ext1 = abpt->gap_ext1;
     int w = abpt->wb < 0 ? qlen : abpt->wb + (int)(abpt->wf * qlen); /* when w < 0, do whole global */
@@ -1573,12 +1625,15 @@ int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, i
     int32_t gap_open1 = abpt->gap_open1, gap_oe1 = gap_open1 + gap_ext1;
     int32_t gap_open2 = abpt->gap_open2, gap_ext2 = abpt->gap_ext2, gap_oe2 = gap_open2 + gap_ext2;
     SIMDi *DP_H2E2F, *dp_e1, *dp_e2, *dp_f2, *dp_f1;
-    SIMDi GAP_O1 = SIMDSetOnei32(gap_open1), GAP_O2 = SIMDSetOnei32(gap_open2), GAP_E1 = SIMDSetOnei32(gap_ext1), GAP_E2 = SIMDSetOnei32(gap_ext2), GAP_OE1 = SIMDSetOnei32(gap_oe1), GAP_OE2 = SIMDSetOnei32(gap_oe2);
+    SIMDi GAP_O1 = SIMDSetOnei32(gap_open1), GAP_O2 = SIMDSetOnei32(gap_open2);
+    SIMDi GAP_E1 = SIMDSetOnei32(gap_ext1), GAP_E2 = SIMDSetOnei32(gap_ext2);
+    SIMDi GAP_OE1 = SIMDSetOnei32(gap_oe1), GAP_OE2 = SIMDSetOnei32(gap_oe2);
     DP_H2E2F = qp + qp_sn * abpt->m; qi = DP_H2E2F + dp_sn * matrix_row_n * 5;
 
     // for SET_F mask[pn], suf_min[pn], pre_min[logN]
     SIMDi *PRE_MASK, *SUF_MIN, *PRE_MIN, *GAP_E1S, *GAP_E2S;
-    PRE_MASK = (SIMDi*)SIMDMalloc((pn+1) * size, size), SUF_MIN = (SIMDi*)SIMDMalloc((pn+1) * size, size), PRE_MIN = (SIMDi*)SIMDMalloc(pn * size, size), GAP_E1S =  (SIMDi*)SIMDMalloc(log_n * size, size), GAP_E2S =  (SIMDi*)SIMDMalloc(log_n * size, size);
+    PRE_MASK = (SIMDi*)SIMDMalloc((pn+1) * size, size), SUF_MIN = (SIMDi*)SIMDMalloc((pn+1) * size, size); 
+    PRE_MIN = (SIMDi*)SIMDMalloc(pn * size, size), GAP_E1S = (SIMDi*)SIMDMalloc(log_n * size, size), GAP_E2S =  (SIMDi*)SIMDMalloc(log_n * size, size);
     for (i = 0; i < pn; ++i) {
         int32_t *pre_mask = (int32_t*)(PRE_MASK + i);
         for (j = 0; j <= i; ++j) pre_mask[j] = -1;
@@ -1612,26 +1667,34 @@ int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, i
         }
         pre_n[dp_i] = _pre_n;
     }
-    abpoa_cg_first_dp(abpt, graph, index_map, beg_node_id, end_node_id, dp_beg, dp_end, dp_beg_sn, dp_end_sn, pn, qlen, w, dp_sn, DP_H2E2F, SIMD_INF_MIN, inf_min, gap_open1, gap_ext1, gap_open2, gap_ext2, gap_oe1, gap_oe2);
+    abpoa_cg_first_dp(abpt, graph, index_map, beg_node_id, end_node_id, 
+                      dp_beg, dp_end, dp_beg_sn, dp_end_sn, pn, qlen, w, dp_sn, 
+                      DP_H2E2F, SIMD_INF_MIN, inf_min, gap_open1, gap_ext1, gap_open2, gap_ext2, gap_oe1, gap_oe2);
 
     for (index_i=beg_index+1, dp_i=1; index_i<end_index; ++index_i, ++dp_i) {
         if (index_map[index_i] == 0) continue;
         node_id = abpoa_graph_index_to_node_id(graph, index_i);
         SIMDi *q = qp + graph->node[node_id].base * qp_sn;
-        dp_h = DP_H2E2F + (dp_i*5) * dp_sn; dp_e1 = dp_h + dp_sn; dp_e2 = dp_e1 + dp_sn; dp_f1 = dp_e2 + dp_sn; dp_f2 = dp_f1 + dp_sn;
-        tot_dp_sn += abpoa_cg_dp(q, dp_h, dp_e1, dp_e2, dp_f1, dp_f2, pre_index, pre_n, index_i, dp_i, graph, abpt, dp_sn, pn, qlen, w, DP_H2E2F, SIMD_INF_MIN, GAP_O1, GAP_O2, GAP_E1, GAP_E2, GAP_OE1, GAP_OE2, GAP_E1S, GAP_E2S, PRE_MIN, PRE_MASK, SUF_MIN, log_n, dp_beg, dp_end, dp_beg_sn, dp_end_sn, end_node_id);
+        dp_h = DP_H2E2F + (dp_i*5) * dp_sn; 
+        dp_e1 = dp_h + dp_sn; dp_e2 = dp_e1 + dp_sn; 
+        dp_f1 = dp_e2 + dp_sn; dp_f2 = dp_f1 + dp_sn;
+        abpoa_cg_dp(q, dp_h, dp_e1, dp_e2, dp_f1, dp_f2, inf_min,
+                    pre_index, pre_n, beg_index, index_i, dp_i, 
+                    graph, abpt, dp_sn, pn, qlen, w, 
+                    DP_H2E2F, SIMD_INF_MIN, GAP_O1, GAP_O2, GAP_E1, GAP_E2, GAP_OE1, GAP_OE2, GAP_E1S, GAP_E2S, 
+                    PRE_MIN, PRE_MASK, SUF_MIN, log_n, 
+                    dp_beg, dp_end, dp_beg_sn, dp_end_sn, end_node_id);
         if (abpt->wb >= 0) {
-            beg_sn = dp_beg_sn[dp_i], end_sn = dp_end_sn[dp_i];
-            int max_i = abpoa_max(SIMD_INF_MIN, zero, inf_min, dp_h, qi, qlen, pn, beg_sn, end_sn);
-            abpoa_ada_max_i(max_i, graph, node_id);
+            int left_max_i, right_max_i;
+            abpoa_max(SIMD_INF_MIN, zero, inf_min, dp_h, qi, qlen, pn, dp_beg[dp_i], dp_end[dp_i], &left_max_i, &right_max_i);
+            abpoa_ada_max_i(left_max_i, right_max_i, graph, node_id);
         }
     }
-    // printf("dp_sn: %d\n", tot_dp_sn);
-    // printf("dp_sn: %d, node_n: %d, seq_n: %d\n", tot_dp_sn, graph->node_n, qlen);
     abpoa_global_get_max(graph, beg_index, end_node_id, index_map, DP_H2E2F, 5*dp_sn, qlen, dp_end, &best_score, &best_i, &best_j);
-#ifdef __DEBUG__
-    simd_abpoa_print_cg_matrix(int32_t, beg_index, end_index); fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score);
-#endif
+    if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) {
+        if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) simd_abpoa_print_cg_matrix(int32_t, beg_index, end_index); 
+        fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score);
+    }
     res->best_score = best_score;
     abpoa_cg_backtrack(DP_H2E2F, pre_index, pre_n, dp_beg, dp_end, dp_sn, abpt->m, mat, gap_ext1, gap_ext2, gap_oe1, gap_oe2, beg_index, best_i, best_j, qlen, graph, abpt, query, res);
     for (i = 0; i < matrix_row_n; ++i) free(pre_index[i]);
@@ -1644,8 +1707,10 @@ int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, i
 // align query to subgraph between beg_node_id and end_node_id (both are excluded)
 // generally: beg/end are the SRC/SINK_node
 int simd_abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int qlen, abpoa_res_t *res) {
-    // if (abpt->simd_flag == 0) err_fatal_simple("No SIMD instruction available.");
-
+    int old_verbose = abpt->verbose;
+    if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "beg_id: %d, end_id: %d, qlen: %d\n", beg_node_id, end_node_id, qlen);
+    // if (qlen == 8957) // for debug
+        // abpt->verbose = ABPOA_LONG_DEBUG_VERBOSE;
     int i, j, beg_index = ab->abg->node_id_to_index[beg_node_id], end_index = ab->abg->node_id_to_index[end_node_id];
     int gn = end_index - beg_index + 1;
     uint8_t *index_map = (uint8_t*)_err_calloc(ab->abg->node_n, sizeof(uint8_t));
@@ -1660,19 +1725,16 @@ int simd_abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int b
         }
     }
 
-#ifdef __DEBUG__
+#ifdef __SIMD_DEBUG__
     _simd_p32.inf_min = MAX_OF_TWO(abpt->gap_ext1, abpt->gap_ext2) * 31 +MAX_OF_THREE(INT32_MIN + abpt->min_mis, INT32_MIN + abpt->gap_open1 + abpt->gap_ext1, INT32_MIN + abpt->gap_open2 + abpt->gap_ext2);
     if (simd_abpoa_realloc(ab, gn, qlen, abpt, _simd_p32)) return 0;
-    if (abpt->gap_mode == ABPOA_CONVEX_GAP) abpoa_cg_global_align_sequence_to_graph_core(ab, beg_node_id, beg_index, end_node_id, end_index, index_map, qlen, query, abpt, _simd_p32, res);
+    if (abpt->gap_mode == ABPOA_CONVEX_GAP)
+        abpoa_cg_global_align_sequence_to_graph_core(ab, beg_node_id, beg_index, end_node_id, end_index, index_map, qlen, query, abpt, _simd_p32, res);
 #else
     int32_t max_score, bits, mem_ret=0, gap_ext1 = abpt->gap_ext1, gap_ext2 = abpt->gap_ext2;
     int32_t gap_oe1 = abpt->gap_open1+gap_ext1, gap_oe2 = abpt->gap_open2+gap_ext2;
-    // if (abpt->simd_flag & SIMD_AVX512F && !(abpt->simd_flag & SIMD_AVX512BW))
-    //    max_score = INT16_MAX + 1; // AVX512F has no 8/16 bits operations
-    // else {
     int len = qlen > gn ? qlen : gn;
     max_score = MAX_OF_TWO(qlen * abpt->max_mat, len * abpt->gap_ext1 + abpt->gap_open1);
-    // }
     if (max_score <= INT16_MAX - abpt->min_mis - gap_oe1 - gap_oe2) {
         _simd_p16.inf_min = MAX_OF_THREE(INT16_MIN + abpt->min_mis, INT16_MIN + gap_oe1, INT16_MIN + gap_oe2) + 31 * MAX_OF_TWO(gap_ext1, gap_ext2);
         mem_ret = simd_abpoa_realloc(ab, gn, qlen, abpt, _simd_p16);
@@ -1709,6 +1771,7 @@ int simd_abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int b
     }
 #endif
     free(index_map);
+    abpt->verbose = old_verbose;
     return 0;
 }