From e234dcef3061691293faa65c2dcb07a38622dc7c Mon Sep 17 00:00:00 2001 From: Yan Gao Date: Fri, 5 Apr 2024 16:20:11 -0400 Subject: [PATCH] fix instruction-dependent result difference and seg fault due to large memory usage (#66) --- Makefile | 18 +- TODO.md | 9 + include/abpoa.h | 5 + src/abpoa.c | 2 + src/abpoa.h | 5 + src/abpoa_align.c | 19 +- src/abpoa_output.c | 49 ++--- src/abpoa_seed.c | 60 +++--- src/simd_abpoa_align.c | 451 +++++++++++++++++++++++------------------ 9 files changed, 350 insertions(+), 268 deletions(-) create mode 100644 TODO.md diff --git a/Makefile b/Makefile index 0b6df3f..2c5f8f0 100644 --- a/Makefile +++ b/Makefile @@ -4,20 +4,28 @@ default: abpoa #CC = gcc OS := $(shell uname) ARCH := $(shell arch) -EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES -CFLAGS = -Wall -O3 $(EXTRA_FLAGS) +# add -fno-tree-vectorize to avoid certain vectorization errors in O3 optimization +# right now, we are using -O3 for the best performance, and no vectorization errors were found +EXTRA_FLAGS = -Wall -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES # -fno-tree-vectorize +DFLAGS = # for debug ifneq ($(debug),) - DFLAGS = -D __DEBUG__ + DFLAGS += -D __DEBUG__ endif +ifneq ($(sdebug),) + DFLAGS += -D __SIMD_DEBUG__ +endif + # for gdb ifneq ($(gdb),) - CFLAGS = -Wall -g ${DFLAGS} $(EXTRA_FLAGS) + OPT_FLAGS = -g else - CFLAGS = -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS) + OPT_FLAGS = -O3 endif +CFLAGS = OPT_FLAGS ${DFLAGS} $(EXTRA_FLAGS) + # for gprof ifneq ($(pg),) PG_FLAG = -pg diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..3c7e96d --- /dev/null +++ b/TODO.md @@ -0,0 +1,9 @@ +# To test after commit +1. run through + 1. SSE/AVX2/AVX512 + 2. 16/32bits + 3. global/local + 4. lg/ag/cg +2. same result + 1. SSE vs AVX2 vs AVX512 + 2. 16bits vs 32 bits \ No newline at end of file diff --git a/include/abpoa.h b/include/abpoa.h index 87bbdcf..ae9a6b7 100644 --- a/include/abpoa.h +++ b/include/abpoa.h @@ -38,6 +38,11 @@ #define ABPOA_HB 0 #define ABPOA_HC 1 +#define ABPOA_NONE_VERBOSE 0 +#define ABPOA_INFO_VERBOSE 1 +#define ABPOA_DEBUG_VERBOSE 2 +#define ABPOA_LONG_DEBUG_VERBOSE 3 + // NOTE: upper boundary of in_edge_n is pow(2,30) // for MATCH/MISMATCH: node_id << 34 | query_id << 4 | op // for INSERTION: query_id << 34 | op_len << 4 | op diff --git a/src/abpoa.c b/src/abpoa.c index 64105c1..031ee4b 100644 --- a/src/abpoa.c +++ b/src/abpoa.c @@ -54,6 +54,7 @@ const struct option abpoa_long_opt [] = { { "help", 0, NULL, 'h' }, { "version", 0, NULL, 'v' }, + { "verbose", 1, NULL, 'V'}, { 0, 0, 0, 0} }; @@ -122,6 +123,7 @@ int abpoa_usage(void) err_printf(" -h --help print this help usage information\n"); err_printf(" -v --version show version number\n"); + err_printf(" -V --verbose INT verbose level (0-2). 0: none, 1: information, 2: debug [0]\n"); err_printf("\n"); diff --git a/src/abpoa.h b/src/abpoa.h index 87bbdcf..ae9a6b7 100644 --- a/src/abpoa.h +++ b/src/abpoa.h @@ -38,6 +38,11 @@ #define ABPOA_HB 0 #define ABPOA_HC 1 +#define ABPOA_NONE_VERBOSE 0 +#define ABPOA_INFO_VERBOSE 1 +#define ABPOA_DEBUG_VERBOSE 2 +#define ABPOA_LONG_DEBUG_VERBOSE 3 + // NOTE: upper boundary of in_edge_n is pow(2,30) // for MATCH/MISMATCH: node_id << 34 | query_id << 4 | op // for INSERTION: query_id << 34 | op_len << 4 | op diff --git a/src/abpoa_align.c b/src/abpoa_align.c index 7eca502..3829392 100644 --- a/src/abpoa_align.c +++ b/src/abpoa_align.c @@ -133,7 +133,7 @@ abpoa_para_t *abpoa_init_para(void) { abpt->min_w = ABPOA_MIN_POA_WIN; abpt->progressive_poa = 0; // progressive partial order alignment - abpt->verbose = 0; + abpt->verbose = ABPOA_NONE_VERBOSE; // abpt->simd_flag = simd_check(); @@ -199,9 +199,7 @@ int abpoa_anchor_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weig // uint8_t *seq1; for (_i = 0; _i < n_seq; ++_i) { i = read_id_map[_i]; read_id = exist_n_seq + i; qlen = seq_lens[i]; whole_res.n_cigar = 0, whole_res.m_cigar = 0, whole_res.graph_cigar = 0; -#ifdef __DEBUG__ - fprintf(stderr, "seq: # %d\n", i); -#endif + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "seq: # %d\n", i); // seq-to-graph alignment and add alignment within each split window if (_i == 0) ai = 0; else ai = par_c[_i-1]; @@ -252,10 +250,7 @@ int abpoa_anchor_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weig for (; ai < par_c[_i]; ++ai) { end_tpos = ((par_anchors.a[ai] >> 32) & 0x7fffffff) - k + 1; end_id = tpos_to_node_id[end_tpos]; end_qpos = (uint32_t)par_anchors.a[ai] - k + 1; - -#ifdef __DEBUG__ - fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos); -#endif + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos); res.graph_cigar = 0; res.n_cigar = 0; abpoa_align_sequence_to_subgraph(ab, abpt, beg_id, end_id, qseq+beg_qpos, end_qpos-beg_qpos, &res); @@ -277,9 +272,7 @@ int abpoa_anchor_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weig } end_id = ABPOA_SINK_NODE_ID; end_qpos = seq_lens[i]; -#ifdef __DEBUG__ - fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos); -#endif + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos); res.graph_cigar = 0; res.n_cigar = 0; abpoa_align_sequence_to_subgraph(ab, abpt, beg_id, end_id, qseq+beg_qpos, end_qpos-beg_qpos, &res); abpoa_push_whole_cigar(&whole_res.n_cigar, &whole_res.m_cigar, &whole_res.graph_cigar, res.n_cigar, res.graph_cigar); @@ -307,9 +300,7 @@ int abpoa_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weights, in // uint8_t *seq1; for (i = 0; i < n_seq; ++i) { qlen = seq_lens[i]; qseq = seqs[i]; weight = weights[i]; read_id = exist_n_seq + i; -#ifdef __DEBUG__ - fprintf(stderr, "seq: # %d\n", i); -#endif + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "seq: # %d\n", i); res.graph_cigar = 0; res.n_cigar = 0; if (abpoa_align_sequence_to_graph(ab, abpt, qseq, qlen, &res) >= 0) { if (abpt->amb_strand && (res.best_score < MIN_OF_TWO(qlen, ab->abg->node_n-2) * abpt->max_mat * .3333)) { // TODO .3333 diff --git a/src/abpoa_output.c b/src/abpoa_output.c index 697962d..72c1192 100644 --- a/src/abpoa_output.c +++ b/src/abpoa_output.c @@ -728,7 +728,7 @@ int reassign_hap(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int rea // read_weight is NOT used here, no matter use_qv is set or not. // collect minimized set of het bases -int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids, int *het_poss, int **rc_weight, int msa_l, int n_seq, int m, int min_w, int read_ids_n) { +int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids, int *het_poss, int **rc_weight, int msa_l, int n_seq, int m, int min_w, int read_ids_n, int verbose) { int i, j, k, n, rank; uint64_t b, one = 1, *whole_read_ids = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); for (i = 0; i < n_seq; ++i) { @@ -782,12 +782,12 @@ int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids if (iden == 1) continue; het_poss[n_het_pos++] = rank; -#ifdef __DEBUG__ - fprintf(stderr, "%d\t", rank); - for (j = 0; j < m; ++j) { - fprintf(stderr, "%c: %d\t", "ACGT-"[j], rc_weight[rank][j]); - } fprintf(stderr, "\n"); -#endif + if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) { + fprintf(stderr, "%d\t", rank); + for (j = 0; j < m; ++j) { + fprintf(stderr, "%c: %d\t", "ACGT-"[j], rc_weight[rank][j]); + } fprintf(stderr, "\n"); + } } } free(n_branch); free(node_map); @@ -796,7 +796,8 @@ int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids // group read into clusters based on all het bases // initial cluster size could be > max_n_cons -int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***read_ids, int read_ids_n, int n_seq, int m, int min_w, int max_n_cons, uint64_t ***clu_read_ids, int *_m_clu) { +int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***read_ids, int read_ids_n, + int n_seq, int m, int min_w, int max_n_cons, uint64_t ***clu_read_ids, int *_m_clu, int verbose) { if (n_het_pos == 0) return 1; int i, j, k, n_clu = 0, m_clu = 2; int **clu_haps = (int**)_err_malloc(2 * sizeof(int*)); @@ -832,25 +833,25 @@ int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***rea } } if (n_clu < 2) err_fatal(__func__, "# haplotypes: %d\n", n_clu); -#ifdef __DEBUG__ - fprintf(stderr, "n_clu: %d\n", n_clu); - for (i = 0; i < n_clu; ++i) { - for (j = 0; j < n_het_pos; ++j) { - fprintf(stderr, "%d\t", clu_haps[i][j]); + if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) { + fprintf(stderr, "n_clu: %d\n", n_clu); + for (i = 0; i < n_clu; ++i) { + for (j = 0; j < n_het_pos; ++j) { + fprintf(stderr, "%d\t", clu_haps[i][j]); + } + fprintf(stderr, "\tsize: %d\n", clu_size[i]); } - fprintf(stderr, "\tsize: %d\n", clu_size[i]); } -#endif // assign haplotype with reads < min_w to haplotype with reads >= min_w // keep at most _max_n_cons_ haps and read ids, weight need to >= min_w n_clu = reassign_hap(clu_haps, clu_size, *clu_read_ids, read_ids_n, n_clu, min_w, max_n_cons, n_het_pos); -#ifdef __DEBUG__ - fprintf(stderr, "After re-assign: n_clu: %d\n", n_clu); - for (i = 0; i < n_clu; ++i) { - fprintf(stderr, "%d:\tsize: %d\n", i, clu_size[i]); + if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) { + fprintf(stderr, "After re-assign: n_clu: %d\n", n_clu); + for (i = 0; i < n_clu; ++i) { + fprintf(stderr, "%d:\tsize: %d\n", i, clu_size[i]); + } } -#endif for (i = 0; i < m_clu; ++i) free(clu_haps[i]); free(clu_haps); free(clu_size); *_m_clu = m_clu; return n_clu; @@ -858,7 +859,7 @@ int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***rea // read_weight is NOT used here // cluster reads into _n_clu_ groups based on heterogeneous bases -int abpoa_multip_read_clu(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq, int m, int max_n_cons, double min_freq, uint64_t ***clu_read_ids, int *_m_clu) { +int abpoa_multip_read_clu(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq, int m, int max_n_cons, double min_freq, uint64_t ***clu_read_ids, int *_m_clu, int verbose) { abpoa_set_msa_rank(abg, src_id, sink_id); int i, j, n_clu, m_clu, read_ids_n = (n_seq-1)/64+1; int msa_l = abg->node_id_to_msa_rank[sink_id]-1, min_w = MAX_OF_TWO(1, n_seq * min_freq); // TODO fastq-qual weight @@ -878,11 +879,11 @@ int abpoa_multip_read_clu(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq } // find min set of het nodes int *het_poss = (int*)_err_calloc(msa_l, sizeof(int)); - int n_het_pos = abpoa_set_het_row_column_ids_weight(abg, read_ids, het_poss, rc_weight, msa_l, n_seq, m, min_w, read_ids_n); + int n_het_pos = abpoa_set_het_row_column_ids_weight(abg, read_ids, het_poss, rc_weight, msa_l, n_seq, m, min_w, read_ids_n, verbose); if (n_het_pos < 1) n_clu = 1; // collect at most _max_n_cons_ haplotypes and corresponding read ids - else n_clu = abpoa_collect_clu_hap_read_ids(het_poss, n_het_pos, read_ids, read_ids_n, n_seq, m, min_w, max_n_cons, clu_read_ids, &m_clu); + else n_clu = abpoa_collect_clu_hap_read_ids(het_poss, n_het_pos, read_ids, read_ids_n, n_seq, m, min_w, max_n_cons, clu_read_ids, &m_clu, verbose); for (i = 0; i < msa_l; ++i) { for (j = 0; j < m; ++j) free(read_ids[i][j]); @@ -906,7 +907,7 @@ void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt) { int n_clu, m_clu, n_seq = ab->abs->n_seq; uint64_t **clu_read_ids; int read_ids_n = (n_seq-1)/64+1; - if (abpt->max_n_cons > 1) n_clu = abpoa_multip_read_clu(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, n_seq, abpt->m, abpt->max_n_cons, abpt->min_freq, &clu_read_ids, &m_clu); + if (abpt->max_n_cons > 1) n_clu = abpoa_multip_read_clu(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, n_seq, abpt->m, abpt->max_n_cons, abpt->min_freq, &clu_read_ids, &m_clu, abpt->verbose); else n_clu = 1; abpoa_cons_t *abc = ab->abc; diff --git a/src/abpoa_seed.c b/src/abpoa_seed.c index 13cbb05..277413e 100644 --- a/src/abpoa_seed.c +++ b/src/abpoa_seed.c @@ -231,7 +231,7 @@ void mm_aa_sketch(void *km, const uint8_t *str, int len, int w, int k, uint32_t int abpoa_build_guide_tree(abpoa_para_t *abpt, int n_seq, ab_u128_v *mm, int *tree_id_map) { if (mm->n == 0) return 0; - if (abpt->verbose > 0) fprintf(stderr, "[%s] Building progressive guide tree ... ", __func__); + if (abpt->verbose >= ABPOA_INFO_VERBOSE) fprintf(stderr, "[%s] Building progressive guide tree ... ", __func__); size_t i, _i, j; int rid1, rid2; // mm_hit_n: mimizer hits between each two sequences // 0: 0 // 1: 0 1 @@ -319,7 +319,7 @@ int abpoa_build_guide_tree(abpoa_para_t *abpt, int n_seq, ab_u128_v *mm, int *tr } free(mm_hit_n); free(jac_sim); - if (abpt->verbose > 0) fprintf(stderr, "done!\n"); + if (abpt->verbose >= ABPOA_INFO_VERBOSE) fprintf(stderr, "done!\n"); return 0; } @@ -382,7 +382,9 @@ int get_local_chain_score(int j_end_tpos, int j_end_qpos, int i_end_anchor_i, ab // local chains: // x: strand | end_tpos | end_qpos // y: end_anchor_i | start_anchor_i -int abpoa_dp_chaining_of_local_chains(void *km, ab_u128_t *local_chains, int n_local_chains, ab_u64_v *anchors, int *score, int *pre_id, ab_u64_v *par_anchors, int min_w, int tlen, int qlen) { +int abpoa_dp_chaining_of_local_chains(void *km, ab_u128_t *local_chains, int n_local_chains, + ab_u64_v *anchors, int *score, int *pre_id, ab_u64_v *par_anchors, + int min_w, int tlen, int qlen, int verbose) { int i, j, st, score1, global_max_score=INT32_MIN, global_max_i=-1; int *chain_score = (int*)kmalloc(km, n_local_chains * 4), *pre_chain_id = (int*)kmalloc(km, n_local_chains * 4); size_t _n = par_anchors->n; @@ -452,13 +454,13 @@ int abpoa_dp_chaining_of_local_chains(void *km, ab_u128_t *local_chains, int n_l par_anchors->a[par_anchors->n-i-1] = tmp; } -#ifdef __DEBUG__ - for (i = _n; i < par_anchors->n; ++i) { - uint64_t ia = par_anchors->a[i]; - // strand, rpos, qpos - fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia)); + if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) { + for (i = _n; i < par_anchors->n; ++i) { + uint64_t ia = par_anchors->a[i]; + // strand, rpos, qpos + fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia)); + } } -#endif kfree(km, chain_score), kfree(km, pre_chain_id); return 0; } @@ -482,7 +484,7 @@ static int get_chain_score(int max_bw, int *score, int i_qpos, int i_tpos, int j // Dynamic Programming-based Chaining for global alignment mode // anchors: // strand<<63 | tpos<<32 | qpos -int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_para_t *abpt, int tlen, int qlen) { +int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_para_t *abpt, int tlen, int qlen, int verbose) { int i, j, st, n_a = anchors->n; int *score = (int32_t*)kmalloc(km, n_a * 4), *pre_id = (int32_t*)kmalloc(km, n_a * 4), *end_pos = (int32_t*)kmalloc(km, n_a * 4); memset(end_pos, 0, n_a * 4); @@ -516,9 +518,7 @@ int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_ if (pre_id[j] >= 0) end_pos[pre_id[j]] = i; } -#ifdef __DEBUG__ - fprintf(stderr, "%d pre_id: %d, score: %d, tpos: %d, qpos: %d\n", i, max_j, max_score, i_tpos, i_qpos); -#endif + if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) fprintf(stderr, "%d pre_id: %d, score: %d, tpos: %d, qpos: %d\n", i, max_j, max_score, i_tpos, i_qpos); score[i] = max_score, pre_id[i] = max_j; } @@ -570,7 +570,8 @@ int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_ } radix_sort_ab_128x(local_chains+tot_chain_i+1, local_chains + n_local_chains); - abpoa_dp_chaining_of_local_chains(km, local_chains+tot_chain_i+1, n_local_chains-1-tot_chain_i, anchors, score, pre_id, par_anchors, min_w, tlen, qlen); + abpoa_dp_chaining_of_local_chains(km, local_chains+tot_chain_i+1, n_local_chains-1-tot_chain_i, + anchors, score, pre_id, par_anchors, min_w, tlen, qlen, abpt->verbose); kfree(km, score); kfree(km, pre_id); kfree(km, end_pos); kfree(km, local_chains); return 0; @@ -630,7 +631,7 @@ int LIS(void *km, int tot_n, uint64_t *rank, int n) { // output: // anchor list size: n // list of anchors: anchors -int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w) { +int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w, int verbose) { size_t i, j, n_a = anchors->n, n_for, n_rev; uint64_t *for_rank = (uint64_t*)kmalloc(km, sizeof(uint64_t) * n_a); uint64_t *rev_rank = (uint64_t*)kmalloc(km, sizeof(uint64_t) * n_a); @@ -663,10 +664,8 @@ int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w) n = n_rev; rank = rev_rank; kfree(km, for_rank); } // filter anchors - int last_tpos = -1, last_qpos = -1, cur_tpos, cur_qpos; -#ifdef __DEBUG__ - size_t _n = par_anchors->n; -#endif + int last_tpos = -1, last_qpos = -1, cur_tpos, cur_qpos; size_t _n = 0; + if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) _n = par_anchors->n; for (i = 0; i < n; ++i) { j = (int)rank[i]-1; cur_tpos = (anchors->a[j] >> 32) & 0x7fffffff; @@ -677,18 +676,19 @@ int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w) kv_push(uint64_t, 0, *par_anchors, anchors->a[j]); // store LIS anchors into par_anchors last_tpos = cur_tpos; last_qpos = cur_qpos; } -#ifdef __DEBUG__ - for (i = _n; i < par_anchors->n; ++i) { - uint64_t ia = par_anchors->a[i]; - // strand, rpos, qpos - fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia)); + + if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) { + for (i = _n; i < par_anchors->n; ++i) { + uint64_t ia = par_anchors->a[i]; + // strand, rpos, qpos + fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia)); + } } -#endif return 0; } int abpoa_collect_mm(void *km, uint8_t **seqs, int *seq_lens, int n_seq, abpoa_para_t *abpt, ab_u128_v *mm, int *mm_c) { - if (abpt->verbose > 0) fprintf(stderr, "[%s] Collecting minimizers ... ", __func__); + if (abpt->verbose >= ABPOA_INFO_VERBOSE) fprintf(stderr, "[%s] Collecting minimizers ... ", __func__); int i; mm_c[0] = 0; for (i = 0; i < n_seq; ++i) { // collect minimizers @@ -696,7 +696,7 @@ int abpoa_collect_mm(void *km, uint8_t **seqs, int *seq_lens, int n_seq, abpoa_p else mm_sketch(km, seqs[i], seq_lens[i], abpt->w, abpt->k, i, 0, abpt->amb_strand, mm); mm_c[i+1] = mm->n; } - if (abpt->verbose > 0) fprintf(stderr, "done!\n"); + if (abpt->verbose >= ABPOA_INFO_VERBOSE) fprintf(stderr, "done!\n"); return mm->n; } @@ -731,11 +731,9 @@ int abpoa_build_guide_tree_partition(uint8_t **seqs, int *seq_lens, int n_seq, a // collect minimizer hit anchors between t and q collect_anchors1(km, &anchors, mm1, mm_c, tid, qid, seq_lens[qid], abpt->k); // filtering and only keep LIS anchors -#ifdef __DEBUG__ - fprintf(stderr, "%d vs %d (tot_n: %ld)\n", tid, qid, anchors.n); -#endif + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) fprintf(stderr, "%d vs %d (tot_n: %ld)\n", tid, qid, anchors.n); // alignment mode: different chaining result for global/local/extend - abpoa_dp_chaining(km, &anchors, par_anchors, abpt, seq_lens[tid], seq_lens[qid]); + abpoa_dp_chaining(km, &anchors, par_anchors, abpt, seq_lens[tid], seq_lens[qid], abpt->verbose); par_c[i] = par_anchors->n; kfree(km, anchors.a); } diff --git a/src/simd_abpoa_align.c b/src/simd_abpoa_align.c index 9c2363c..6362ec1 100644 --- a/src/simd_abpoa_align.c +++ b/src/simd_abpoa_align.c @@ -35,6 +35,16 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; #define SIMDTotalBytes 16 #endif +void simd_output_pre_nodes(int *pre_index, int pre_n, int dp_i, int dp_j, int cur_op, int verbose) { + if (verbose >= ABPOA_LONG_DEBUG_VERBOSE) { + int k; + fprintf(stderr, "%d, %d, %d\t", dp_i, dp_j, cur_op); + for (k = 0; k < pre_n; ++k) { /* print all pre node for node i*/ + fprintf(stderr, "%d, ", pre_index[k]); + } fprintf(stderr, "\n"); + } +} + #define print_simd(s, str, score_t) { \ int _i; score_t *_a = (score_t*)(s); \ fprintf(stderr, "%s\t", str); \ @@ -43,7 +53,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } fprintf(stderr, "\n"); \ } - #define simd_abpoa_print_lg_matrix(score_t, beg_index, end_index) { \ for (j = 0; j < end_index-beg_index; ++j) { \ fprintf(stderr, "index: %d\t", j); \ @@ -56,7 +65,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } #define simd_abpoa_print_ag_matrix(score_t, beg_index, end_index) { \ - for (j = beg_index; j < end_index; ++j) { \ + for (j = 0; j < end_index-beg_index; ++j) { \ fprintf(stderr, "index: %d\t", j); \ dp_h = DP_HEF + j * 3 * dp_sn; dp_e1 = dp_h + dp_sn; \ _dp_h = (score_t*)dp_h, _dp_e1 = (score_t*)dp_e1; \ @@ -70,7 +79,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; score_t *_dp_h = (score_t*)dp_h, *_dp_e1 = (score_t*)dp_e1; \ score_t *_dp_e2 = (score_t*)dp_e2, *_dp_f1 = (score_t*)dp_f1, *_dp_f2 = (score_t*)dp_f2; \ fprintf(stderr, "%s\tindex: %d\t", str, index_i); \ - for (i = dp_beg[index_i]; i <= (dp_end[index_i]/16+1)*16-1; ++i) { \ + for (i = dp_beg[index_i]; i <= dp_end[index_i]; ++i) { \ fprintf(stderr, "%d:(%d,%d,%d,%d,%d)\t", i, _dp_h[i], _dp_e1[i],_dp_e2[i], _dp_f1[i],_dp_f2[i]); \ } fprintf(stderr, "\n"); \ } @@ -169,7 +178,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } \ } \ if (hit == 0) err_fatal_simple("Error in lg_backtrack."); \ - /* fprintf(stderr, "%d, %d, (%d)\n", i, j, indel_first); */ \ + simd_output_pre_nodes(pre_index[i], pre_n[i], i, j, 0, abpt->verbose); \ } \ if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1); \ /* reverse cigar */ \ @@ -273,7 +282,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } \ } \ if (hit == 0) err_fatal_simple("Error in ag_backtrack."); \ - /* fprintf(stderr, "%d, %d, %d (indel_first: %d)\n", i, j, cur_op, indel_first); */ \ + simd_output_pre_nodes(pre_index[i], pre_n[i], i, j, cur_op, abpt->verbose); \ } \ if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1); \ /* reverse cigar */ \ @@ -290,7 +299,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; abpoa_cigar_t *cigar = 0; \ i = best_i, j = best_j, _start_i = best_i, _start_j = best_j; \ id = abpoa_graph_index_to_node_id(graph, i+beg_index); \ - if (best_j < qlen) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, qlen-j, -1, qlen-1); \ + if (best_j < qlen) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, qlen-best_j, -1, qlen-1); \ SIMDi *dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h; \ int indel_first = 1; /* prefer to keep gaps at the end */ \ while (i > 0 && j > 0) { \ @@ -305,24 +314,25 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue; \ _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); \ if (_pre_dp_h[j-1] + s == _dp_h[j]) { \ - cur_op = ABPOA_ALL_OP; hit = 1; \ cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1); \ i = pre_i; --j; id = abpoa_graph_index_to_node_id(graph, i+beg_index); hit = 1; \ dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h; \ + cur_op = ABPOA_ALL_OP; \ ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0; \ break; \ } \ } \ } \ if (hit == 0 && cur_op & ABPOA_E_OP) { /* deletion */ \ + _dp_e1 = (score_t*)(dp_h + dp_sn); _dp_e2 = (score_t*)(dp_h + dp_sn * 2); \ for (k = 0; k < pre_n[i]; ++k) { \ pre_i = pre_index_i[k]; \ if (j < dp_beg[pre_i] || j > dp_end[pre_i]) continue; \ + _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); \ if (cur_op & ABPOA_E1_OP) { \ _pre_dp_e1 = (score_t*)(DP_H2E2F + dp_sn * (pre_i * 5 + 1)); \ if (cur_op & ABPOA_M_OP) { \ if (_dp_h[j] == _pre_dp_e1[j]) { \ - _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); \ if (_pre_dp_h[j] - gap_oe1 == _pre_dp_e1[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP; \ else cur_op = ABPOA_E1_OP; \ hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1); \ @@ -331,9 +341,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; break; \ } \ } else { \ - _dp_e1 = (score_t*)(dp_h + dp_sn); \ if (_dp_e1[j] == _pre_dp_e1[j] - gap_ext1) { \ - _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); \ if (_pre_dp_h[j] - gap_oe1 == _pre_dp_e1[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP; \ else cur_op = ABPOA_E1_OP; \ hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1); \ @@ -347,7 +355,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; _pre_dp_e2 = (score_t*)(DP_H2E2F + dp_sn * (pre_i * 5 + 2)); \ if (cur_op & ABPOA_M_OP) { \ if (_dp_h[j] == _pre_dp_e2[j]) { \ - _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); \ if (_pre_dp_h[j] - gap_oe2 == _pre_dp_e2[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP; \ else cur_op = ABPOA_E2_OP; \ hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1); \ @@ -356,9 +363,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; break; \ } \ } else { \ - _dp_e2 = (score_t*)(dp_h + dp_sn * 2); \ if (_dp_e2[j] == _pre_dp_e2[j] - gap_ext2) { \ - _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); \ if (_pre_dp_h[j] - gap_oe2 == _pre_dp_e2[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP; \ else cur_op = ABPOA_E2_OP; \ hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1); \ @@ -400,16 +405,16 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; ++res->n_aln_bases; \ } \ } \ - if (hit == 0 && cur_op & ABPOA_M_OP && indel_first == 1) { /* match/mismatch */ \ + if (hit == 0 && (cur_op & ABPOA_M_OP) && (indel_first == 1)) { /* match/mismatch */ \ for (k = 0; k < pre_n[i]; ++k) { \ pre_i = pre_index_i[k]; \ if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue; \ _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); \ if (_pre_dp_h[j-1] + s == _dp_h[j]) { \ - cur_op = ABPOA_ALL_OP; hit = 1; \ cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1); \ i = pre_i; --j; id = abpoa_graph_index_to_node_id(graph, i+beg_index); hit = 1; \ dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h; \ + cur_op = ABPOA_ALL_OP; \ ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0; \ indel_first = 0; \ break; \ @@ -417,7 +422,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } \ } \ if (hit == 0) err_fatal_simple("Error in cg_backtrack."); \ - /* fprintf(stderr, "%d, %d, %d\n", i, j, cur_op); */ \ + simd_output_pre_nodes(pre_index[i], pre_n[i], i, j, cur_op, abpt->verbose); \ } \ if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1); \ /* reverse cigar */ \ @@ -434,16 +439,16 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; #define simd_abpoa_var(score_t, sp, SIMDSetOne, SIMDShiftOneN) \ /* int tot_dp_sn = 0; */ \ abpoa_graph_t *graph = ab->abg; abpoa_simd_matrix_t *abm = ab->abm; \ - int matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1; \ + int64_t matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1; \ int **pre_index, *pre_n, _pre_index, _pre_n, pre_i; \ int i, j, k, *dp_beg, *dp_beg_sn, *dp_end, *dp_end_sn, node_id, index_i, dp_i; \ int beg, end, beg_sn, end_sn, _beg_sn, _end_sn, pre_beg_sn, pre_end, sn_i; \ - int pn, log_n, size, qp_sn, dp_sn; /* pn: value per SIMDi, qp_sn/dp_sn/d_sn: segmented length*/ \ + int pn, log_n, size; int64_t qp_sn, dp_sn; /* pn: # value per SIMDi, qp_sn/dp_sn: segmented length*/ \ SIMDi *dp_h, *pre_dp_h, *qp, *qi=NULL; \ score_t *_dp_h=NULL, *_qi, best_score = sp.inf_min, inf_min = sp.inf_min; \ int *mat = abpt->mat, m = abpt->m; score_t gap_ext1 = abpt->gap_ext1; \ int w = abpt->wb < 0 ? qlen : abpt->wb+(int)(abpt->wf*qlen); /* when w < 0, do whole global */ \ - int best_i = 0, best_j = 0, best_id = 0, max, max_i=-1; \ + int best_i = 0, best_j = 0, best_id = 0, max, left_max_i=-1, right_max_i=-1; \ SIMDi zero = SIMDSetZeroi(), SIMD_INF_MIN = SIMDSetOne(inf_min); \ pn = sp.num_of_value; qp_sn = dp_sn = (matrix_col_n + pn - 1) / pn; \ log_n = sp.log_num, size = sp.size; qp = abm->s_mem; \ @@ -564,7 +569,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; dp_beg[0] = 0, dp_end[0] = qlen; \ } \ dp_beg_sn[0] = (dp_beg[0])/pn; dp_end_sn[0] = (dp_end[0])/pn; \ - dp_beg[0] = dp_beg_sn[0] * pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1; \ dp_h = DP_H; _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1); \ } @@ -582,7 +586,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; dp_beg[0] = 0, dp_end[0] = qlen; \ } \ dp_beg_sn[0] = (dp_beg[0])/pn; dp_end_sn[0] = (dp_end[0])/pn; \ - dp_beg[0] = dp_beg_sn[0] * pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1; \ dp_h = DP_HEF; dp_e1 = dp_h + dp_sn; dp_f1 = dp_e1 + dp_sn; \ _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1); \ } @@ -601,7 +604,6 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; dp_beg[0] = 0, dp_end[0] = qlen; \ } \ dp_beg_sn[0] = (dp_beg[0])/pn; dp_end_sn[0] = (dp_end[0])/pn; \ - dp_beg[0] = dp_beg_sn[0] * pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1; \ dp_h = DP_H2E2F; dp_e1 = dp_h+dp_sn; dp_e2 = dp_e1+dp_sn; dp_f1 = dp_e2+dp_sn; dp_f2 = dp_f1+dp_sn; \ _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1); \ } @@ -609,7 +611,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; #define simd_abpoa_lg_first_dp(score_t) { \ simd_abpoa_lg_first_row; \ if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ - for (i = 0; i <= _end_sn; ++i) \ + for (i = 0; i <= _end_sn; ++i) \ dp_h[i] = zero; \ } else { \ for (i = 0; i <= _end_sn; ++i) { \ @@ -625,7 +627,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; #define simd_abpoa_ag_first_dp(score_t) { \ simd_abpoa_ag_first_row; \ if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ - for (i = 0; i <= _end_sn; ++i) \ + for (i = 0; i <= _end_sn; ++i) \ dp_h[i] = dp_e1[i] = dp_f1[i] = zero; \ } else { \ for (i = 0; i <= _end_sn; ++i) { \ @@ -643,7 +645,7 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; #define simd_abpoa_cg_first_dp(score_t) { \ simd_abpoa_cg_first_row; \ if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ - for (i = 0; i <= _end_sn; ++i) \ + for (i = 0; i <= _end_sn; ++i) \ dp_h[i] = dp_e1[i] = dp_e2[i] = dp_f1[i] = dp_f2[i] = zero; \ } else { \ for (i = 0; i <= _end_sn; ++i) { \ @@ -698,47 +700,48 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } \ } -#define simd_abpoa_lg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub) { \ - node_id = abpoa_graph_index_to_node_id(graph, index_i); \ - SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain; \ - dp_h = &DP_H[dp_i * dp_sn]; _dp_h = (score_t*)dp_h; \ - int min_pre_beg_sn, max_pre_end_sn; \ - if (abpt->wb < 0) { \ - beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen; \ - beg_sn = dp_beg_sn[dp_i] = (dp_beg[dp_i])/pn; end_sn = dp_end_sn[dp_i] = (dp_end[dp_i])/pn; \ - min_pre_beg_sn = 0, max_pre_end_sn = end_sn; \ - } else { \ - beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen); \ - beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; \ - for (i = 0; i < pre_n[dp_i]; ++i) { \ - pre_i = pre_index[dp_i][i]; \ - if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i]; \ - if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i]; \ - } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn; \ - dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn; \ - end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1; \ - } \ - /* loop query */ \ - /* first pre_node */ \ - pre_i = pre_index[dp_i][0]; \ - pre_dp_h = DP_H + pre_i * dp_sn; \ - pre_end = dp_end[pre_i]; \ - pre_beg_sn = dp_beg_sn[pre_i]; \ - /* set M from (pre_i, q_i-1), E from (pre_i, q_i) */ \ - if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ - _beg_sn = 0, _end_sn = end_sn; first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN); \ - } else { \ - if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN); \ - else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN); \ - _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1); \ - for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN; \ - for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN; \ - } \ - for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */ \ - remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN); \ - dp_h[sn_i] = SIMDMax(SIMDAdd(SIMDOri(first, remain), q[sn_i]), SIMDSub(pre_dp_h[sn_i], GAP_E1)); \ - first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN); \ - } \ +#define simd_abpoa_lg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub) { \ + node_id = abpoa_graph_index_to_node_id(graph, index_i); \ + SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain; \ + dp_h = &DP_H[dp_i * dp_sn]; _dp_h = (score_t*)dp_h; \ + int min_pre_beg, min_pre_beg_sn, max_pre_end_sn; \ + if (abpt->wb < 0) { \ + beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen; \ + beg_sn = dp_beg_sn[dp_i] = (dp_beg[dp_i])/pn; end_sn = dp_end_sn[dp_i] = (dp_end[dp_i])/pn; \ + min_pre_beg_sn = 0, max_pre_end_sn = end_sn; \ + } else { \ + beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen); \ + beg_sn = beg / pn; min_pre_beg = min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; \ + for (i = 0; i < pre_n[dp_i]; ++i) { \ + pre_i = pre_index[dp_i][i]; \ + if (min_pre_beg > dp_beg[pre_i]) min_pre_beg = dp_beg[pre_i], min_pre_beg_sn = dp_beg_sn[pre_i]; \ + if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i]; \ + } \ + if (beg_sn < min_pre_beg_sn) { \ + beg = min_pre_beg; beg_sn = min_pre_beg_sn; \ + } \ + dp_beg_sn[dp_i] = beg_sn; dp_beg[dp_i] = beg; end_sn = dp_end_sn[dp_i] = end/pn; dp_end[dp_i] = end; \ + } \ + /* loop query */ \ + /* first pre_node */ \ + pre_i = pre_index[dp_i][0]; \ + pre_dp_h = DP_H + pre_i * dp_sn; \ + pre_end = dp_end[pre_i]; pre_beg_sn = dp_beg_sn[pre_i]; \ + /* set M from (pre_i, q_i-1), E from (pre_i, q_i) */ \ + if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ + _beg_sn = 0, _end_sn = end_sn; first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN); \ + } else { \ + if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN); \ + else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN); \ + _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1); \ + for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN; \ + for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN; \ + } \ + for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */ \ + remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN); \ + dp_h[sn_i] = SIMDMax(SIMDAdd(SIMDOri(first, remain), q[sn_i]), SIMDSub(pre_dp_h[sn_i], GAP_E1)); \ + first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN); \ + } \ /* get max m and e */ \ for (i = 1; i < pre_n[dp_i]; ++i) { \ pre_i = pre_index[dp_i][i]; \ @@ -759,23 +762,25 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN); \ } /* now we have max(h,e) stored at dp_h */ \ } \ - /* new F start */ \ - first = SIMDOri(SIMDAndi(dp_h[beg_sn], PRE_MASK[0]), SUF_MIN[0]); \ - for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { \ - if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ - set_num = pn; \ - } else { \ - if (sn_i < min_pre_beg_sn) { \ - _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n"); \ - } else if (sn_i > max_pre_end_sn) { \ - set_num = sn_i == max_pre_end_sn+1 ? 1 : 0; \ - } else set_num = pn; \ - } \ - dp_h[sn_i] = SIMDMax(dp_h[sn_i], first); \ - SIMD_SET_F(dp_h[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN); \ + /* new F start */ \ + for (i = beg_sn * pn; i < beg; ++i) _dp_h[i] = inf_min; \ + for (i = end+1; i < (end_sn+1)*pn; ++i) _dp_h[i] = inf_min; \ + first = SIMDOri(SIMDAndi(dp_h[beg_sn], PRE_MASK[0]), SUF_MIN[0]); \ + for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { \ + if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ + set_num = pn; \ + } else { \ + if (sn_i < min_pre_beg_sn) { \ + _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n"); \ + } else if (sn_i > max_pre_end_sn) { \ + set_num = sn_i == max_pre_end_sn+1 ? 1 : 0; \ + } else set_num = pn; \ + } \ + dp_h[sn_i] = SIMDMax(dp_h[sn_i], first); \ + SIMD_SET_F(dp_h[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN); \ first = SIMDOri(SIMDAndi(SIMDShiftRight(SIMDSub(dp_h[sn_i], GAP_E1), SIMDTotalBytes-SIMDShiftOneN), PRE_MASK[0]), SUF_MIN[0]); \ - } \ - if (abpt->align_mode == ABPOA_LOCAL_MODE) for (sn_i = 0; sn_i <= end_sn; ++sn_i) dp_h[sn_i] = SIMDMax(zero, dp_h[sn_i]); \ + } \ + if (abpt->align_mode == ABPOA_LOCAL_MODE) for (sn_i = 0; sn_i <= end_sn; ++sn_i) dp_h[sn_i] = SIMDMax(zero, dp_h[sn_i]); \ } #define simd_abpoa_ag_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual) { \ @@ -783,21 +788,23 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain; \ dp_h = DP_HEF + dp_i * 3 * dp_sn; dp_e1 = dp_h + dp_sn; dp_f1 = dp_e1 + dp_sn; \ _dp_h = (score_t*)dp_h, _dp_e1 = (score_t*)dp_e1, _dp_f1 = (score_t*)dp_f1; \ - int min_pre_beg_sn, max_pre_end_sn; \ + int min_pre_beg, min_pre_beg_sn, max_pre_end_sn; \ if (abpt->wb < 0) { \ beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen; \ beg_sn = dp_beg_sn[dp_i] = (dp_beg[dp_i])/pn; end_sn = dp_end_sn[dp_i] = (dp_end[dp_i])/pn; \ min_pre_beg_sn = 0, max_pre_end_sn = end_sn; \ } else { \ beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen); \ - beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; \ + beg_sn = beg / pn; min_pre_beg = min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; \ for (i = 0; i < pre_n[dp_i]; ++i) { \ pre_i = pre_index[dp_i][i]; \ - if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i]; \ + if (min_pre_beg > dp_beg[pre_i]) min_pre_beg = dp_beg[pre_i], min_pre_beg_sn = dp_beg_sn[pre_i]; \ if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i]; \ - } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn; \ - dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn; \ - end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1; \ + } \ + if (beg_sn < min_pre_beg_sn) { \ + beg_sn = min_pre_beg_sn; beg = min_pre_beg; \ + } \ + dp_beg_sn[dp_i] = beg_sn; dp_beg[dp_i] = beg; end_sn = dp_end_sn[dp_i] = end/pn; dp_end[dp_i] = end; \ } \ /* loop query */ \ /* first pre_node */ \ @@ -854,6 +861,8 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { /* SIMD parallelization */ \ dp_h[sn_i] = SIMDAdd(dp_h[sn_i], q[sn_i]); \ } \ + for (i = beg_sn * pn; i < beg; ++i) _dp_h[i] = _dp_e1[i] = inf_min; \ + for (i = end+1; i < (end_sn+1)*pn; ++i) _dp_h[i] = _dp_e1[i] = inf_min; \ /* new F start */ \ first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneN), SIMDTotalBytes-SIMDShiftOneN); \ for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { \ @@ -889,25 +898,31 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain; \ dp_h = DP_H2E2F+dp_i*5*dp_sn; dp_e1 = dp_h+dp_sn; dp_e2 = dp_e1+dp_sn; dp_f1 = dp_e2+dp_sn; dp_f2 = dp_f1+dp_sn; \ _dp_h=(score_t*)dp_h, _dp_e1=(score_t*)dp_e1, _dp_e2=(score_t*)dp_e2, _dp_f1=(score_t*)dp_f1, _dp_f2=(score_t*)dp_f2; \ - int min_pre_beg_sn, max_pre_end_sn; \ + int min_pre_beg, min_pre_beg_sn, max_pre_end_sn; \ if (abpt->wb < 0) { \ beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen; \ beg_sn = dp_beg_sn[dp_i] = beg/pn; end_sn = dp_end_sn[dp_i] = end/pn; \ min_pre_beg_sn = 0, max_pre_end_sn = end_sn; \ } else { \ beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen); \ - beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; \ + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) \ + fprintf(stderr, "index: %d (node: %d): raw beg: %d, end: %d\n", index_i-beg_index, node_id, beg, end); \ + beg_sn = beg / pn; min_pre_beg = min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; \ for (i = 0; i < pre_n[dp_i]; ++i) { \ pre_i = pre_index[dp_i][i]; \ - if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i]; \ + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) \ + fprintf(stderr, "\tpre_i: %d, pre_dp_beg: %d, pre_dp_beg_sn: %d\n", pre_i, dp_beg[pre_i], dp_beg_sn[pre_i]); \ + if (min_pre_beg > dp_beg[pre_i]) min_pre_beg = dp_beg[pre_i], min_pre_beg_sn = dp_beg_sn[pre_i]; \ if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i]; \ - } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn; \ - dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn; \ - end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1; \ - /* fprintf(stderr, "index: %d, beg: %d, end: %d, beg_sn: %d, end_sn: %d\n", index_i, beg, end, beg_sn, end_sn); */ \ + } \ + if (beg_sn < min_pre_beg_sn) { \ + assert(beg < min_pre_beg); beg = min_pre_beg; beg_sn = min_pre_beg_sn; \ + } \ + dp_beg_sn[dp_i] = beg_sn; dp_beg[dp_i] = beg; end_sn = dp_end_sn[dp_i] = end/pn; dp_end[dp_i] = end; \ + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) { \ + fprintf(stderr, "index: %d (node: %d): beg: %d, end: %d, beg_sn: %d, end_sn: %d, max_left: %d, max_right: %d\n", index_i-beg_index, node_id, beg, end, beg_sn, end_sn, abpoa_graph_node_id_to_max_pos_left(graph, node_id), abpoa_graph_node_id_to_max_pos_right(graph, node_id)); \ + } \ } \ - /* fprintf(stderr, "%d: beg, end: %d, %d\n", index_i, beg, end); */ \ - /* tot_dp_sn += (end_sn - beg_sn + 1); */ \ /* loop query */ \ /* first pre_node */ \ pre_i = pre_index[dp_i][0]; \ @@ -968,10 +983,12 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } \ } \ /* compare M, E, and F */ \ - /* fprintf(stderr, "5 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, _beg_sn, _end_sn); */ \ + /* fprintf(stderr, "5 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, beg_sn, end_sn); */ \ for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { /* SIMD parallelization */ \ dp_h[sn_i] = SIMDAdd(dp_h[sn_i], q[sn_i]); \ } \ + for (i = beg_sn * pn; i < beg; ++i) _dp_h[i] = _dp_e1[i] = _dp_e2[i] = inf_min; \ + for (i = end+1; i < (end_sn+1)*pn; ++i) _dp_h[i] = _dp_e1[i] = _dp_e2[i] = inf_min; \ /* new F start */ \ first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneN), SIMDTotalBytes-SIMDShiftOneN); \ SIMDi first2 = first; \ @@ -1042,28 +1059,27 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; #define simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater) { \ /* select max dp_h */ \ - max = inf_min, max_i = -1; \ - SIMDi a = dp_h[end_sn], b = qi[end_sn]; \ - if (end_sn == qlen / pn) SIMDSetIfGreater(a, zero, b, SIMD_INF_MIN, a); \ - for (i = beg_sn; i < end_sn; ++i) { \ - SIMDGetIfGreater(b, a, dp_h[i], a, qi[i], b); \ - } \ - _dp_h = (score_t*)&a, _qi = (score_t*)&b; \ - for (i = 0; i < pn; ++i) { \ + max = inf_min, left_max_i = right_max_i = -1; \ + _dp_h = (score_t*)dp_h, _qi = (score_t*)qi; \ + for (i = beg; i <= end; ++i) { \ if (_dp_h[i] > max) { \ - max = _dp_h[i]; max_i = _qi[i]; \ + max = _dp_h[i]; \ + left_max_i = right_max_i = _qi[i]; \ + } else if (_dp_h[i] == max) { \ + right_max_i = _qi[i]; \ } \ } \ } -#define simd_abpoa_ada_max_i { \ - /* set max_pos_left/right for next nodes */ \ - int out_i = max_i + 1; \ - for (i = 0; i < graph->node[node_id].out_edge_n; ++i) { \ - int out_node_id = graph->node[node_id].out_id[i]; \ - if (out_i > graph->node_id_to_max_pos_right[out_node_id]) graph->node_id_to_max_pos_right[out_node_id] = out_i; \ - if (out_i < graph->node_id_to_max_pos_left[out_node_id]) graph->node_id_to_max_pos_left[out_node_id] = out_i; \ - } \ +#define simd_abpoa_ada_max_i { \ + /* set max_pos_left/right for next nodes */ \ + for (i = 0; i < graph->node[node_id].out_edge_n; ++i) { \ + int out_node_id = graph->node[node_id].out_id[i]; \ + if (right_max_i+1 > graph->node_id_to_max_pos_right[out_node_id]) \ + graph->node_id_to_max_pos_right[out_node_id] = right_max_i+1; \ + if (left_max_i+1 < graph->node_id_to_max_pos_left[out_node_id]) \ + graph->node_id_to_max_pos_left[out_node_id] = left_max_i+1; \ + } \ } // TODO end_bonus for extension @@ -1077,11 +1093,11 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; simd_abpoa_lg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub); \ if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater); \ - set_global_max_score(max, dp_i, max_i); \ + set_global_max_score(max, dp_i, left_max_i); \ } \ if (abpt->align_mode == ABPOA_EXTEND_MODE) { \ simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater); \ - set_extend_max_score(max, dp_i, max_i); \ + set_extend_max_score(max, dp_i, right_max_i); \ } \ if (abpt->wb >= 0) { \ if (abpt->align_mode == ABPOA_GLOBAL_MODE) { \ @@ -1092,7 +1108,11 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } \ if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_H, dp_sn); \ res->best_score = best_score; \ - /* simd_abpoa_print_lg_matrix(score_t, beg_index, end_index); printf("best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); */ \ + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) { \ + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) \ + simd_abpoa_print_lg_matrix(score_t, beg_index, end_index); \ + fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); \ + } \ if (abpt->ret_cigar) simd_abpoa_lg_backtrack(score_t); \ simd_abpoa_free_var; SIMDFree(GAP_E1S); \ } @@ -1107,10 +1127,10 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; simd_abpoa_ag_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual); \ if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater); \ - set_global_max_score(max, dp_i, max_i); \ + set_global_max_score(max, dp_i, left_max_i); \ } else if (abpt->align_mode == ABPOA_EXTEND_MODE) { \ simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater); \ - set_extend_max_score(max, dp_i, max_i); \ + set_extend_max_score(max, dp_i, right_max_i); \ } \ if (abpt->wb >= 0) { \ if (abpt->align_mode == ABPOA_GLOBAL_MODE) { \ @@ -1121,7 +1141,11 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; } \ if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_HEF, 3*dp_sn); \ res->best_score = best_score; \ - /* simd_abpoa_print_ag_matrix(score_t, beg_index, end_index); fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); */ \ + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) { \ + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) \ + simd_abpoa_print_ag_matrix(score_t, beg_index, end_index); \ + fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); \ + } \ if (abpt->ret_cigar) simd_abpoa_ag_backtrack(score_t); \ simd_abpoa_free_var; SIMDFree(GAP_E1S); \ } @@ -1136,10 +1160,10 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; simd_abpoa_cg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual); \ if (abpt->align_mode == ABPOA_LOCAL_MODE) { \ simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater); \ - set_global_max_score(max, dp_i, max_i); \ + set_global_max_score(max, dp_i, left_max_i); \ } else if (abpt->align_mode == ABPOA_EXTEND_MODE) { \ simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater); \ - set_extend_max_score(max, dp_i, max_i); \ + set_extend_max_score(max, dp_i, right_max_i); \ } \ if (abpt->wb >= 0) { \ if (abpt->align_mode == ABPOA_GLOBAL_MODE) { \ @@ -1148,11 +1172,13 @@ SIMD_para_t _simd_p64 = {128, 64, 1, 2, 16, -1}; simd_abpoa_ada_max_i; \ } \ } \ - /* printf("dp_sn: %d\n", tot_dp_sn); */ \ if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_H2E2F, 5*dp_sn); \ res->best_score = best_score; \ - /* simd_abpoa_print_cg_matrix(score_t, beg_index, end_index); */ \ - /* fprintf(stderr,"best_score: (%d, %d) -> %d\n",best_i,best_j,best_score); */ \ + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) { \ + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) \ + simd_abpoa_print_cg_matrix(score_t, beg_index, end_index); \ + fprintf(stderr,"best_score: (%d, %d) -> %d\n",best_i, best_j, best_score); \ + } \ if (abpt->ret_cigar) simd_abpoa_cg_backtrack(score_t); \ simd_abpoa_free_var; SIMDFree(GAP_E1S); SIMDFree(GAP_E2S); \ } @@ -1191,7 +1217,8 @@ int simd_abpoa_realloc(abpoa_t *ab, int gn, int qlen, abpoa_para_t *abpt, SIMD_p // err_func_format_printf(__func__, "Warning: Graph is too large or query is too long.\n"); // return 1; // } - // fprintf(stderr, "%lld, %lld, %lld\n", (long long)gn, (long long)ab->abm->s_msize, (long long)s_msize); + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) + fprintf(stderr, "realloc: graph_node %lld, qlen: %d, (%lld, %lld)\n", (long long)gn, qlen, (long long)ab->abm->s_msize, (long long)s_msize); if (s_msize > ab->abm->s_msize) { if (ab->abm->s_mem) SIMDFree(ab->abm->s_mem); kroundup64(s_msize); ab->abm->s_msize = s_msize; @@ -1208,7 +1235,7 @@ int simd_abpoa_realloc(abpoa_t *ab, int gn, int qlen, abpoa_para_t *abpt, SIMD_p return 0; } -void abpoa_init_var(abpoa_para_t *abpt, uint8_t *query, int qlen, SIMDi *qp, SIMDi *qi, int *mat, int qp_sn, int pn, SIMDi SIMD_INF_MIN) { +void abpoa_init_var(abpoa_para_t *abpt, uint8_t *query, int qlen, SIMDi *qp, SIMDi *qi, int *mat, int64_t qp_sn, int pn, SIMDi SIMD_INF_MIN) { int i, j, k; int32_t *_qi; /* generate the query profile */ for (i = 0; i < qp_sn * abpt->m; ++i) qp[i] = SIMD_INF_MIN; @@ -1225,7 +1252,10 @@ void abpoa_init_var(abpoa_para_t *abpt, uint8_t *query, int qlen, SIMDi *qp, SIM } } -void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_map, int beg_node_id, int end_node_id, int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int pn, int qlen, int w, int dp_sn, SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, int32_t inf_min, int gap_open1, int gap_ext1, int gap_open2, int gap_ext2, int gap_oe1, int gap_oe2) { +void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_map, int beg_node_id, int end_node_id, + int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int pn, int qlen, int w, int64_t dp_sn, + SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, int32_t inf_min, + int gap_open1, int gap_ext1, int gap_open2, int gap_ext2, int gap_oe1, int gap_oe2) { int i, _end_sn; if (abpt->wb >= 0) { graph->node_id_to_max_pos_left[beg_node_id] = graph->node_id_to_max_pos_right[beg_node_id] = 0; @@ -1234,12 +1264,12 @@ void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_ if (index_map[abpoa_graph_node_id_to_index(graph, out_id)]) graph->node_id_to_max_pos_left[out_id] = graph->node_id_to_max_pos_right[out_id] = 1; } - dp_beg[0] = GET_AD_DP_BEGIN(graph, w, beg_node_id, end_node_id, qlen), dp_end[0] = GET_AD_DP_END(graph, w, beg_node_id, end_node_id, qlen); + dp_beg[0] = 0; // GET_AD_DP_BEGIN(graph, w, beg_node_id, end_node_id, qlen) + dp_end[0] = GET_AD_DP_END(graph, w, beg_node_id, end_node_id, qlen); } else { dp_beg[0] = 0, dp_end[0] = qlen; } dp_beg_sn[0] = (dp_beg[0])/pn; dp_end_sn[0] = (dp_end[0])/pn; - dp_beg[0] = dp_beg_sn[0] * pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1; SIMDi *dp_h = DP_H2E2F; SIMDi *dp_e1 = dp_h + dp_sn; SIMDi *dp_e2 = dp_e1 + dp_sn, *dp_f1 = dp_e2 + dp_sn, *dp_f2 = dp_f1 + dp_sn; _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1); @@ -1249,40 +1279,37 @@ void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_ int32_t *_dp_h = (int32_t*)dp_h, *_dp_e1 = (int32_t*)dp_e1, *_dp_e2 = (int32_t*)dp_e2, *_dp_f1 = (int32_t*)dp_f1, *_dp_f2 = (int32_t*)dp_f2; _dp_h[0] = 0; _dp_e1[0] = -(gap_oe1); _dp_e2[0] = -(gap_oe2); _dp_f1[0] = _dp_f2[0] = inf_min; for (i = 1; i <= dp_end[0]; ++i) { /* no SIMD parallelization */ - _dp_f1[i] = -(gap_open1 + gap_ext1 * i); - _dp_f2[i] = -(gap_open2 + gap_ext2 * i); - _dp_h[i] = MAX_OF_TWO(_dp_f1[i], _dp_f2[i]); // -MIN_OF_TWO(gap_open1+gap_ext1*i, gap_open2+gap_ext2*i); + _dp_f1[i] = -(gap_open1 + gap_ext1 * i); _dp_f2[i] = -(gap_open2 + gap_ext2 * i); + _dp_h[i] = MAX_OF_TWO(_dp_f1[i], _dp_f2[i]); } } -int abpoa_max(SIMDi SIMD_INF_MIN, SIMDi zero, int inf_min, SIMDi *dp_h, SIMDi *qi, int qlen, int pn, int beg_sn, int end_sn) { +void abpoa_max(SIMDi SIMD_INF_MIN, SIMDi zero, int inf_min, SIMDi *dp_h, SIMDi *qi, int qlen, int pn, int beg, int end, int *left_max_i, int *right_max_i) { /* select max dp_h */ - int max = inf_min, max_i = -1, i; - SIMDi a = dp_h[end_sn], b = qi[end_sn]; - if (end_sn == qlen / pn) SIMDSetIfGreateri32(a, zero, b, SIMD_INF_MIN, a); - for (i = beg_sn; i < end_sn; ++i) { - SIMDGetIfGreateri32(b, a, dp_h[i], a, qi[i], b); - } - int32_t *_dp_h = (int32_t*)&a, *_qi = (int32_t*)&b; - for (i = 0; i < pn; ++i) { + int max = inf_min, i; *left_max_i = *right_max_i = -1; + int32_t *_dp_h = (int32_t*)dp_h, *_qi = (int32_t*)qi; + for (i = beg; i <= end; ++i) { if (_dp_h[i] > max) { - max = _dp_h[i]; max_i = _qi[i]; + max = _dp_h[i]; *left_max_i = *right_max_i = _qi[i]; + } else if (_dp_h[i] == max) { + *right_max_i = _qi[i]; } } - return max_i; } -void abpoa_ada_max_i(int max_i, abpoa_graph_t *graph, int node_id) { +void abpoa_ada_max_i(int left_max_i, int right_max_i, abpoa_graph_t *graph, int node_id) { /* set max_pos_left/right for next nodes */ - int out_i = max_i + 1; int i; + int i; for (i = 0; i < graph->node[node_id].out_edge_n; ++i) { int out_node_id = graph->node[node_id].out_id[i]; - if (out_i > graph->node_id_to_max_pos_right[out_node_id]) graph->node_id_to_max_pos_right[out_node_id] = out_i; - if (out_i < graph->node_id_to_max_pos_left[out_node_id]) graph->node_id_to_max_pos_left[out_node_id] = out_i; + if (right_max_i+1 > graph->node_id_to_max_pos_right[out_node_id]) + graph->node_id_to_max_pos_right[out_node_id] = right_max_i+1; + if (left_max_i+1 < graph->node_id_to_max_pos_left[out_node_id]) + graph->node_id_to_max_pos_left[out_node_id] = left_max_i+1; } } -void abpoa_global_get_max(abpoa_graph_t *graph, int beg_index, int end_node_id, uint8_t *index_map, SIMDi *DP_H_HE, int dp_sn, int qlen, int *dp_end, int32_t *best_score, int *best_i, int *best_j) { +void abpoa_global_get_max(abpoa_graph_t *graph, int beg_index, int end_node_id, uint8_t *index_map, SIMDi *DP_H_HE, int64_t dp_sn, int qlen, int *dp_end, int32_t *best_score, int *best_i, int *best_j) { int in_id, in_index, dp_i, i; for (i = 0; i < graph->node[end_node_id].in_edge_n; ++i) { in_id = graph->node[end_node_id].in_id[i]; @@ -1300,26 +1327,37 @@ void abpoa_global_get_max(abpoa_graph_t *graph, int beg_index, int end_node_id, } } -int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, SIMDi *dp_f2, int **pre_index, int *pre_n, int index_i, int dp_i, abpoa_graph_t *graph, abpoa_para_t *abpt, int dp_sn, int pn, int qlen, int w, SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, SIMDi GAP_O1, SIMDi GAP_O2, SIMDi GAP_E1, SIMDi GAP_E2, SIMDi GAP_OE1, SIMDi GAP_OE2, SIMDi* GAP_E1S, SIMDi* GAP_E2S, SIMDi *PRE_MIN, SIMDi *PRE_MASK, SIMDi *SUF_MIN, int log_n, int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int end_node_id) { +int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, SIMDi *dp_f2, int inf_min, + int **pre_index, int *pre_n, int beg_index, int index_i, int dp_i, abpoa_graph_t *graph, abpoa_para_t *abpt, + int64_t dp_sn, int pn, int qlen, int w, SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, + SIMDi GAP_O1, SIMDi GAP_O2, SIMDi GAP_E1, SIMDi GAP_E2, SIMDi GAP_OE1, SIMDi GAP_OE2, SIMDi* GAP_E1S, SIMDi* GAP_E2S, + SIMDi *PRE_MIN, SIMDi *PRE_MASK, SIMDi *SUF_MIN, int log_n, + int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int end_node_id) { int tot_dp_sn = 0, i, pre_i, node_id = abpoa_graph_index_to_node_id(graph, index_i); - int min_pre_beg_sn, max_pre_end_sn, beg, end, beg_sn, end_sn, pre_end, pre_end_sn, pre_beg_sn, sn_i; + int min_pre_beg, min_pre_beg_sn, max_pre_end_sn, beg, end, beg_sn, end_sn, pre_end, pre_end_sn, pre_beg_sn, sn_i; if (abpt->wb < 0) { beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen; beg_sn = dp_beg_sn[dp_i] = beg/pn; end_sn = dp_end_sn[dp_i] = end/pn; min_pre_beg_sn = 0, max_pre_end_sn = end_sn; } else { beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen); - beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) + fprintf(stderr, "index: %d (node: %d): raw beg: %d, end: %d\n", index_i-beg_index, node_id, beg, end); + beg_sn = beg / pn; min_pre_beg = min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; for (i = 0; i < pre_n[dp_i]; ++i) { pre_i = pre_index[dp_i][i]; - if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i]; + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) + fprintf(stderr, "\tpre_i: %d, pre_dp_beg: %d, pre_dp_beg_sn: %d\n", pre_i, dp_beg[pre_i], dp_beg_sn[pre_i]); + if (min_pre_beg > dp_beg[pre_i]) min_pre_beg = dp_beg[pre_i], min_pre_beg_sn = dp_beg_sn[pre_i]; if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i]; - } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn; - dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn; - end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1; -#ifdef __DEBUG__ - fprintf(stderr, "index: %d (node: %d): beg: %d, end: %d, beg_sn: %d, end_sn: %d\n", index_i, node_id, beg, end, beg_sn, end_sn); -#endif + } + if (beg_sn < min_pre_beg_sn) { + assert(beg < min_pre_beg); beg_sn = min_pre_beg_sn; beg = min_pre_beg; + } + // NOT extend dp_beg/end based on dp_beg/end_sn + dp_beg_sn[dp_i] = beg_sn; dp_beg[dp_i] = beg; end_sn = dp_end_sn[dp_i] = end/pn; dp_end[dp_i] = end; + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) + fprintf(stderr, "index: %d (node: %d): beg: %d, end: %d, beg_sn: %d, end_sn: %d, max_left: %d, max_right: %d\n", index_i-beg_index, node_id, beg, end, beg_sn, end_sn, graph->node_id_to_max_pos_left[node_id], graph->node_id_to_max_pos_right[node_id]); } tot_dp_sn += (end_sn - beg_sn + 1); /* loop query */ @@ -1333,7 +1371,7 @@ int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, /* set M from (pre_i, q_i-1) */ if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneNi32); else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneNi32); - _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1); + _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1); // pre_end+1: match/mismatch operations for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN; for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN; for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */ @@ -1349,7 +1387,7 @@ int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, dp_e1[sn_i] = pre_dp_e1[sn_i]; dp_e2[sn_i] = pre_dp_e2[sn_i]; } - // if (index_i == 13095) debug_simd_abpoa_print_cg_matrix_row("1", int32_t, index_i); + // debug_simd_abpoa_print_cg_matrix_row("1", int32_t, index_i); // new init end /* get max m and e */ for (i = 1; i < pre_n[dp_i]; ++i) { @@ -1381,6 +1419,10 @@ int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, /* new F start */ first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneNi32), SIMDTotalBytes-SIMDShiftOneNi32); int set_num; SIMDi first2 = first;//, tmp; + int32_t *_dp_h = (int32_t*)dp_h, *_dp_e1 = (int32_t*)dp_e1, *_dp_e2 = (int32_t*)dp_e2; + // set h/e as inf_min for cells out of range: < dp_beg or > dp_end + for (i = beg_sn * pn; i < beg; ++i) _dp_h[i] = _dp_e1[i] = _dp_e2[i] = inf_min; + for (i = end+1; i < (end_sn+1)*pn; ++i) _dp_h[i] = _dp_e1[i] = _dp_e2[i] = inf_min; for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { if (sn_i < min_pre_beg_sn) { _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n"); @@ -1410,10 +1452,14 @@ int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, // SIMDSetIfEquali32(dp_e2[sn_i], dp_h[sn_i], tmp, SIMDMaxi32(SIMDSubi32(dp_e2[sn_i], GAP_E2), SIMDSubi32(dp_h[sn_i], GAP_OE2)), SIMD_INF_MIN); dp_e2[sn_i] = SIMDMaxi32(SIMDSubi32(dp_e2[sn_i], GAP_E2), SIMDSubi32(dp_h[sn_i], GAP_OE2)); } + // debug_simd_abpoa_print_cg_matrix_row("0", int32_t, index_i); return tot_dp_sn; } -void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_beg, int *dp_end, int dp_sn, int m, int *mat, int gap_ext1, int gap_ext2, int gap_oe1, int gap_oe2, int beg_index, int best_dp_i, int best_dp_j, int qlen, abpoa_graph_t *graph, abpoa_para_t *abpt, uint8_t *query, abpoa_res_t *res) { +void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_beg, int *dp_end, int64_t dp_sn, + int m, int *mat, int gap_ext1, int gap_ext2, int gap_oe1, int gap_oe2, + int beg_index, int best_dp_i, int best_dp_j, int qlen, + abpoa_graph_t *graph, abpoa_para_t *abpt, uint8_t *query, abpoa_res_t *res) { int dp_i, dp_j, k, pre_i, n_c = 0, m_c = 0, id, hit, cur_op = ABPOA_ALL_OP, _start_i, _start_j; SIMDi *dp_h; int32_t s, is_match, *_dp_h=NULL, *_dp_e1, *_dp_e2, *_pre_dp_h, *_pre_dp_e1, *_pre_dp_e2, *_dp_f1, *_dp_f2; abpoa_cigar_t *cigar = 0; @@ -1540,10 +1586,8 @@ void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_be } } } - if (hit == 0) exit(1); -#ifdef __DEBUG__ - fprintf(stderr, "%d, %d, %d\n", dp_i, dp_j, cur_op); -#endif + if (hit == 0) err_fatal_simple("Error in cg_backtrack."); + simd_output_pre_nodes(pre_index[dp_i], pre_n[dp_i], dp_i, dp_j, cur_op, abpt->verbose); } if (dp_j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, dp_j, -1, dp_j-1); /* reverse cigar */ @@ -1554,15 +1598,23 @@ void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_be /*abpoa_print_cigar(n_c, *graph_cigar, graph);*/ } -int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, int beg_index, int end_node_id, int end_index, uint8_t *index_map, int qlen, uint8_t *query, abpoa_para_t *abpt, SIMD_para_t sp, abpoa_res_t *res) { - int tot_dp_sn = 0; +int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, + int beg_node_id, int beg_index, int end_node_id, int end_index, + uint8_t *index_map, int qlen, uint8_t *query, + abpoa_para_t *abpt, SIMD_para_t sp, abpoa_res_t *res) { + int i, j; abpoa_graph_t *graph = ab->abg; abpoa_simd_matrix_t *abm = ab->abm; - int matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1; + int64_t matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1; int **pre_index, *pre_n, _pre_index, _pre_n; - int i, j, *dp_beg, *dp_beg_sn, *dp_end, *dp_end_sn, node_id, index_i, dp_i; - int beg_sn, end_sn; - int pn, log_n, size, qp_sn, dp_sn; /* pn: value per SIMDi, qp_sn/dp_sn/d_sn: segmented length*/ + int node_id, index_i, dp_i; + int pn, log_n, size; int64_t qp_sn, dp_sn; /* pn: value per SIMDi, qp_sn/dp_sn/d_sn: segmented length*/ SIMDi *dp_h, *qp, *qi; + // dp_beg/end: beg/end position of dp band in query (column) for each node (row) + // dp_beg/end_sn: beg/end VECTOR position of dp band in query (segmented column) for each node (row) + // XXX to keep consistency between different VECTOR size (SSE41/AVX2/AVX/512): + // . dp cells within dp_beg/end_sn VECTORs but not within dp_beg/end need to be set to -inf + int *dp_beg, *dp_beg_sn, *dp_end, *dp_end_sn; + int32_t best_score = sp.inf_min, inf_min = sp.inf_min; int *mat = abpt->mat, best_i = 0, best_j = 0; int32_t gap_ext1 = abpt->gap_ext1; int w = abpt->wb < 0 ? qlen : abpt->wb + (int)(abpt->wf * qlen); /* when w < 0, do whole global */ @@ -1573,12 +1625,15 @@ int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, i int32_t gap_open1 = abpt->gap_open1, gap_oe1 = gap_open1 + gap_ext1; int32_t gap_open2 = abpt->gap_open2, gap_ext2 = abpt->gap_ext2, gap_oe2 = gap_open2 + gap_ext2; SIMDi *DP_H2E2F, *dp_e1, *dp_e2, *dp_f2, *dp_f1; - SIMDi GAP_O1 = SIMDSetOnei32(gap_open1), GAP_O2 = SIMDSetOnei32(gap_open2), GAP_E1 = SIMDSetOnei32(gap_ext1), GAP_E2 = SIMDSetOnei32(gap_ext2), GAP_OE1 = SIMDSetOnei32(gap_oe1), GAP_OE2 = SIMDSetOnei32(gap_oe2); + SIMDi GAP_O1 = SIMDSetOnei32(gap_open1), GAP_O2 = SIMDSetOnei32(gap_open2); + SIMDi GAP_E1 = SIMDSetOnei32(gap_ext1), GAP_E2 = SIMDSetOnei32(gap_ext2); + SIMDi GAP_OE1 = SIMDSetOnei32(gap_oe1), GAP_OE2 = SIMDSetOnei32(gap_oe2); DP_H2E2F = qp + qp_sn * abpt->m; qi = DP_H2E2F + dp_sn * matrix_row_n * 5; // for SET_F mask[pn], suf_min[pn], pre_min[logN] SIMDi *PRE_MASK, *SUF_MIN, *PRE_MIN, *GAP_E1S, *GAP_E2S; - PRE_MASK = (SIMDi*)SIMDMalloc((pn+1) * size, size), SUF_MIN = (SIMDi*)SIMDMalloc((pn+1) * size, size), PRE_MIN = (SIMDi*)SIMDMalloc(pn * size, size), GAP_E1S = (SIMDi*)SIMDMalloc(log_n * size, size), GAP_E2S = (SIMDi*)SIMDMalloc(log_n * size, size); + PRE_MASK = (SIMDi*)SIMDMalloc((pn+1) * size, size), SUF_MIN = (SIMDi*)SIMDMalloc((pn+1) * size, size); + PRE_MIN = (SIMDi*)SIMDMalloc(pn * size, size), GAP_E1S = (SIMDi*)SIMDMalloc(log_n * size, size), GAP_E2S = (SIMDi*)SIMDMalloc(log_n * size, size); for (i = 0; i < pn; ++i) { int32_t *pre_mask = (int32_t*)(PRE_MASK + i); for (j = 0; j <= i; ++j) pre_mask[j] = -1; @@ -1612,26 +1667,34 @@ int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, i } pre_n[dp_i] = _pre_n; } - abpoa_cg_first_dp(abpt, graph, index_map, beg_node_id, end_node_id, dp_beg, dp_end, dp_beg_sn, dp_end_sn, pn, qlen, w, dp_sn, DP_H2E2F, SIMD_INF_MIN, inf_min, gap_open1, gap_ext1, gap_open2, gap_ext2, gap_oe1, gap_oe2); + abpoa_cg_first_dp(abpt, graph, index_map, beg_node_id, end_node_id, + dp_beg, dp_end, dp_beg_sn, dp_end_sn, pn, qlen, w, dp_sn, + DP_H2E2F, SIMD_INF_MIN, inf_min, gap_open1, gap_ext1, gap_open2, gap_ext2, gap_oe1, gap_oe2); for (index_i=beg_index+1, dp_i=1; index_inode[node_id].base * qp_sn; - dp_h = DP_H2E2F + (dp_i*5) * dp_sn; dp_e1 = dp_h + dp_sn; dp_e2 = dp_e1 + dp_sn; dp_f1 = dp_e2 + dp_sn; dp_f2 = dp_f1 + dp_sn; - tot_dp_sn += abpoa_cg_dp(q, dp_h, dp_e1, dp_e2, dp_f1, dp_f2, pre_index, pre_n, index_i, dp_i, graph, abpt, dp_sn, pn, qlen, w, DP_H2E2F, SIMD_INF_MIN, GAP_O1, GAP_O2, GAP_E1, GAP_E2, GAP_OE1, GAP_OE2, GAP_E1S, GAP_E2S, PRE_MIN, PRE_MASK, SUF_MIN, log_n, dp_beg, dp_end, dp_beg_sn, dp_end_sn, end_node_id); + dp_h = DP_H2E2F + (dp_i*5) * dp_sn; + dp_e1 = dp_h + dp_sn; dp_e2 = dp_e1 + dp_sn; + dp_f1 = dp_e2 + dp_sn; dp_f2 = dp_f1 + dp_sn; + abpoa_cg_dp(q, dp_h, dp_e1, dp_e2, dp_f1, dp_f2, inf_min, + pre_index, pre_n, beg_index, index_i, dp_i, + graph, abpt, dp_sn, pn, qlen, w, + DP_H2E2F, SIMD_INF_MIN, GAP_O1, GAP_O2, GAP_E1, GAP_E2, GAP_OE1, GAP_OE2, GAP_E1S, GAP_E2S, + PRE_MIN, PRE_MASK, SUF_MIN, log_n, + dp_beg, dp_end, dp_beg_sn, dp_end_sn, end_node_id); if (abpt->wb >= 0) { - beg_sn = dp_beg_sn[dp_i], end_sn = dp_end_sn[dp_i]; - int max_i = abpoa_max(SIMD_INF_MIN, zero, inf_min, dp_h, qi, qlen, pn, beg_sn, end_sn); - abpoa_ada_max_i(max_i, graph, node_id); + int left_max_i, right_max_i; + abpoa_max(SIMD_INF_MIN, zero, inf_min, dp_h, qi, qlen, pn, dp_beg[dp_i], dp_end[dp_i], &left_max_i, &right_max_i); + abpoa_ada_max_i(left_max_i, right_max_i, graph, node_id); } } - // printf("dp_sn: %d\n", tot_dp_sn); - // printf("dp_sn: %d, node_n: %d, seq_n: %d\n", tot_dp_sn, graph->node_n, qlen); abpoa_global_get_max(graph, beg_index, end_node_id, index_map, DP_H2E2F, 5*dp_sn, qlen, dp_end, &best_score, &best_i, &best_j); -#ifdef __DEBUG__ - simd_abpoa_print_cg_matrix(int32_t, beg_index, end_index); fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); -#endif + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) { + if (abpt->verbose >= ABPOA_LONG_DEBUG_VERBOSE) simd_abpoa_print_cg_matrix(int32_t, beg_index, end_index); + fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); + } res->best_score = best_score; abpoa_cg_backtrack(DP_H2E2F, pre_index, pre_n, dp_beg, dp_end, dp_sn, abpt->m, mat, gap_ext1, gap_ext2, gap_oe1, gap_oe2, beg_index, best_i, best_j, qlen, graph, abpt, query, res); for (i = 0; i < matrix_row_n; ++i) free(pre_index[i]); @@ -1644,8 +1707,10 @@ int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, i // align query to subgraph between beg_node_id and end_node_id (both are excluded) // generally: beg/end are the SRC/SINK_node int simd_abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int qlen, abpoa_res_t *res) { - // if (abpt->simd_flag == 0) err_fatal_simple("No SIMD instruction available."); - + int old_verbose = abpt->verbose; + if (abpt->verbose >= ABPOA_DEBUG_VERBOSE) fprintf(stderr, "beg_id: %d, end_id: %d, qlen: %d\n", beg_node_id, end_node_id, qlen); + // if (qlen == 8957) // for debug + // abpt->verbose = ABPOA_LONG_DEBUG_VERBOSE; int i, j, beg_index = ab->abg->node_id_to_index[beg_node_id], end_index = ab->abg->node_id_to_index[end_node_id]; int gn = end_index - beg_index + 1; uint8_t *index_map = (uint8_t*)_err_calloc(ab->abg->node_n, sizeof(uint8_t)); @@ -1660,19 +1725,16 @@ int simd_abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int b } } -#ifdef __DEBUG__ +#ifdef __SIMD_DEBUG__ _simd_p32.inf_min = MAX_OF_TWO(abpt->gap_ext1, abpt->gap_ext2) * 31 +MAX_OF_THREE(INT32_MIN + abpt->min_mis, INT32_MIN + abpt->gap_open1 + abpt->gap_ext1, INT32_MIN + abpt->gap_open2 + abpt->gap_ext2); if (simd_abpoa_realloc(ab, gn, qlen, abpt, _simd_p32)) return 0; - if (abpt->gap_mode == ABPOA_CONVEX_GAP) abpoa_cg_global_align_sequence_to_graph_core(ab, beg_node_id, beg_index, end_node_id, end_index, index_map, qlen, query, abpt, _simd_p32, res); + if (abpt->gap_mode == ABPOA_CONVEX_GAP) + abpoa_cg_global_align_sequence_to_graph_core(ab, beg_node_id, beg_index, end_node_id, end_index, index_map, qlen, query, abpt, _simd_p32, res); #else int32_t max_score, bits, mem_ret=0, gap_ext1 = abpt->gap_ext1, gap_ext2 = abpt->gap_ext2; int32_t gap_oe1 = abpt->gap_open1+gap_ext1, gap_oe2 = abpt->gap_open2+gap_ext2; - // if (abpt->simd_flag & SIMD_AVX512F && !(abpt->simd_flag & SIMD_AVX512BW)) - // max_score = INT16_MAX + 1; // AVX512F has no 8/16 bits operations - // else { int len = qlen > gn ? qlen : gn; max_score = MAX_OF_TWO(qlen * abpt->max_mat, len * abpt->gap_ext1 + abpt->gap_open1); - // } if (max_score <= INT16_MAX - abpt->min_mis - gap_oe1 - gap_oe2) { _simd_p16.inf_min = MAX_OF_THREE(INT16_MIN + abpt->min_mis, INT16_MIN + gap_oe1, INT16_MIN + gap_oe2) + 31 * MAX_OF_TWO(gap_ext1, gap_ext2); mem_ret = simd_abpoa_realloc(ab, gn, qlen, abpt, _simd_p16); @@ -1709,6 +1771,7 @@ int simd_abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int b } #endif free(index_map); + abpt->verbose = old_verbose; return 0; }