Skip to content

Commit

Permalink
add heuristics/path score for graph alignment
Browse files Browse the repository at this point in the history
  • Loading branch information
yangao07 committed Nov 25, 2024
1 parent add820b commit b0276ea
Show file tree
Hide file tree
Showing 10 changed files with 277 additions and 104 deletions.
5 changes: 4 additions & 1 deletion include/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ typedef struct {
int m; int *mat; char *mat_fn; // score matrix
int use_score_matrix; // set _mat_ based on score matrix file, then _match_/_mismatch_ is not used.
int match, max_mat, mismatch, min_mis, gap_open1, gap_open2, gap_ext1, gap_ext2; int inf_min;
int sort_input_seq_by_len; // sort input sequences by length, in descending order
int put_gap_on_right; // put indel on the left-most side of the alignment: minimap2-like, or right-most: wfa2-like
int inc_path_score; // set mismatch/match score based on node weight, i.e., # covered reads * match/mismatch
// minimizer seeding parameter
int k, w, min_w;
int wb; float wf; // extra band width
Expand All @@ -87,7 +90,7 @@ typedef struct {

typedef struct {
int node_id;
int in_edge_n, in_edge_m, *in_id;
int in_edge_n, in_edge_m, *in_id; int *in_edge_weight; // in_edge_weight: for additional path score
int out_edge_n, out_edge_m, *out_id; int *out_edge_weight; // out_edge_weight: edge-wise weight
int *read_weight, n_read, m_read; // read_weight: read-wise weight, valid when use_qv=1
uint64_t **read_ids; int read_ids_n; // for each edge
Expand Down
13 changes: 11 additions & 2 deletions src/abpoa.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@ const struct option abpoa_long_opt [] = {
{ "min-poa-win", 1, NULL, 'n' },
{ "progressive", 0, NULL, 'p'},

{ "inc-path-score", 0, NULL, 'G'},
{ "sort-by-len", 0, NULL, 'L'},
{ "gap-on-right", 0, NULL, 'R' },
{ "use-qual-weight", 0, NULL, 'Q'},
{ "amino-acid", 0, NULL, 'c'},
{ "in-list", 0, NULL, 'l' },
{ "increment", 1, NULL, 'i' },


{ "amb-strand", 0, NULL, 's' },
{ "output", 1, NULL, 'o' },
Expand Down Expand Up @@ -90,6 +92,10 @@ int abpoa_usage(void)
err_printf(" -f --extra-f FLOAT second adaptive banding parameter [%.2f]\n", ABPOA_EXTRA_F);
err_printf(" the number of extra bases added on both sites of the band is\n");
err_printf(" b+f*L, where L is the length of the aligned sequence\n");
err_printf(" Heuristics for graph alignment: (**under development**)\n");
err_printf(" -G --inc-path-score include log-scaled path score for graph alignment [False]\n");
err_printf(" -L --sort-by-len sort input sequences by length in descending order [False]\n");
err_printf(" -R --gap-on-right put indel on the right-most side of the alignment, default is left [False]\n");
// err_printf(" -z --zdrop INT Z-drop score in extension alignment [-1]\n");
// err_printf(" set as <= 0 to disable Z-drop extension\n");
// err_printf(" -e --bonus INT end bonus score in extension alignment [-1]\n");
Expand Down Expand Up @@ -156,7 +162,7 @@ int abpoa_main(char *file_fn, int is_list, abpoa_para_t *abpt){

int main(int argc, char **argv) {
int c, m, in_list=0; char *s; abpoa_para_t *abpt = abpoa_init_para();
while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QSk:w:n:i:clpso:r:g:a:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QGLRSk:w:n:i:clpso:r:g:a:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) {
switch(c)
{
case 'm': m = atoi(optarg);
Expand All @@ -169,6 +175,9 @@ int main(int argc, char **argv) {
case 'O': abpt->gap_open1 = strtol(optarg, &s, 10); if (*s == ',') abpt->gap_open2 = strtol(s+1, &s, 10); break;
case 'E': abpt->gap_ext1 = strtol(optarg, &s, 10); if (*s == ',') abpt->gap_ext2 = strtol(s+1, &s, 10); break;

case 'G': abpt->inc_path_score = 1; break;
case 'L': abpt->sort_input_seq_by_len = 1; break;
case 'R': abpt->put_gap_on_right = 1; break;
case 'b': abpt->wb = atoi(optarg); break;
case 'f': abpt->wf = atof(optarg); break;
case 'z': abpt->zdrop = atoi(optarg); break;
Expand Down
5 changes: 4 additions & 1 deletion src/abpoa.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ typedef struct {
int m; int *mat; char *mat_fn; // score matrix
int use_score_matrix; // set _mat_ based on score matrix file, then _match_/_mismatch_ is not used.
int match, max_mat, mismatch, min_mis, gap_open1, gap_open2, gap_ext1, gap_ext2; int inf_min;
int sort_input_seq_by_len; // sort input sequences by length, in descending order
int put_gap_on_right; // put indel on the left-most side of the alignment: minimap2-like, or right-most: wfa2-like
int inc_path_score; // set mismatch/match score based on node weight, i.e., # covered reads * match/mismatch
// minimizer seeding parameter
int k, w, min_w;
int wb; float wf; // extra band width
Expand All @@ -87,7 +90,7 @@ typedef struct {

typedef struct {
int node_id;
int in_edge_n, in_edge_m, *in_id;
int in_edge_n, in_edge_m, *in_id; int *in_edge_weight; // in_edge_weight: for additional path score
int out_edge_n, out_edge_m, *out_id; int *out_edge_weight; // out_edge_weight: edge-wise weight
int *read_weight, n_read, m_read; // read_weight: read-wise weight, valid when use_qv=1
uint64_t **read_ids; int read_ids_n; // for each edge
Expand Down
27 changes: 27 additions & 0 deletions src/abpoa_align.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ abpoa_para_t *abpoa_init_para(void) {
abpt->gap_mode = ABPOA_CONVEX_GAP;
abpt->zdrop = -1; // disable zdrop
abpt->end_bonus = -1; // disable end bonus

abpt->inc_path_score = 0;
abpt->sort_input_seq_by_len = 0;
abpt->put_gap_on_right = 0;

abpt->wb = ABPOA_EXTRA_B; // extra bandwidth
abpt->wf = ABPOA_EXTRA_F; // extra bandwidth

Expand Down Expand Up @@ -428,6 +433,26 @@ int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seq, char **seq_names, int
return 0;
}

// only sort new sequences
void abpoa_sort_seq_by_length(abpoa_seq_t *abs, int exist_n_seq, int n_seq) {
int i, j;
// sort by length
abpoa_seq_t *tmp_seq = (abpoa_seq_t*)calloc(1, sizeof(abpoa_seq_t));
tmp_seq->n_seq = 1; abpoa_realloc_seq(tmp_seq);
for (i = 0; i < n_seq-1; ++i) {
for (j = i+1; j < n_seq; ++j) {
if (abs->seq[exist_n_seq+i].l < abs->seq[exist_n_seq+j].l) {
// swap i and j
abpoa_cpy_abs(tmp_seq, 0, abs, exist_n_seq+i);
abpoa_cpy_abs(abs, exist_n_seq+i, abs, exist_n_seq+j);
abpoa_cpy_abs(abs, exist_n_seq+j, tmp_seq, 0);
}
}
}
// free
abpoa_free_seq(tmp_seq);
}

int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) {
if (!abpt->out_msa && !abpt->out_cons && !abpt->out_gfa) return 0;
abpoa_reset(ab, abpt, 1024);
Expand All @@ -438,6 +463,8 @@ int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) {
gzFile readfp = xzopen(read_fn, "r"); kseq_t *ks = kseq_init(readfp);
int i, j, n_seq = abpoa_read_seq(abs, ks);

if (abpt->sort_input_seq_by_len) abpoa_sort_seq_by_length(abs, exist_n_seq, n_seq);

// always reset graph before perform POA
int max_len = 0;
for (i = 0; i < abs->n_seq; ++i) {
Expand Down
48 changes: 45 additions & 3 deletions src/abpoa_graph.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ void abpoa_set_graph_node(abpoa_graph_t *abg, int node_i) {
void abpoa_free_node(abpoa_node_t *node, int n) {
int i, j;
for (i = 0; i < n; ++i) {
if (node[i].in_edge_m > 0) free(node[i].in_id);
if (node[i].in_edge_m > 0) {
free(node[i].in_id); free(node[i].in_edge_weight);
}
if (node[i].out_edge_m > 0) {
free(node[i].out_id); free(node[i].out_edge_weight);
if (node[i].read_ids_n > 0) {
Expand All @@ -45,7 +47,9 @@ void abpoa_free_node(abpoa_node_t *node, int n) {
// 0: in_edge, 1: out_edge
abpoa_graph_t *abpoa_realloc_graph_edge(abpoa_graph_t *abg, int io, int id, int use_read_ids) {
if (io == 0) {
_uni_realloc(abg->node[id].in_id, abg->node[id].in_edge_n, abg->node[id].in_edge_m, int);
int m = abg->node[id].in_edge_m;
_uni_realloc(abg->node[id].in_id, abg->node[id].in_edge_n, m, int);
_uni_realloc(abg->node[id].in_edge_weight, abg->node[id].in_edge_n, abg->node[id].in_edge_m, int);
} else {
int edge_m = abg->node[id].out_edge_m;
if (edge_m <= 0) {
Expand Down Expand Up @@ -273,6 +277,14 @@ void abpoa_BFS_set_node_remain(abpoa_graph_t *abg, int src_id, int sink_id) {
err_fatal_simple("Failed to set node remain.");
}

// TODO
void abpoa_merge_nodes(abpoa_graph_t *abg, int src_id, int sink_id) {
if (abg->node_n <= 0) {
err_func_format_printf(__func__, "Empty graph.\n");
return;
}
}

// 1. index_to_node_id
// 2. node_id_to_index
// 3. node_id_to_rank
Expand All @@ -287,7 +299,7 @@ void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt) {
// fprintf(stderr, "node_n: %d, index_rank_m: %d\n", node_n, abg->index_rank_m);
abg->index_to_node_id = (int*)_err_realloc(abg->index_to_node_id, abg->index_rank_m * sizeof(int));
abg->node_id_to_index = (int*)_err_realloc(abg->node_id_to_index, abg->index_rank_m * sizeof(int));
if (abpt->out_msa || abpt->max_n_cons > 1 || abpt->cons_algrm == ABPOA_MF)
if (abpt->out_msa || abpt->max_n_cons > 1 || abpt->cons_algrm == ABPOA_MF)
abg->node_id_to_msa_rank = (int*)_err_realloc(abg->node_id_to_msa_rank, abg->index_rank_m * sizeof(int));
if (abpt->wb >= 0) {
abg->node_id_to_max_pos_left = (int*)_err_realloc(abg->node_id_to_max_pos_left, abg->index_rank_m * sizeof(int));
Expand Down Expand Up @@ -374,6 +386,24 @@ void abpoa_set_msa_rank(abpoa_graph_t *abg, int src_id, int sink_id) {
}
}

int abpoa_get_node_weight(abpoa_graph_t *abg, int node_id) {
int i, weight = 0;
for (i = 0; i < abg->node[node_id].out_edge_n; ++i) {
weight += abg->node[node_id].out_edge_weight[i];
}
return weight;
}

int abpoa_get_incre_path_score(abpoa_graph_t *abg, int node_id, int in_id_idx) {
if (in_id_idx < 0 || in_id_idx >= abg->node[node_id].in_edge_n) err_fatal(__func__, "Unexpected in_id_idx: %d.", in_id_idx);
int pre_node_id = abg->node[node_id].in_id[in_id_idx];
int node_w = abpoa_get_node_weight(abg, pre_node_id);
int edge_w = abg->node[node_id].in_edge_weight[in_id_idx];
if (node_w == 0 || edge_w == 0) return 0;
int score = round(log((double)edge_w/(double)node_w));
return MAX_OF_TWO(score, -20); // -20 is the minimum score
}

int abpoa_get_aligned_id(abpoa_graph_t *abg, int node_id, uint8_t base) {
int i, aln_id;
abpoa_node_t *node = abg->node;
Expand Down Expand Up @@ -424,6 +454,15 @@ int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_e
int out_edge_i = -1;
if (check_edge) {
int i;
// in edge
for (i = 0; i < abg->node[to_id].in_edge_n; ++i) {
if (abg->node[to_id].in_id[i] == from_id) { // edge exists
abg->node[to_id].in_edge_weight[i] += w; // update weight on existing edge
// update label id
break;
}
}
// out edge
for (i = 0; i < out_edge_n; ++i) {
if (abg->node[from_id].out_id[i] == to_id) { // edge exists
abg->node[from_id].out_edge_weight[i] += w; // update weight on existing edge
Expand All @@ -440,6 +479,7 @@ int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_e
/// in edge
abpoa_realloc_graph_edge(abg, 0, to_id, 0);
abg->node[to_id].in_id[abg->node[to_id].in_edge_n] = from_id;
abg->node[to_id].in_edge_weight[abg->node[to_id].in_edge_n] = w;
++abg->node[to_id].in_edge_n;
/// out edge
abpoa_realloc_graph_edge(abg, 1, from_id, add_read_id);
Expand Down Expand Up @@ -498,6 +538,7 @@ void abpoa_add_graph_sequence(abpoa_graph_t *abg, uint8_t *seq, int *weight, int

abpoa_add_graph_edge(abg, last_node_id, ABPOA_SINK_NODE_ID, 0, weight[seq_l-1], add_read_id, add_read_weight, read_id, read_ids_n, tot_read_n);
abg->is_called_cons = abg->is_set_msa_rank = abg->is_topological_sorted = 0;
// abpoa_merge_nodes(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID);
// abpoa_topological_sort(abg, abpt);
}

Expand Down Expand Up @@ -666,6 +707,7 @@ int abpoa_add_subgraph_alignment(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_i
// abpoa_add_graph_edge(abg, last_id, end_node_id, 1-last_new, w, add_read_id&add, read_id, read_ids_n);
abpoa_add_graph_edge(abg, last_id, end_node_id, 1-last_new, weight[seq_l-1], add_read_id, add_read_weight, read_id, read_ids_n, tot_read_n);
abg->is_called_cons = abg->is_topological_sorted = 0;
// abpoa_merge_nodes(abg, beg_node_id, end_node_id);
// abpoa_topological_sort(abg, abpt);
if (_weight == NULL) free(weight);
return 0;
Expand Down
2 changes: 2 additions & 0 deletions src/abpoa_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ extern "C" {
#endif

int abpoa_get_aligned_id(abpoa_graph_t *abg, int node_id, uint8_t base);
int abpoa_get_incre_path_score(abpoa_graph_t *abg, int node_id, int in_id_idx);
int abpoa_get_node_weight(abpoa_graph_t *abg, int node_id);
void abpoa_add_graph_aligned_node(abpoa_graph_t *abg, int node_id, int aligned_id);
void abpoa_set_msa_rank(abpoa_graph_t *abg, int src_id, int sink_id);
abpoa_graph_t *abpoa_init_graph(void);
Expand Down
2 changes: 1 addition & 1 deletion src/abpoa_plot.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt) {
// out_edge
for (j = 0; j < abg->node[id].out_edge_n; ++j) {
out_id = abg->node[id].out_id[j];
fprintf(fp, "\t%s -> %s [label=\"%d\", penwidth=%d]\n", node_label[id], node_label[out_id], abg->node[id].out_edge_weight[j], abg->node[id].out_edge_weight[j]+1);
fprintf(fp, "\t%s -> %s [label=\"%d\", fontsize=20, fontcolor=red, penwidth=%d]\n", node_label[id], node_label[out_id], abg->node[id].out_edge_weight[j], abg->node[id].out_edge_weight[j]+1);
}
if (abg->node[id].aligned_node_n > 0) {
fprintf(fp, "\t{rank=same; %s ", node_label[id]);
Expand Down
15 changes: 14 additions & 1 deletion src/abpoa_seq.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,26 @@ void abpoa_free_seq(abpoa_seq_t *abs) {

void abpoa_cpy_str(abpoa_str_t *str, char *s, int l) {
if (l > 0) {
// str->s = (char*)_err_malloc(str->m * sizeof(char));
if (str->m != 0) str->s = (char*)_err_realloc(str->s, (l+1) * sizeof(char));
else str->s = (char*)_err_malloc((l+1) * sizeof(char));
str->l = l; str->m = l + 1;
str->s = (char*)_err_malloc(str->m * sizeof(char));
memcpy(str->s, s, l);
str->s[str->l] = 0;
}
}

// copy abs->seq[from_i] to abs->seq[to_i]
// copy abs->name[from_i] to abs->name[to_i]
// copy abs->comment[from_i] to abs->comment[to_i]
// copy abs->qual[from_i] to abs->qual[to_i]
void abpoa_cpy_abs(abpoa_seq_t *to_abs, int to_i, abpoa_seq_t *from_abs, int from_i) {
abpoa_cpy_str(to_abs->seq+to_i, from_abs->seq[from_i].s, from_abs->seq[from_i].l);
abpoa_cpy_str(to_abs->name+to_i, from_abs->name[from_i].s, from_abs->name[from_i].l);
abpoa_cpy_str(to_abs->comment+to_i, from_abs->comment[from_i].s, from_abs->comment[from_i].l);
abpoa_cpy_str(to_abs->qual+to_i, from_abs->qual[from_i].s, from_abs->qual[from_i].l);
}

void abpoa_cpy_seq(abpoa_seq_t *abs, int seq_i, kseq_t *kseq) {
abpoa_cpy_str(abs->seq+seq_i, kseq->seq.s, kseq->seq.l);
abpoa_cpy_str(abs->name+seq_i, kseq->name.s, kseq->name.l);
Expand Down
1 change: 1 addition & 0 deletions src/abpoa_seq.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ extern "C" {
#endif

abpoa_seq_t *abpoa_realloc_seq(abpoa_seq_t *abs);
void abpoa_cpy_abs(abpoa_seq_t *to_abs, int to_i, abpoa_seq_t *from_abs, int from_i);
void abpoa_cpy_str(abpoa_str_t *str, char *s, int l);
abpoa_seq_t *abpoa_init_seq(void);
void abpoa_free_seq(abpoa_seq_t *abs);
Expand Down
Loading

0 comments on commit b0276ea

Please sign in to comment.