From bfe4ac0a4945ed3eadf68282776fc816b299947e Mon Sep 17 00:00:00 2001 From: Yan Gao Date: Tue, 15 Mar 2022 11:45:14 +0800 Subject: [PATCH] support multiple consensus sequences --- CMakeLists.txt | 1 + MANIFEST.in | 1 + Makefile | 6 +- README.md | 136 ++++--- example.c | 88 +++-- include/abpoa.h | 42 ++- python/README.md | 17 +- python/cabpoa.pxd | 45 ++- python/example.py | 61 ++- python/pyabpoa.pyx | 126 ++++--- setup.py | 6 +- src/abpoa.c | 47 +-- src/abpoa.h | 42 ++- src/abpoa_align.c | 46 +-- src/abpoa_align.h | 2 +- src/abpoa_graph.c | 896 ++++++++------------------------------------ src/abpoa_graph.h | 4 +- src/abpoa_output.c | 899 +++++++++++++++++++++++++++++++++++++++++++++ src/abpoa_output.h | 15 + src/abpoa_plot.c | 3 +- sub_example.c | 16 +- test_data/heter.fa | 30 ++ 22 files changed, 1474 insertions(+), 1055 deletions(-) create mode 100644 src/abpoa_output.c create mode 100644 src/abpoa_output.h create mode 100644 test_data/heter.fa diff --git a/CMakeLists.txt b/CMakeLists.txt index e54f1e2..9e4b642 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") add_library(abpoa src/abpoa_align.c src/abpoa_graph.c + src/abpoa_output.c src/abpoa_plot.c src/abpoa_seed.c src/abpoa_seq.c diff --git a/MANIFEST.in b/MANIFEST.in index bccaa53..26d8a80 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include src/abpoa_align.h include src/abpoa_graph.h include src/abpoa.h +include src/abpoa_output.h include src/abpoa_seed.h include src/abpoa_seq.h include src/kalloc.h diff --git a/Makefile b/Makefile index 48a111a..1284a68 100644 --- a/Makefile +++ b/Makefile @@ -45,9 +45,9 @@ LIB_DIR = $(OUT_PRE_DIR)/lib INC_DIR = ./include SRC_DIR = ./src -SOURCE = $(SRC_DIR)/abpoa_align.c $(SRC_DIR)/abpoa.c $(SRC_DIR)/abpoa_graph.c $(SRC_DIR)/abpoa_plot.c $(SRC_DIR)/abpoa_seed.c $(SRC_DIR)/abpoa_seq.c $(SRC_DIR)/kalloc.c $(SRC_DIR)/kstring.c $(SRC_DIR)/simd_abpoa_align.c $(SRC_DIR)/simd_check.c $(SRC_DIR)/utils.c -HEADER = $(SRC_DIR)/abpoa_align.h $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/abpoa.h $(SRC_DIR)/abpoa_seed.h $(SRC_DIR)/abpoa_seq.h $(SRC_DIR)/kalloc.h $(SRC_DIR)/kdq.h $(SRC_DIR)/khash.h $(SRC_DIR)/kseq.h $(SRC_DIR)/ksort.h $(SRC_DIR)/kstring.h $(SRC_DIR)/kvec.h $(SRC_DIR)/simd_instruction.h $(SRC_DIR)/simd_abpoa_align.h $(SRC_DIR)/utils.h -OBJS = $(SRC_DIR)/abpoa_align.o $(SRC_DIR)/abpoa_graph.o $(SRC_DIR)/abpoa_plot.o $(SRC_DIR)/abpoa_seed.o $(SRC_DIR)/abpoa_seq.o $(SRC_DIR)/kalloc.o $(SRC_DIR)/kstring.o $(SRC_DIR)/simd_abpoa_align.o $(SRC_DIR)/simd_check.o $(SRC_DIR)/utils.o +SOURCE = $(SRC_DIR)/abpoa_align.c $(SRC_DIR)/abpoa.c $(SRC_DIR)/abpoa_graph.c $(SRC_DIR)/abpoa_plot.c $(SRC_DIR)/abpoa_seed.c $(SRC_DIR)/abpoa_seq.c $(SRC_DIR)/abpoa_output.c $(SRC_DIR)/kalloc.c $(SRC_DIR)/kstring.c $(SRC_DIR)/simd_abpoa_align.c $(SRC_DIR)/simd_check.c $(SRC_DIR)/utils.c +HEADER = $(SRC_DIR)/abpoa_align.h $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/abpoa.h $(INC_DIR)/abpoa.h $(SRC_DIR)/abpoa_seed.h $(SRC_DIR)/abpoa_seq.h $(SRC_DIR)/abpoa_output.h $(SRC_DIR)/kalloc.h $(SRC_DIR)/kdq.h $(SRC_DIR)/khash.h $(SRC_DIR)/kseq.h $(SRC_DIR)/ksort.h $(SRC_DIR)/kstring.h $(SRC_DIR)/kvec.h $(SRC_DIR)/simd_instruction.h $(INC_DIR)/simd_instruction.h $(SRC_DIR)/simd_abpoa_align.h $(SRC_DIR)/utils.h +OBJS = $(SRC_DIR)/abpoa_align.o $(SRC_DIR)/abpoa_graph.o $(SRC_DIR)/abpoa_plot.o $(SRC_DIR)/abpoa_seed.o $(SRC_DIR)/abpoa_seq.o $(SRC_DIR)/abpoa_output.o $(SRC_DIR)/kalloc.o $(SRC_DIR)/kstring.o $(SRC_DIR)/simd_abpoa_align.o $(SRC_DIR)/simd_check.o $(SRC_DIR)/utils.o # SIMD label SIMD_CHECK_D = -D __CHECK_SIMD_MAIN__ diff --git a/README.md b/README.md index 64bb426..23c015a 100644 --- a/README.md +++ b/README.md @@ -8,20 +8,16 @@ [![Build Status](https://img.shields.io/travis/yangao07/abPOA/master.svg?label=Master)](https://travis-ci.org/yangao07/abPOA) [![License](https://img.shields.io/badge/License-MIT-black.svg)](https://github.com/yangao07/abPOA/blob/master/LICENSE) -## Updates (v1.3.0) +## Updates (v1.4.0) -- Use SIMDE to handle SIMD instructions -- Set seeding as disabled by default, -S/--seeding to enable seeding, remove -N/--no-seeding -- Set min. size of POA windows as 500 in seeding mode -- Add -c/--amio-acid to align amino acid input sequences -- Add `is_aa` parameter in pyabpoa to align amino acid input sequences -- Fix CMakeLists.txt +- Allow to generate multiple consensus sequences from graph alignment (-d) +- Output sequence name by default in MSA format, remove -A/--msa-header ## Getting started Download the [latest release](https://github.com/yangao07/abPOA/releases): ``` -wget https://github.com/yangao07/abPOA/releases/download/v1.3.0/abPOA-v1.3.0.tar.gz -tar -zxvf abPOA-v1.3.0.tar.gz && cd abPOA-v1.3.0 +wget https://github.com/yangao07/abPOA/releases/download/v1.4.0/abPOA-v1.4.0.tar.gz +tar -zxvf abPOA-v1.4.0.tar.gz && cd abPOA-v1.4.0 ``` Make from source and run with test data: ``` @@ -40,18 +36,23 @@ abpoa ./test_data/seq.fa > cons.fa - [Building abPOA from source files](#build) - [Pre-built binary executable file for Linux/Unix](#binary) - [General usage](#usage) - - [To generate consensus sequence](#gen_cons) + - [To generate one consensus sequence](#gen_1cons) + - [To generate multiple consensus sequences](#gen_mcons) - [To generate row-column multiple sequence alignment](#gen_msa) - [To generate graph information in GFA format](#gen_gfa) - [To align sequence to an existing graph in GFA/MSA format](#aln_to_gfa) - [To generate a plot of the alignment graph](#gen_plot) -- [Commands and options](#cmd) - [Input](#input) - [Output](#output) - [Consensus sequence](#cons) - [Row-column multiple sequence alignment](#msa) - [Full graph information](#gfa) - [Plot of alignment graph](#plot) +- [Algorithm description](#description) + - [Adaptive banding](#banding) + - [Minimizer-based seeding and partition](#seeding) + - [Minimizer-based progressive tree](#tree) + - [Multiple conensus sequences](#mcons) - [For development](#dev) - [Evaluation datasets](#eval) - [Contact](#contact) @@ -84,9 +85,9 @@ You can also build abPOA from source files. Make sure you have gcc (>=6.4.0) and zlib installed before compiling. It is recommended to download the [latest release](https://github.com/yangao07/abPOA/releases). ``` -wget https://github.com/yangao07/abPOA/releases/download/v1.3.0/abPOA-v1.3.0.tar.gz -tar -zxvf abPOA-v1.3.0.tar.gz -cd abPOA-v1.3.0; make +wget https://github.com/yangao07/abPOA/releases/download/v1.4.0/abPOA-v1.4.0.tar.gz +tar -zxvf abPOA-v1.4.0.tar.gz +cd abPOA-v1.4.0; make ``` Or, you can use `git clone` command to download the source code. This gives you the latest version of abPOA, which might be still under development. @@ -98,17 +99,23 @@ cd abPOA; make ### Pre-built binary executable file for Linux/Unix If you meet any compiling issue, please try the pre-built binary file: ``` -wget https://github.com/yangao07/abPOA/releases/download/v1.3.0/abPOA-v1.3.0_x64-linux.tar.gz -tar -zxvf abPOA-v1.3.0_x64-linux.tar.gz +wget https://github.com/yangao07/abPOA/releases/download/v1.4.0/abPOA-v1.4.0_x64-linux.tar.gz +tar -zxvf abPOA-v1.4.0_x64-linux.tar.gz ``` ## General usage -### To generate consensus sequence +### To generate consensus sequence ``` abpoa seq.fa > cons.fa ``` +### To generate multiple consensus sequences + +``` +abpoa heter.fa -d2 > 2cons.fa +``` + ### To generate row-column multiple sequence alignment in PIR format ``` @@ -128,14 +135,14 @@ abpoa seq.fa -r4 > out.gfa ### To align sequence to an existing graph in GFA/MSA format ``` abpoa -i in.gfa seq.fa -r3 > out.gfa -abpoa -i in.msa seq.fa -Ar1 > out.msa +abpoa -i in.msa seq.fa -r1 > out.msa ``` For GFA input file, `S` and `P` lines are required and are used to reconstruct the alignment graph. For MSA input file, which is generally a FASTA format file, `-` in the sequence indicates the alignment gap. If you want to use abPOA to generate a MSA output file and then perform the incremental graph alignment, please do not forget `-A` to include the FASTA header of each sequence: ``` -abpoa seq1.fa -Ar1 > seq1.msa +abpoa seq1.fa -r1 > seq1.msa abpoa -i seq1.msa seq2.fa > cons.fa ``` @@ -146,61 +153,6 @@ abpoa seq.fa -g poa.png > cons.fa ``` See [Plot of alignment graph](#plot) for more details about the plot file. -## Commands and options -``` -Usage: abpoa [options] > cons.fa/msa.out/abpoa.gfa - -Options: - Alignment: - -m --aln-mode INT alignment mode [0] - 0: global, 1: local, 2: extension - -M --match INT match score [2] - -X --mismatch INT mismatch penalty [4] - -t --matrix FILE scoring matrix file, '-M' and '-X' are not used when '-t' is used [Null] - e.g., 'HOXD70.mtx, BLOSUM62.mtx' - -O --gap-open INT(,INT) gap opening penalty (O1,O2) [4,24] - -E --gap-ext INT(,INT) gap extension penalty (E1,E2) [2,1] - abPOA provides three gap penalty modes, cost of a g-long gap: - - convex (default): min{O1+g*E1, O2+g*E2} - - affine (set O2 as 0): O1+g*E1 - - linear (set O1 as 0): g*E1 - -s --amb-strand ambiguous strand mode [False] - for each input sequence, try the reverse complement if the current - alignment score is too low, and pick the strand with a higher score - Adaptive banded DP: - -b --extra-b INT first adaptive banding parameter [10] - set b as < 0 to disable adaptive banded DP - -f --extra-f FLOAT second adaptive banding parameter [0.01] - the number of extra bases added on both sites of the band is - b+f*L, where L is the length of the aligned sequence - Minimizer-based seeding and partition (only effective in global alignment mode): - -S --seeding enable seeding [False] - -k --k-mer INT minimizer k-mer size [19] - -w --window INT minimizer window size [10] - -n --min-poa-win INT min. size of window to perform POA [500] - -p --progressive build guide tree and perform progressive partial order alignment [False] - Input/Output: - -c --amino-acid input sequences are amino acid (default is nucleotide) [False] - -l --in-list input file is a list of sequence file names [False] - each line is one sequence file containing a set of sequences - which will be aligned by abPOA to generate a consensus sequence - -i --incrmnt FILE incrementally align sequences to an existing graph/MSA [Null] - graph could be in GFA or MSA format generated by abPOA - -o --output FILE ouput to FILE [stdout] - -r --result INT output result mode [0] - - 0: consensus in FASTA format - - 1: MSA in PIR format - - 2: both 0 & 1 - - 3: graph in GFA format - - 4: graph with consensus path in GFA format - - 5: consensus in FASTQ format - -A --msa-header add read ID as header of each sequence in MSA output [False] - -g --out-pog FILE dump final alignment graph to FILE (.pdf/.png) [Null] - - -h --help print this help usage information - -v --version show version number -``` - ## Input abPOA works with FASTA, FASTQ, gzip'd FASTA(.fa.gz) and gzip'd FASTQ(.fq.gz) formats. The input file is expected to contains multiple sequences which will be processed sequentially to perform the iterative @@ -254,6 +206,42 @@ The numbers inside the nodes are the node IDs. The numbers on the edges are the Make sure you have `dot` installed beforing using abPOA to generate the plot. For Linux/Unix systems: `sudo apt-get install graphviz`. +## Algorithm description +### Adaptive banding +To understand how the adaptive banding working, please refer to our [Bioinformatics paper](https://dx.doi.org/10.1093/bioinformatics/btaa963). + +### Minimizer-based seeding mode +As abPOA always allocates quadratic size of memory, for very long input sequences (>10 kb), memory usage will be a challenge. + +To solve this issue, we develop a minimizer-based seeding and partition method to split the sequence and graph with a small window. +The full POA DP matrix can be split into several smaller ones and adaptive banded POA can be performed within each small window separately. + +In more detail, abPOA extracts all the minimizers from all the input sequences, then all the minimizer hits between each pair of two sequences can be found. +For each pair of seqences, the minimizer hits are first chained together using relatively stringent criteria to make sure that no big gap exists in the chain. +This usually leads to several separated local chains of minimizer hits. +A second round of chaining is then performed on all the local minimizer chains to generate a global chain going through the entire sequence. +With this global chain, abPOA selects a series of minimizer hits as partition anchors which has at least a distance of 500 bp (by default, -n/--min-poa-win). +Within each partitioned window, abPOA performs banded partial order alignment separately and combines all the alignment results at the end. + +### Minimizer-based progressive tree +Instead of aligning all the sequences in the original order, abPOA can alternatively build a progressive tree to guide the alignment order. +The generation of the progressive tree is also based on minimizers. +For each pair of sequences, abPOA calculates their similarity score which is the Jaccard similarity of the minimizers, i.e. the number of minimizer hits divided by the total number of all minimizers from the two sequences. +With all the similarity scores (minimizer-based Jaccard similarity), abPOA builds the progressive tree in the following way: + +1. Pick the first two sequences that have the highest scores. The progressive tree set is initialized as these first two sequences. +2. For each remaining sequence, sum the scores between the remaining sequence and all the sequences from the current progressive tree set. Pick the one with the highest sum score, and push it to the progressive tree set. +3. Repeat step 2, until no sequence remains. + +Then, abPOA performs partial order alignment following the order of sequences in this progressive tree set. + +### Multiple consensus sequences +Since v1.4.0, abPOA supports generation of multiple consensus sequences (set -d/--max-num-cons as >1). + +The general underlying idea is to group input sequences into multiple clusters based on the heterozygous bases in the graph, +Then, one consensus sequence is separately generated for each cluster of input sequences. +The minimum allele frequency for each heterozygous base is 0.25 (by default, -q/--min-freq). + ## For development abPOA is not only a stand-alone tool for MSA and consensus calling, it can also work as a programming library. [example.c](example.c) shows how to use the C APIs of abPOA to take a set of sequences as input and perform MSA and consensus calling. Basically, the library file `libabpoa.a` and two header files [abpoa.h](include/abpoa.h) and [simd_instruction.h](include/simd_instruction.h) are needed to make the abPOA library work in your program. diff --git a/example.c b/example.c index 693eff2..e05c468 100644 --- a/example.c +++ b/example.c @@ -1,7 +1,7 @@ /* example.c libabpoa usage example To compile: gcc -g example.c -I ./include -L ./lib -labpoa -lz -lm -o example - or: +or: gcc -g example.c -I ./include ./lib/libabpoa.a -lz -lm -o example */ #include @@ -53,17 +53,30 @@ const char nt256_table[256] = { int main(void) { int i, j, n_seqs = 10; + // char seqs[10][100] = { + // "CGTCAATCTATCGAAGCATACGCGGGCAGAGCCGAAGACCTCGGCAATCCA", + // "CCACGTCAATCTATCGAAGCATACGCGGCAGCCGAACTCGACCTCGGCAATCAC", + // "CGTCAATCTATCGAAGCATACGCGGCAGAGCCCGGAAGACCTCGGCAATCAC", + // "CGTCAATGCTAGTCGAAGCAGCTGCGGCAGAGCCGAAGACCTCGGCAATCAC", + // "CGTCAATCTATCGAAGCATTCTACGCGGCAGAGCCGACCTCGGCAATCAC", + // "CGTCAATCTAGAAGCATACGCGGCAAGAGCCGAAGACCTCGGCCAATCAC", + // "CGTCAATCTATCGGTAAAGCATACGCTCTGTAGCCGAAGACCTCGGCAATCAC", + // "CGTCAATCTATCTTCAAGCATACGCGGCAGAGCCGAAGACCTCGGCAATC", + // "CGTCAATGGATCGAGTACGCGGCAGAGCCGAAGACCTCGGCAATCAC", + // "CGTCAATCTAATCGAAGCATACGCGGCAGAGCCGTCTACCTCGGCAATCACGT" + // }; + char seqs[10][100] = { - "CGTCAATCTATCGAAGCATACGCGGGCAGAGCCGAAGACCTCGGCAATCCA", - "CCACGTCAATCTATCGAAGCATACGCGGCAGCCGAACTCGACCTCGGCAATCAC", - "CGTCAATCTATCGAAGCATACGCGGCAGAGCCCGGAAGACCTCGGCAATCAC", - "CGTCAATGCTAGTCGAAGCAGCTGCGGCAGAGCCGAAGACCTCGGCAATCAC", - "CGTCAATCTATCGAAGCATTCTACGCGGCAGAGCCGACCTCGGCAATCAC", - "CGTCAATCTAGAAGCATACGCGGCAAGAGCCGAAGACCTCGGCCAATCAC", - "CGTCAATCTATCGGTAAAGCATACGCTCTGTAGCCGAAGACCTCGGCAATCAC", - "CGTCAATCTATCTTCAAGCATACGCGGCAGAGCCGAAGACCTCGGCAATC", - "CGTCAATGGATCGAGTACGCGGCAGAGCCGAAGACCTCGGCAATCAC", - "CGTCAATCTAATCGAAGCATACGCGGCAGAGCCGTCTACCTCGGCAATCACGT" + "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT", + "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT" }; // initialize variables @@ -89,6 +102,7 @@ int main(void) { abpt->out_cons = 1; // generate consensus sequence, set 0 to disable abpt->w = 6, abpt->k = 9; abpt->min_w = 10; // minimizer-based seeding and partition abpt->progressive_poa = 1; + abpt->max_n_cons = 2; // to generate 2 consensus sequences abpoa_post_set_para(abpt); @@ -102,49 +116,45 @@ int main(void) { bseqs[i][j] = nt4_table[(int)seqs[i][j]]; } - // 1. output to stdout + // 1. directly output to stdout fprintf(stdout, "=== output to stdout ===\n"); - // perform abpoa-msa - abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, stdout, NULL, NULL, NULL, NULL, NULL, NULL); + abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, stdout); - // 2. variables to store result - uint8_t **cons_seq; int **cons_cov, *cons_l, cons_n=0; - uint8_t **msa_seq; int msa_l=0; - - // perform abpoa-msa - ab->abs->n_seq = 0; // To re-use ab, n_seq needs to be set as 0 - abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, NULL, &cons_seq, &cons_cov, &cons_l, &cons_n, &msa_seq, &msa_l); - - fprintf(stdout, "=== output to variables ===\n"); - for (i = 0; i < cons_n; ++i) { - fprintf(stdout, ">Consensus_sequence\n"); - for (j = 0; j < cons_l[i]; ++j) - fprintf(stdout, "%c", nt256_table[cons_seq[i][j]]); - fprintf(stdout, "\n"); - } + // 2. output MSA alignment and consensus sequence stored in (abpoa_cons_t *) + abpoa_cons_t *abc = ab->abc; + fprintf(stdout, "=== stored in variables ===\n"); fprintf(stdout, ">Multiple_sequence_alignment\n"); - for (i = 0; i < n_seqs; ++i) { - for (j = 0; j < msa_l; ++j) { - fprintf(stdout, "%c", nt256_table[msa_seq[i][j]]); + for (i = 0; i < abc->n_seq; ++i) { + for (j = 0; j < abc->msa_len; ++j) { + fprintf(stdout, "%c", nt256_table[abc->msa_base[i][j]]); } fprintf(stdout, "\n"); } - if (cons_n) { - for (i = 0; i < cons_n; ++i) { - free(cons_seq[i]); free(cons_cov[i]); - } free(cons_seq); free(cons_cov); free(cons_l); - } - if (msa_l) { - for (i = 0; i < n_seqs; ++i) free(msa_seq[i]); free(msa_seq); + for (i = 0; i < abc->n_cons; ++i) { + fprintf(stdout, ">Consensus_sequence"); + if (abc->n_cons > 1) { + fprintf(stdout, "_%d ", i+1); + for (j = 0; j < abc->clu_n_seq[i]; ++j) { // output read ids for each cluster/group + fprintf(stdout, "%d", abc->clu_read_ids[i][j]); + if (j != abc->clu_n_seq[i]-1) fprintf(stdout, ","); + } + } + fprintf(stdout, "\n"); + for (j = 0; j < abc->cons_len[i]; ++j) + fprintf(stdout, "%c", nt256_table[abc->cons_base[i][j]]); + fprintf(stdout, "\n"); } /* generate DOT partial order graph plot */ abpt->out_pog = strdup("example.png"); // dump parital order graph to file if (abpt->out_pog != NULL) abpoa_dump_pog(ab, abpt); + // free seq-related variables for (i = 0; i < n_seqs; ++i) free(bseqs[i]); free(bseqs); free(seq_lens); + + // free abpoa-related variables abpoa_free(ab); abpoa_free_para(abpt); return 0; } diff --git a/include/abpoa.h b/include/abpoa.h index 84dc7a8..f8702cb 100644 --- a/include/abpoa.h +++ b/include/abpoa.h @@ -69,10 +69,10 @@ typedef struct { int zdrop, end_bonus; // from minimap2 // int simd_flag; // available SIMD instruction // alignment mode - uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_msa_header:1, out_cons:1, out_gfa:1, is_diploid:1, use_read_ids:1; - uint8_t amb_strand:1, disable_seeding:1, progressive_poa:1, out_fq:1; + uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_cons:1, out_gfa:1, out_fq:1, use_read_ids:1, amb_strand:1; + uint8_t disable_seeding:1, progressive_poa:1; char *incr_fn, *out_pog; - int align_mode, gap_mode, cons_agrm; + int align_mode, gap_mode, max_n_cons; double min_freq; // for multiploid data // char LogTable65536[65536]; @@ -82,8 +82,8 @@ typedef struct { typedef struct { int node_id; int in_edge_n, in_edge_m, *in_id; - int out_edge_n, out_edge_m, *out_id, max_out_id; int *out_weight; - uint64_t *read_ids; int read_ids_n; // for multiploid + int out_edge_n, out_edge_m, *out_id; int *out_weight; + uint64_t **read_ids; int read_ids_n; // for each edge int aligned_node_n, aligned_node_m, *aligned_node_id; // mismatch; aligned node will have same rank // int heaviest_weight, heaviest_out_id; // for consensus @@ -98,6 +98,18 @@ typedef struct { uint8_t is_topological_sorted:1, is_called_cons:1, is_set_msa_rank:1; } abpoa_graph_t; +typedef struct { + int n_cons, n_seq, msa_len; // # cons, # of total seq, length of row-column MSA (including gaps) + int *clu_n_seq; // # of reads in each read cluster/group, size: n_cons + int **clu_read_ids; // read ids for each cluster/group, size: n_cons * clu_n_seq[i] + int *cons_len; // length of each consensus sequence, size: n_cons + int **cons_node_ids; // node id of each consensus, size: n_cons * cons_len[i] + uint8_t **cons_base; // sequence base of each consensus, size: n_cons * cons_len[i] + uint8_t **msa_base; // sequence base of RC-MSA, size: (n_seq + n_cons) * msa_len + int **cons_cov; // coverage of each consensus base, size: n_cons * cons_len[i] + int **cons_phred_score; // phred score for each consensus base, size: n_cons * cons_len[i] +} abpoa_cons_t; + typedef struct { int l, m; char *s; } abpoa_str_t; @@ -117,6 +129,7 @@ typedef struct { abpoa_graph_t *abg; abpoa_seq_t *abs; abpoa_simd_matrix_t *abm; + abpoa_cons_t *abc; } abpoa_t; // init for abpoa parameters @@ -130,12 +143,12 @@ abpoa_t *abpoa_init(void); void abpoa_free(abpoa_t *ab); // perform msa -int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l); +int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp); -int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l); +int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp); // clean alignment graph -void abpoa_reset_graph(abpoa_t *ab, abpoa_para_t *abpt, int qlen); +void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen); // restore graph from GFA/FASTA file abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt); @@ -156,7 +169,7 @@ int abpoa_add_graph_node(abpoa_graph_t *abg, uint8_t base); // para: // from_id/to_id: ids of from and to nodes // check_edge: set as 1 if this edge maybe alread exist and only need to update weight, set as 0 if the edge is new -// add_read_id: set as 1 if read_id is used (to use row-column algorithm/generate MSA result/diploid consensus) +// add_read_id: set as 1 if read_id is used (to use row-column algorithm/generate MSA result/multiple consensus) // read_id: is of sequence // read_ids_n: size of read_id array, each one is 64-bit (1+(tot_read_n-1)/64) int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_edge, int w, uint8_t add_read_id, int read_id, int read_ids_n); @@ -185,16 +198,21 @@ void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt); // cons_l: store consensus sequences length // cons_n: store number of consensus sequences // Note: cons_seq and cons_l need to be freed by user. -int abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n); +void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt); +void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp); // generate column multiple sequence alignment from graph -void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***msa_seq, int *msa_l); +void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt); +void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp); // generate graph in GFA format to _out_fp_ void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp); +// output cons/msa +void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp); + // generate DOT graph plot and dump graph into PDF/PNG format file -int abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt); +void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt); #ifdef __cplusplus } diff --git a/python/README.md b/python/README.md index 2615c05..972d076 100644 --- a/python/README.md +++ b/python/README.md @@ -64,9 +64,6 @@ This constructs a multiple sequence alignment handler of pyabpoa, it accepts the * **gap_ext2**: second gap extension penalty; default: **1** * **extra_b**: first adaptive banding paremeter; set as < 0 to disable adaptive banded DP; default: **10** * **extra_f**: second adaptive banding paremete; the number of extra bases added on both sites of the band is *b+f\*L*, where *L* is the length of the aligned sequence; default : **0.01** -* **is_diploid**: set as 1 if input is diploid datal default: **0** -* **min_freq**: minimum frequency of each consensus to output for diploid datal default: **0.3** - The `msa_aligner` handler provides one method which performs multiple sequence alignment and takes four arguments: ``` @@ -76,8 +73,10 @@ pyabpoa.msa_aligner.msa(seqs, out_cons, out_msa, out_pog='', incr_fn='') * **seqs**: a list variable containing a set of input sequences; **positional** * **out_cons**: a bool variable to ask pyabpoa to generate consensus sequence; **positional** * **out_msa**: a bool variable to ask pyabpoa to generate RC-MSA; **positional** -* **out_pog**: name of a file (`.png` or `.pdf`) to store the plot of the final alignment graph; **optional**, default: **''** -* **incr_fn**: name of an existing graph (GFA) or MSA (FASTA) file, incrementally align sequence to this graph/MSA; **optional**, default: **''** +* **max_n_cons**: maximum number of consensus sequence to generate; default: **1** +* **min_freq**: minimum frequency of each consensus to output (effective when **max_n_cons** > 1); default: **0.3** +* **out_pog**: name of a file (`.png` or `.pdf`) to store the plot of the final alignment graph; default: **''** +* **incr_fn**: name of an existing graph (GFA) or MSA (FASTA) file, incrementally align sequence to this graph/MSA; default: **''** ### Class pyabpoa.msa_result ``` @@ -85,12 +84,14 @@ pyabpoa.msa_result(seq_n, cons_n, cons_len, ...) ``` This class describes the information of the generated consensus sequence and the RC-MSA. The returned result of `pyabpoa.msa_aligner.msa()` is an object of this class that has the following properties: -* **seq_n**: number of input aligned sequences -* **cons_n**: number of generated consensus sequences (generally 1, could be 2 if input is specified as diploid) +* **n_seq**: number of input aligned sequences +* **n_cons**: number of generated consensus sequences (generally 1, could be 2 or more if **max_n_cons** is set as > 1) +* **clu_n_seq**: an array of sequence cluster size * **cons_len**: an array of consensus sequence length(s) * **cons_seq**: an array of consensus sequence(s) +* **cons_cov**: an array of consensus sequence coverage for each base * **msa_len**: size of each row in the RC-MSA -* **msa_seq**: an array containing `seq_n` strings that demonstrates the RC-MSA, each consisting of one input sequence and several `-` indicating the alignment gaps. +* **msa_seq**: an array containing `n_seq`+`n_cons` strings that demonstrates the RC-MSA, each consisting of one input sequence and several `-` indicating the alignment gaps. `pyabpoa.msa_result()` has a function of `print_msa` which prints the RC-MSA to screen. diff --git a/python/cabpoa.pxd b/python/cabpoa.pxd index a60e82d..ceab261 100644 --- a/python/cabpoa.pxd +++ b/python/cabpoa.pxd @@ -59,11 +59,11 @@ cdef extern from "abpoa.h": int zdrop, end_bonus # from minimap2 # int simd_flag # available SIMD instruction # alignment mode - uint8_t ret_cigar, rev_cigar, out_msa, out_msa_header, out_cons, out_gfa, is_diploid, use_read_ids # mode: 0: global, 1: local, 2: extend - uint8_t amb_strand, disable_seeding, progressive_poa, out_fq + uint8_t ret_cigar, rev_cigar, out_msa, out_cons, out_gfa, out_fq, use_read_ids, amb_strand # mode: 0: global, 1: local, 2: extend + uint8_t disable_seeding, progressive_poa char *incr_fn char *out_pog - int align_mode, gap_mode, cons_agrm + int align_mode, gap_mode, max_n_cons double min_freq # for diploid data @@ -74,8 +74,7 @@ cdef extern from "abpoa.h": int out_edge_n, out_edge_m int *out_id int *out_weight - int max_out_id - uint64_t *read_ids + uint64_t **read_ids int read_ids_n # for diploid int aligned_node_n, aligned_node_m int *aligned_node_id # mismatch; aligned node will have same rank @@ -92,6 +91,17 @@ cdef extern from "abpoa.h": int *node_id_to_msa_rank uint8_t is_topological_sorted, is_called_cons, is_set_msa_rank + ctypedef struct abpoa_cons_t: + int n_cons, n_seq, msa_len + int *clu_n_seq + int **clu_read_ids + int *cons_len + int **cons_node_ids + uint8_t **cons_base + uint8_t **msa_base + int **cons_cov + int **cons_phred_score; + ctypedef struct abpoa_str_t: int l, m char *s @@ -111,6 +121,7 @@ cdef extern from "abpoa.h": abpoa_graph_t *abg abpoa_seq_t *abs abpoa_simd_matrix_t *abm + abpoa_cons_t *abc # init for abpoa parameters abpoa_para_t *abpoa_init_para() @@ -124,11 +135,11 @@ cdef extern from "abpoa.h": void abpoa_free(abpoa_t *ab) # do msa for a set of input sequences - int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp, uint8_t ***cons_seq, uint8_t ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l) - int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l) + int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp) + int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) # clean alignment graph - void abpoa_reset_graph(abpoa_t *ab, abpoa_para_t *abpt, int qlen) + void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen) # restore graph from GFA/MSA file abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt) @@ -153,17 +164,19 @@ cdef extern from "abpoa.h": # generate consensus sequence from graph # para: # out_fp: consensus sequence output in FASTA format, set as NULL to disable - # cons_seq, cons_l, cons_n: store consensus sequences in variables, set cons_n as NULL to disable. - # cons_seq: store consensus sequences - # cons_l: store consensus sequences length - # cons_n: store number of consensus sequences - # Note: cons_seq and cons_l need to be freed by user. - int abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n) + void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt) + void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) + + # generate column multiple sequence alignment from graph - void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***msa_seq, int *msa_l) + void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt) + void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) # generate full graph in GFA format void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) + # output to out_fp + void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) + # generate DOT graph plot - int abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt) + void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt) diff --git a/python/example.py b/python/example.py index 48e1c82..585c492 100644 --- a/python/example.py +++ b/python/example.py @@ -12,14 +12,47 @@ # gap_ext2=1 # extra_b = 10 # 1st part of extra band, -1 to disable banded DP # extra_f = 0.01 # 2nd part of eatra band. w = extra_b+extra_f*L (L is sequence length) -# end_bonus=-1 # for extension alignment -# zdrop=-1 # for extension alignment -# cons_agrm=0 # 0: heaviest bundling, 1: heaviest column -# is_diploid=0 # 1: diploid data +# max_n_cons=1 # to output at most N cons, set max_n_cons as N # min_freq=0.3 # minimum frequence of each consensus to output for diploid data # construct msa aligner a = pa.msa_aligner() + +print("==== First exmaple: 2 consensus sequences ====\n") +# for multiple consensus +seqs=[ + 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT', + 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT' + ] + +#@parameters of msa +#seqs: multiple sequences +out_cons=True # generate consensus sequence, set as False to disable +out_msa=True # generate row-column multiple sequence alignment, set as False to disable +out_pog="example1.png" # generate plot of alignment graph, set None to disable +max_n_cons = 2 + +# multiple sequence alignment for 'seqs' +res=a.msa(seqs, out_cons=out_cons, out_msa=out_msa, max_n_cons=max_n_cons, out_pog=out_pog) + +# output result +if out_cons: + for i in range(res.n_cons): + print(">Consensus_sequence_{}".format(i+1)) + print(res.cons_seq[i]) +if out_msa: + res.print_msa() + + +print("\n\n==== Second exmaple: 1 consensus sequence ====\n") seqs=[ 'CGTCAATCTATCGAAGCATACGCGGGCAGAGCCGAAGACCTCGGCAATCCA', 'CCACGTCAATCTATCGAAGCATACGCGGCAGCCGAACTCGACCTCGGCAATCAC', @@ -37,17 +70,21 @@ #seqs: multiple sequences out_cons=True # generate consensus sequence, set as False to disable out_msa=True # generate row-column multiple sequence alignment, set as False to disable -out_pog="example.png" # generate plot of alignment graph, set None to disable +out_pog="example2.png" # generate plot of alignment graph, set None to disable +max_n_cons = 1 # multiple sequence alignment for 'seqs' -res=a.msa(seqs, out_cons=out_cons, out_msa=out_msa, out_pog=out_pog) +res=a.msa(seqs, out_cons=out_cons, out_msa=out_msa, max_n_cons=max_n_cons, out_pog=out_pog) # output result if out_cons: - print(">Consensus_sequence") - for seq in res.cons_seq: - print(seq) - + for i in range(res.n_cons): + print(">Consensus_sequence_{}".format(i+1)) + print(res.cons_seq[i]) if out_msa: - print(">Multiple_sequence_alignment") - res.print_msa() + for i in range(res.n_seq): + print(">Seq_{}".format(i+1)) + print(res.msa_seq[i]) + for i in range(res.n_cons): + print(">Consensus_sequence_{}".format(i+1)) + print(res.msa_seq[res.n_seq+i]) \ No newline at end of file diff --git a/python/pyabpoa.pyx b/python/pyabpoa.pyx index b1f43e4..4a143ca 100644 --- a/python/pyabpoa.pyx +++ b/python/pyabpoa.pyx @@ -7,25 +7,34 @@ from cabpoa cimport * cdef class msa_result: - cdef int seq_n - cdef int cons_n - cdef cons_len, cons_seq # _cons_len:[int], _cons_seq:[''] + cdef int n_seq + cdef int n_cons + cdef clu_n_seq, clu_read_ids, cons_len, cons_seq, cons_cov # _cons_len:[int], _cons_seq:[''] cdef int msa_len - cdef msa_seq # _msa_seq:[''] + cdef msa_seq # _msa_seq:[''] - def __cinit__(self, seq_n, cons_n, cons_len, cons_seq, msa_len, msa_seq): - self.seq_n = seq_n - self.cons_n = cons_n + def __cinit__(self, n_seq, n_cons, clu_n_seq, clu_read_ids, cons_len, cons_seq, cons_cov, msa_len, msa_seq): + self.n_seq = n_seq + self.n_cons = n_cons + self.clu_n_seq = clu_n_seq + self.clu_read_ids = clu_read_ids self.cons_len = cons_len self.cons_seq = cons_seq + self.cons_cov = cons_cov self.msa_len = msa_len self.msa_seq = msa_seq @property - def seq_n(self): return self.seq_n + def n_seq(self): return self.n_seq @property - def cons_n(self): return self.cons_n + def n_cons(self): return self.n_cons + + @property + def clu_n_seq(self): return self.clu_n_seq + + @property + def clu_read_ids(self): return self.clu_read_ids @property def cons_len(self): return self.cons_len @@ -33,6 +42,9 @@ cdef class msa_result: @property def cons_seq(self): return self.cons_seq + @property + def cons_cov(self): return self.cons_cov + @property def msa_len(self): return self.msa_len @@ -41,16 +53,24 @@ cdef class msa_result: def print_msa(self): if not self.msa_seq: return - for s in self.msa_seq: + for i, s in enumerate(self.msa_seq): + if i < self.n_seq: + print('>Seq_{}'.format(i+1)) + else: + if self.n_cons > 1: + cons_id = '_{} {}'.format(i-self.n_seq+1, ','.join(list(map(str, self.clu_read_ids[i-self.n_seq])))) + else: + cons_id = '' + print('>Consensus_sequence{}'.format(cons_id)) print(s) - return + return def set_seq_int_dict(m): - if m == 5: # ACGTN ==> 01234, U ==> 4 + if m == 5: # ACGTN ==> 01234, U ==> 4 seqs = 'ACGUTN' ints = [0, 1, 2, 3, 3, 4] - elif m == 27: # ACGTN ==> 01234, BDEFH... ==> 56789... + elif m == 27: # ACGTN ==> 01234, BDEFH... ==> 56789... seqs = 'ACGTNBDEFHIJKLMOPQRSUVWXYZ*' ints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] else: @@ -64,13 +84,14 @@ def set_seq_int_dict(m): int2seq_dict[i] = s return seq2int_dict, int2seq_dict + cdef class msa_aligner: cdef abpoa_t *ab cdef abpoa_para_t abpt cdef seq2int_dict, int2seq_dict def __cinit__(self, aln_mode='g', is_aa=False, match=2, mismatch=4, score_matrix=b'', gap_open1=4, gap_open2=24, gap_ext1=2, gap_ext2=1, - extra_b=10, extra_f=0.01, end_bonus=-1, zdrop=-1, cons_agrm=ABPOA_HB, is_diploid=0, min_freq=0.3): + extra_b=10, extra_f=0.01): self.ab = abpoa_init() if aln_mode == 'g': @@ -106,12 +127,8 @@ cdef class msa_aligner: self.abpt.wb = extra_b self.abpt.wf = extra_f - self.abpt.end_bonus = end_bonus - self.abpt.zdrop = zdrop - - self.abpt.cons_agrm = cons_agrm - self.abpt.is_diploid = is_diploid - self.abpt.min_freq = min_freq + self.abpt.end_bonus = -1 # disable end_bonus/zdrop + self.abpt.zdrop = -1 self.seq2int_dict, self.int2seq_dict = set_seq_int_dict(self.abpt.m) @@ -123,31 +140,27 @@ cdef class msa_aligner: return self.ab != NULL - def msa(self, seqs, out_cons, out_msa, out_pog=b'', incr_fn=b''): + def msa(self, seqs, out_cons, out_msa, max_n_cons=1, min_freq=0.25, out_pog=b'', incr_fn=b''): cdef int seq_n = len(seqs) cdef int exist_n = 0 cdef int tot_n = seq_n cdef uint8_t *bseq cdef abpoa_res_t res - cdef uint8_t **cons_seq - cdef uint8_t **msa_seq - cdef int *cons_len - cdef int cons_n=0 - cdef int msa_l=0 - - _cons_len, _cons_seq, _msa_seq = [], [], [] + cdef abpoa_cons_t abc if out_cons: self.abpt.out_cons = 1 else: self.abpt.out_cons = 0 if out_msa: self.abpt.out_msa = 1 else: self.abpt.out_msa = 0 + self.abpt.max_n_cons = max_n_cons + self.abpt.min_freq = min_freq if out_pog: if isinstance(out_pog, str): out_pog = bytes(out_pog, 'utf-8') self.abpt.out_pog = out_pog else: self.abpt.out_pog = NULL abpoa_post_set_para(&self.abpt) - abpoa_reset_graph(self.ab, &self.abpt, len(seqs[0])) + abpoa_reset(self.ab, &self.abpt, len(seqs[0])) if incr_fn: if isinstance(incr_fn, str): incr_fn = bytes(incr_fn, 'utf-8') @@ -160,8 +173,6 @@ cdef class msa_aligner: self.ab[0].abs[0].n_seq += seq_n - if tot_n < 2: return msa_result(tot_n, cons_n, _cons_len, _cons_seq, msa_l, _msa_seq) - for read_i, seq in enumerate(seqs): seq_l = len(seq) bseq = malloc(seq_l * cython.sizeof(uint8_t)) @@ -174,32 +185,39 @@ cdef class msa_aligner: free(bseq) if res.n_cigar: free(res.graph_cigar) - if self.abpt.out_cons: - abpoa_generate_consensus(self.ab, &self.abpt, NULL, &cons_seq, NULL, &cons_len, &cons_n) - for i in range(cons_n): - _cons_len.append(cons_len[i]) - cons_seq1 = '' - for c in cons_seq[i][:cons_len[i]]: - if isinstance(c, bytes): c = ord(c) - cons_seq1 += self.int2seq_dict[c] - _cons_seq.append(cons_seq1) - if cons_n > 0: - for i in range(cons_n): - free(cons_seq[i]) - free(cons_seq) - free(cons_len) if self.abpt.out_msa: - abpoa_generate_rc_msa(self.ab, &self.abpt, NULL, &msa_seq, &msa_l) - for i in range(tot_n): + abpoa_generate_rc_msa(self.ab, &self.abpt) + elif self.abpt.out_cons: + abpoa_generate_consensus(self.ab, &self.abpt) + abc = self.ab[0].abc[0] + + n_cons, clu_n_seq, clu_read_ids, cons_len, cons_seq, cons_cov, msa_len, msa_seq = 0, [], [], [], [], [], 0, [] + n_cons = abc.n_cons + for i in range(n_cons): + clu_n_seq.append(abc.clu_n_seq[i]) + cons_len.append(abc.cons_len[i]) + clu_read_ids1, cons_seq1, cons_cov1 = [], '', [] + for j in range(abc.clu_n_seq[i]): + clu_read_ids1.append(abc.clu_read_ids[i][j]) + clu_read_ids.append(clu_read_ids1) + for j in range(abc.cons_len[i]): + c = abc.cons_base[i][j] + if isinstance(c, bytes): c = ord(c) + cons_seq1 += self.int2seq_dict[c] + cons_cov1.append(abc.cons_cov[i][j]) + cons_seq.append(cons_seq1) + cons_cov.append(cons_cov1) + + msa_len = abc.msa_len + if msa_len > 0: + for i in range(abc.n_seq + n_cons): msa_seq1 = '' - for c in msa_seq[i][:msa_l]: + for c in abc.msa_base[i][:msa_len]: if isinstance(c, bytes): c = ord(c) msa_seq1 += self.int2seq_dict[c] - _msa_seq.append(msa_seq1) - if msa_l > 0: - for i in range(tot_n): - free(msa_seq[i]) - free(msa_seq) + msa_seq.append(msa_seq1) + if self.abpt.out_pog: abpoa_dump_pog(self.ab, &self.abpt) - return msa_result(tot_n, int(cons_n), _cons_len, _cons_seq, int(msa_l), _msa_seq) + return msa_result(tot_n, n_cons, clu_n_seq, clu_read_ids, cons_len, cons_seq, cons_cov, msa_len, msa_seq) + diff --git a/setup.py b/setup.py index 25b7d95..fb0006d 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ src_dir='src/' inc_dir='include/' -src=[module_src, src_dir+'abpoa_align.c', src_dir+'abpoa_graph.c', src_dir+'abpoa_plot.c', src_dir+'abpoa_seed.c', src_dir+'abpoa_seq.c', src_dir+'kalloc.c', src_dir+'kstring.c', src_dir+'simd_abpoa_align.c', src_dir+'simd_check.c', src_dir+'utils.c'] +src=[module_src, src_dir+'abpoa_align.c', src_dir+'abpoa_graph.c', src_dir+'abpoa_output.c', src_dir+'abpoa_plot.c', src_dir+'abpoa_seed.c', src_dir+'abpoa_seq.c', src_dir+'kalloc.c', src_dir+'kstring.c', src_dir+'simd_abpoa_align.c', src_dir+'simd_check.c', src_dir+'utils.c'] long_description = open('python/README.md').read() @@ -52,7 +52,7 @@ description = "pyabpoa: SIMD-based partial order alignment using adaptive band", long_description = long_description, long_description_content_type="text/markdown", - version = "1.3.0.0", + version = "1.4.0", url = "https://github.com/yangao07/abPOA", author = "Yan Gao", author_email = "gaoy286@mail.sysu.edu.cn", @@ -62,7 +62,7 @@ ext_modules = [Extension("pyabpoa", sources=src, include_dirs=[inc_dir], - depends=[src_dir+'abpoa.h', src_dir+'abpoa_align.h', src_dir+'abpoa_graph.h', src_dir+'abpoa_seed.h', src_dir+'abpoa_seq.h', src_dir+'kalloc.h', src_dir+'khash.h', src_dir+'kdq.h', src_dir+'kseq.h', src_dir+'ksort.h', src_dir+'kstring.h', src_dir+'kvec.h', src_dir+'simd_abpoa_align.h', src_dir+'simd_instruction.h', src_dir+'utils.h', 'python/cabpoa.pxd'], + depends=[src_dir+'abpoa.h', src_dir+'abpoa_align.h', src_dir+'abpoa_graph.h', src_dir+'abpoa_output.h', src_dir+'abpoa_seed.h', src_dir+'abpoa_seq.h', src_dir+'kalloc.h', src_dir+'khash.h', src_dir+'kdq.h', src_dir+'kseq.h', src_dir+'ksort.h', src_dir+'kstring.h', src_dir+'kvec.h', src_dir+'simd_abpoa_align.h', src_dir+'simd_instruction.h', src_dir+'utils.h', 'python/cabpoa.pxd'], libraries = ['z', 'm', 'pthread'], extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde, simd_flag])], install_requires=['cython'], diff --git a/src/abpoa.c b/src/abpoa.c index 166cd0b..b3378ac 100644 --- a/src/abpoa.c +++ b/src/abpoa.c @@ -16,7 +16,7 @@ char PROG[20] = "abpoa"; #define _bO BOLD UNDERLINE "O" NONE #define _bA BOLD UNDERLINE "A" NONE char DESCRIPTION[100] = _ba "daptive " _bb "anded " _bP "artial " _bO "rder " _bA "lignment"; -char VERSION[20] = "1.3.0"; +char VERSION[20] = "1.4.0"; char CONTACT[30] = "gaoy286@mail.sysu.edu.cn"; const struct option abpoa_long_opt [] = { @@ -46,11 +46,9 @@ const struct option abpoa_long_opt [] = { { "amb-strand", 0, NULL, 's' }, { "output", 1, NULL, 'o' }, - { "msa-header", 0, NULL, 'A' }, { "result", 1, NULL, 'r' }, { "out-pog", 1, NULL, 'g' }, - { "cons-alg", 1, NULL, 'a' }, - { "diploid", 0, NULL, 'd', }, + { "max-num-cons", 1, NULL, 'd', }, { "min-freq", 1, NULL, 'q', }, { "help", 0, NULL, 'h' }, @@ -68,11 +66,11 @@ int abpoa_usage(void) err_printf("Usage: %s [options] > cons.fa/msa.out/abpoa.gfa\n\n", PROG); err_printf("Options:\n"); err_printf(" Alignment:\n"); - err_printf(" -m --aln-mode INT alignment mode [%d]\n", ABPOA_GLOBAL_MODE); + err_printf(" -m --aln-mode INT alignment mode [%d]\n", ABPOA_GLOBAL_MODE); err_printf(" %d: global, %d: local, %d: extension\n", ABPOA_GLOBAL_MODE, ABPOA_LOCAL_MODE, ABPOA_EXTEND_MODE); - err_printf(" -M --match INT match score [%d]\n", ABPOA_MATCH); - err_printf(" -X --mismatch INT mismatch penalty [%d]\n", ABPOA_MISMATCH); - err_printf(" -t --matrix FILE scoring matrix file, \'-M\' and \'-X\' are not used when \'-t\' is used [Null]\n"); + err_printf(" -M --match INT match score [%d]\n", ABPOA_MATCH); + err_printf(" -X --mismatch INT mismatch penalty [%d]\n", ABPOA_MISMATCH); + err_printf(" -t --matrix FILE scoring matrix file, \'-M\' and \'-X\' are not used when \'-t\' is used [Null]\n"); err_printf(" e.g., \'HOXD70.mtx, BLOSUM62.mtx\'\n"); err_printf(" -O --gap-open INT(,INT) gap opening penalty (O1,O2) [%d,%d]\n", ABPOA_GAP_OPEN1, ABPOA_GAP_OPEN2); err_printf(" -E --gap-ext INT(,INT) gap extension penalty (E1,E2) [%d,%d]\n", ABPOA_GAP_EXT1, ABPOA_GAP_EXT2); @@ -84,7 +82,7 @@ int abpoa_usage(void) err_printf(" for each input sequence, try the reverse complement if the current\n"); err_printf(" alignment score is too low, and pick the strand with a higher score\n"); err_printf(" Adaptive banded DP:\n"); - err_printf(" -b --extra-b INT first adaptive banding parameter [%d]\n", ABPOA_EXTRA_B); + err_printf(" -b --extra-b INT first adaptive banding parameter [%d]\n", ABPOA_EXTRA_B); err_printf(" set b as < 0 to disable adaptive banded DP\n"); err_printf(" -f --extra-f FLOAT second adaptive banding parameter [%.2f]\n", ABPOA_EXTRA_F); err_printf(" the number of extra bases added on both sites of the band is\n"); @@ -106,32 +104,23 @@ int abpoa_usage(void) err_printf(" -l --in-list input file is a list of sequence file names [False]\n"); err_printf(" each line is one sequence file containing a set of sequences\n"); err_printf(" which will be aligned by abPOA to generate a consensus sequence\n"); - err_printf(" -i --incrmnt FILE incrementally align sequences to an existing graph/MSA [Null]\n"); + err_printf(" -i --incrmnt FILE incrementally align sequences to an existing graph/MSA [Null]\n"); err_printf(" graph could be in GFA or MSA format generated by abPOA\n"); - err_printf(" -o --output FILE ouput to FILE [stdout]\n"); - err_printf(" -r --result INT output result mode [%d]\n", ABPOA_OUT_CONS); - // err_printf(" %d: consensus (FASTA format), %d: MSA (PIR format), %d: both 0 & 1\n", ABPOA_OUT_CONS, ABPOA_OUT_MSA, ABPOA_OUT_BOTH); + err_printf(" -o --output FILE ouput to FILE [stdout]\n"); + err_printf(" -r --result INT output result mode [%d]\n", ABPOA_OUT_CONS); err_printf(" - %d: consensus in FASTA format\n", ABPOA_OUT_CONS); err_printf(" - %d: MSA in PIR format\n", ABPOA_OUT_MSA); err_printf(" - %d: both 0 & 1\n", ABPOA_OUT_CONS_MSA); err_printf(" - %d: graph in GFA format\n", ABPOA_OUT_GFA); err_printf(" - %d: graph with consensus path in GFA format\n", ABPOA_OUT_CONS_GFA); err_printf(" - %d: consensus in FASTQ format\n", ABPOA_OUT_CONS_FQ); - err_printf(" -A --msa-header add read ID as header of each sequence in MSA output [False]\n"); - err_printf(" -g --out-pog FILE dump final alignment graph to FILE (.pdf/.png) [Null]\n\n"); + err_printf(" -d --maxnum-cons INT max. number of consensus sequence to generate [1]\n"); + err_printf(" -q --min-freq FLOAT min. frequency of each consensus sequence (only effective when -d/--num-cons > 1) [%.2f]\n", MULTIP_MIN_FREQ); + err_printf(" -g --out-pog FILE dump final alignment graph to FILE (.pdf/.png) [Null]\n\n"); err_printf(" -h --help print this help usage information\n"); err_printf(" -v --version show version number\n"); - err_printf(" Parameters under development:\n"); - err_printf(" -a --cons-alg INT algorithm to use for consensus calling [%d]\n", ABPOA_HB); - // err_printf(" %d: heaviest bundling, %d: heaviest column\n", ABPOA_HB, ABPOA_HC); - err_printf(" - %d: heaviest bundling\n", ABPOA_HB); - err_printf(" - %d: heaviest in column\n", ABPOA_HC); - err_printf(" -d --diploid input data is diploid [False]\n"); - err_printf(" -a/--cons-alg will be set as %d when input is diploid\n", ABPOA_HC); - err_printf(" and at most two consensus sequences will be generated\n"); - err_printf(" -q --min-freq FLOAT min frequency of each consensus for diploid input [%.2f]\n", DIPLOID_MIN_FREQ); err_printf("\n"); return 1; @@ -145,11 +134,11 @@ int abpoa_main(char *file_fn, int is_list, abpoa_para_t *abpt){ FILE *list_fp = fopen(file_fn, "r"); char read_fn[1024]; while (fgets(read_fn, sizeof(read_fn), list_fp)) { read_fn[strlen(read_fn)-1] = '\0'; - abpoa_msa1(ab, abpt, read_fn, stdout, NULL, NULL, NULL, NULL, NULL, NULL); + abpoa_msa1(ab, abpt, read_fn, stdout); } fclose(list_fp); } else // input file - abpoa_msa1(ab, abpt, file_fn, stdout, NULL, NULL, NULL, NULL, NULL, NULL); + abpoa_msa1(ab, abpt, file_fn, stdout); abpoa_free(ab); err_func_printf(__func__, "Real time: %.3f sec; CPU: %.3f sec; Peak RSS: %.3f GB.", realtime() - realtime0, cputime(), peakrss() / 1024.0 / 1024.0 / 1024.0); @@ -158,7 +147,7 @@ int abpoa_main(char *file_fn, int is_list, abpoa_para_t *abpt){ int main(int argc, char **argv) { int c, m, in_list=0; char *s; abpoa_para_t *abpt = abpoa_init_para(); - while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:Sk:w:n:i:clpso:Ar:g:a:dq:hv", abpoa_long_opt, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:Sk:w:n:i:clpso:r:g:d:q:hv", abpoa_long_opt, NULL)) >= 0) { switch(c) { case 'm': m = atoi(optarg); @@ -198,11 +187,9 @@ int main(int argc, char **argv) { else if (atoi(optarg) == ABPOA_OUT_CONS_FQ) abpt->out_cons = 1, abpt->out_fq = 1; else err_printf("Error: unknown output result mode: %s.\n", optarg); break; - case 'A': abpt->out_msa_header = 1; break; case 'g': abpt->out_pog= strdup(optarg); break; - case 'a': abpt->cons_agrm = atoi(optarg); break; - case 'd': abpt->is_diploid = 1; break; + case 'd': abpt->max_n_cons = atoi(optarg); break; case 'q': abpt->min_freq = atof(optarg); break; case 'h': return abpoa_usage(); diff --git a/src/abpoa.h b/src/abpoa.h index 84dc7a8..f8702cb 100644 --- a/src/abpoa.h +++ b/src/abpoa.h @@ -69,10 +69,10 @@ typedef struct { int zdrop, end_bonus; // from minimap2 // int simd_flag; // available SIMD instruction // alignment mode - uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_msa_header:1, out_cons:1, out_gfa:1, is_diploid:1, use_read_ids:1; - uint8_t amb_strand:1, disable_seeding:1, progressive_poa:1, out_fq:1; + uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_cons:1, out_gfa:1, out_fq:1, use_read_ids:1, amb_strand:1; + uint8_t disable_seeding:1, progressive_poa:1; char *incr_fn, *out_pog; - int align_mode, gap_mode, cons_agrm; + int align_mode, gap_mode, max_n_cons; double min_freq; // for multiploid data // char LogTable65536[65536]; @@ -82,8 +82,8 @@ typedef struct { typedef struct { int node_id; int in_edge_n, in_edge_m, *in_id; - int out_edge_n, out_edge_m, *out_id, max_out_id; int *out_weight; - uint64_t *read_ids; int read_ids_n; // for multiploid + int out_edge_n, out_edge_m, *out_id; int *out_weight; + uint64_t **read_ids; int read_ids_n; // for each edge int aligned_node_n, aligned_node_m, *aligned_node_id; // mismatch; aligned node will have same rank // int heaviest_weight, heaviest_out_id; // for consensus @@ -98,6 +98,18 @@ typedef struct { uint8_t is_topological_sorted:1, is_called_cons:1, is_set_msa_rank:1; } abpoa_graph_t; +typedef struct { + int n_cons, n_seq, msa_len; // # cons, # of total seq, length of row-column MSA (including gaps) + int *clu_n_seq; // # of reads in each read cluster/group, size: n_cons + int **clu_read_ids; // read ids for each cluster/group, size: n_cons * clu_n_seq[i] + int *cons_len; // length of each consensus sequence, size: n_cons + int **cons_node_ids; // node id of each consensus, size: n_cons * cons_len[i] + uint8_t **cons_base; // sequence base of each consensus, size: n_cons * cons_len[i] + uint8_t **msa_base; // sequence base of RC-MSA, size: (n_seq + n_cons) * msa_len + int **cons_cov; // coverage of each consensus base, size: n_cons * cons_len[i] + int **cons_phred_score; // phred score for each consensus base, size: n_cons * cons_len[i] +} abpoa_cons_t; + typedef struct { int l, m; char *s; } abpoa_str_t; @@ -117,6 +129,7 @@ typedef struct { abpoa_graph_t *abg; abpoa_seq_t *abs; abpoa_simd_matrix_t *abm; + abpoa_cons_t *abc; } abpoa_t; // init for abpoa parameters @@ -130,12 +143,12 @@ abpoa_t *abpoa_init(void); void abpoa_free(abpoa_t *ab); // perform msa -int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l); +int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp); -int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l); +int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp); // clean alignment graph -void abpoa_reset_graph(abpoa_t *ab, abpoa_para_t *abpt, int qlen); +void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen); // restore graph from GFA/FASTA file abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt); @@ -156,7 +169,7 @@ int abpoa_add_graph_node(abpoa_graph_t *abg, uint8_t base); // para: // from_id/to_id: ids of from and to nodes // check_edge: set as 1 if this edge maybe alread exist and only need to update weight, set as 0 if the edge is new -// add_read_id: set as 1 if read_id is used (to use row-column algorithm/generate MSA result/diploid consensus) +// add_read_id: set as 1 if read_id is used (to use row-column algorithm/generate MSA result/multiple consensus) // read_id: is of sequence // read_ids_n: size of read_id array, each one is 64-bit (1+(tot_read_n-1)/64) int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_edge, int w, uint8_t add_read_id, int read_id, int read_ids_n); @@ -185,16 +198,21 @@ void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt); // cons_l: store consensus sequences length // cons_n: store number of consensus sequences // Note: cons_seq and cons_l need to be freed by user. -int abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n); +void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt); +void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp); // generate column multiple sequence alignment from graph -void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***msa_seq, int *msa_l); +void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt); +void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp); // generate graph in GFA format to _out_fp_ void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp); +// output cons/msa +void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp); + // generate DOT graph plot and dump graph into PDF/PNG format file -int abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt); +void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt); #ifdef __cplusplus } diff --git a/src/abpoa_align.c b/src/abpoa_align.c index 265b38b..c379dcb 100644 --- a/src/abpoa_align.c +++ b/src/abpoa_align.c @@ -1,9 +1,11 @@ #include #include #include +#include "abpoa.h" #include "simd_abpoa_align.h" #include "abpoa_align.h" #include "abpoa_seq.h" +#include "abpoa_output.h" #include "utils.h" #include "abpoa_seed.h" @@ -104,10 +106,8 @@ abpoa_para_t *abpoa_init_para(void) { abpt->out_fq = 0; // output consensus sequence in fastq abpt->out_gfa = 0; // out graph in GFA format abpt->out_msa = 0; // output msa - abpt->out_msa_header = 0; // output read ID in msa output - abpt->is_diploid = 0; // diploid data - abpt->min_freq = DIPLOID_MIN_FREQ; - abpt->cons_agrm = ABPOA_HB; // consensus calling algorithm + abpt->max_n_cons = 1; // number of max. generated consensus sequence + abpt->min_freq = MULTIP_MIN_FREQ; abpt->use_read_ids = 0; abpt->incr_fn = NULL; // incrementally align seq to an existing graph abpt->out_pog = NULL; // dump partial order graph to file @@ -139,10 +139,10 @@ abpoa_para_t *abpoa_init_para(void) { void abpoa_post_set_para(abpoa_para_t *abpt) { abpoa_set_gap_mode(abpt); - if (abpt->cons_agrm == ABPOA_HC || abpt->out_msa || abpt->out_gfa || abpt->is_diploid) { + if (abpt->out_msa || abpt->out_gfa || abpt->max_n_cons > 1) { abpt->use_read_ids = 1; set_65536_table(); - if (abpt->cons_agrm == ABPOA_HC || abpt->is_diploid) set_bit_table16(); + if (abpt->max_n_cons > 1) set_bit_table16(); } if (abpt->align_mode == ABPOA_LOCAL_MODE) abpt->wb = -1; int i; @@ -202,7 +202,7 @@ int abpoa_anchor_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int *seq_l // seq-to-graph alignment and add alignment within each split window if (_i == 0) ai = 0; else ai = par_c[_i-1]; - int beg_id = ABPOA_SRC_NODE_ID, beg_qpos = 0, end_id, end_tpos, end_qpos; + int beg_id = ABPOA_SRC_NODE_ID, beg_qpos = 0, end_id=-1, end_tpos=-1, end_qpos=-1; if (ai < par_c[_i]) { abs->is_rc[read_id] = (abs->is_rc[last_read_id] ^ (par_anchors.a[ai] >> 63)); // construct rc qseq @@ -327,21 +327,21 @@ int abpoa_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int *seq_lens, in return 0; } -void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l) { +void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { + // generate & output GFA if (abpt->out_gfa) abpoa_generate_gfa(ab, abpt, out_fp); else { + // generate rc-msa/cons + if (abpt->out_msa) abpoa_generate_rc_msa(ab, abpt); if (abpt->out_cons) { - if (abpt->out_msa) abpoa_generate_consensus(ab, abpt, NULL, cons_seq, cons_cov, cons_l, cons_n); - else abpoa_generate_consensus(ab, abpt, out_fp, cons_seq, cons_cov, cons_l, cons_n); - if (ab->abg->is_called_cons == 0) - err_printf("Warning: no consensus sequence generated.\n"); - } else if (abpt->out_fq) { - abpoa_generate_consensus(ab, abpt, out_fp, cons_seq, cons_cov, cons_l, cons_n); - } - if (abpt->out_msa) { - abpoa_generate_rc_msa(ab, abpt, out_fp, msa_seq, msa_l); + abpoa_generate_consensus(ab, abpt); + if (ab->abg->is_called_cons == 0) err_printf("Warning: no consensus sequence generated.\n"); } + // output cons/rc-msa + if (abpt->out_msa) abpoa_output_rc_msa(ab, abpt, out_fp); + else if (abpt->out_cons) abpoa_output_fx_consensus(ab, abpt, out_fp); } + // plot partial-order graph using dot if (abpt->out_pog) abpoa_dump_pog(ab, abpt); } @@ -354,9 +354,9 @@ void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***cons // n_seq: number of input sequences // seq_len: array of input sequence length, size: seq_n // seqs: array of input sequences, 0123 for ACGT, size: seq_n * seq_len[] -int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seq, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l) { +int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seq, char **seq_names, int *seq_lens, uint8_t **seqs, FILE *out_fp) { if ((!abpt->out_msa && !abpt->out_cons && !abpt->out_gfa) || n_seq <= 0) return 0; - abpoa_reset_graph(ab, abpt, 1024); + abpoa_reset(ab, abpt, 1024); if (abpt->incr_fn) abpoa_restore_graph(ab, abpt); // restore existing graph abpoa_seq_t *abs = ab->abs; int i, exist_n_seq = abs->n_seq; @@ -406,13 +406,13 @@ int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seq, char **seq_names, int } // output - abpoa_output(ab, abpt, out_fp, cons_seq, cons_cov, cons_l, cons_n, msa_seq, msa_l); + abpoa_output(ab, abpt, out_fp); return 0; } -int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n, uint8_t ***msa_seq, int *msa_l) { +int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) { if (!abpt->out_msa && !abpt->out_cons && !abpt->out_gfa) return 0; - abpoa_reset_graph(ab, abpt, 1024); + abpoa_reset(ab, abpt, 1024); if (abpt->incr_fn) abpoa_restore_graph(ab, abpt); // restore existing graph abpoa_seq_t *abs = ab->abs; int exist_n_seq = abs->n_seq; @@ -460,7 +460,7 @@ int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp, uin } // output - abpoa_output(ab, abpt, out_fp, cons_seq, cons_cov, cons_l, cons_n, msa_seq, msa_l); + abpoa_output(ab, abpt, out_fp); kseq_destroy(ks); gzclose(readfp); for (i = 0; i < n_seq; ++i) free(seqs[i]); free(seqs); free(seq_lens); diff --git a/src/abpoa_align.h b/src/abpoa_align.h index 289c565..25cf462 100644 --- a/src/abpoa_align.h +++ b/src/abpoa_align.h @@ -26,7 +26,7 @@ #define ABPOA_F_OP 0x18 #define ABPOA_ALL_OP 0x1f -#define DIPLOID_MIN_FREQ 0.3 +#define MULTIP_MIN_FREQ 0.25 // start and end of each band: // range: (min_of_two(max_left, qlen-remain), max_of_two(max_right, qlen-remain)) diff --git a/src/abpoa_graph.c b/src/abpoa_graph.c index b411c72..18bd683 100644 --- a/src/abpoa_graph.c +++ b/src/abpoa_graph.c @@ -6,45 +6,6 @@ #include "simd_abpoa_align.h" #include "kdq.h" -extern char ab_char256_table[256]; -char ab_LogTable65536[65536]; -char ab_bit_table16[65536]; - -#define NAT_E 2.718281828459045 -static const char ab_LogTable256[256] = { -#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n - -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), - LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) -}; - -static inline int ilog2_32(uint32_t v) -{ - uint32_t t, tt; - if ((tt = v>>16)) return (t = tt>>8) ? 24 + ab_LogTable256[t] : 16 + ab_LogTable256[tt]; - return (t = v>>8) ? 8 + ab_LogTable256[t] : ab_LogTable256[v]; -} - -void set_65536_table(void) { - int i; - for (i = 0; i < 65536; ++i) { - ab_LogTable65536[i] = ilog2_32(i); - } -} - -void set_bit_table16(void) { - int i; ab_bit_table16[0] = 0; - for (i = 0; i != 65536; ++i) ab_bit_table16[i] = (i&1) + ab_bit_table16[i>>1]; -} - -#define get_bit_cnt4(table, b) (table[(b)&0xffff] + table[(b)>>16&0xffff] + table[(b)>>32&0xffff] + table[(b)>>48&0xffff]) - -static inline int ilog2_64(uint64_t v) { - uint64_t t, tt; - if ((tt = v >> 32)) return (t = tt >> 16) ? 48 + ab_LogTable65536[t] : 32 + ab_LogTable65536[tt]; - return (t = v>>16) ? 16 + ab_LogTable65536[t] : ab_LogTable65536[v]; -} - KDQ_INIT(int) #define kdq_int_t kdq_t(int) @@ -62,15 +23,16 @@ void abpoa_set_graph_node(abpoa_graph_t *abg, int node_i) { } void abpoa_free_node(abpoa_node_t *node, int n) { - int i; + int i, j; for (i = 0; i < n; ++i) { if (node[i].in_edge_m > 0) free(node[i].in_id); - if (node[i].out_edge_m > 0) { + if (node[i].out_edge_m > 0) { free(node[i].out_id); free(node[i].out_weight); - //if (abpt->use_read_ids) { - if (node[i].read_ids_n > 0) - free(node[i].read_ids); - //} + if (node[i].read_ids_n > 0) { + for (j = 0; j < node[i].out_edge_m; ++j) { + free(node[i].read_ids[j]); + } + } free(node[i].read_ids); } if (node[i].aligned_node_m > 0) free(node[i].aligned_node_id); } @@ -78,13 +40,34 @@ void abpoa_free_node(abpoa_node_t *node, int n) { } // 0: in_edge, 1: out_edge -abpoa_graph_t *abpoa_realloc_graph_edge(abpoa_graph_t *abg, int io, int id) { +abpoa_graph_t *abpoa_realloc_graph_edge(abpoa_graph_t *abg, int io, int id, int use_read_ids) { if (io == 0) { _uni_realloc(abg->node[id].in_id, abg->node[id].in_edge_n, abg->node[id].in_edge_m, int); } else { int edge_m = abg->node[id].out_edge_m; - _uni_realloc(abg->node[id].out_id, abg->node[id].out_edge_n, edge_m, int); - _uni_realloc(abg->node[id].out_weight, abg->node[id].out_edge_n, abg->node[id].out_edge_m, int); + if (edge_m <= 0) { + abg->node[id].out_edge_m = MAX_OF_TWO(abg->node[id].out_edge_n, 1); + abg->node[id].out_id = (int*)_err_malloc(abg->node[id].out_edge_m * sizeof(int)); + abg->node[id].out_weight = (int*)_err_malloc(abg->node[id].out_edge_m * sizeof(int)); + abg->node[id].read_ids = (uint64_t**)_err_malloc(abg->node[id].out_edge_m * sizeof(uint64_t*)); + if (abg->node[id].read_ids_n > 0) { + int i; + for (i = 0; i < abg->node[id].out_edge_m; ++i) { + abg->node[id].read_ids[i] = (uint64_t*)_err_calloc(abg->node[id].read_ids_n, sizeof(uint64_t)); + } + } + } else if (abg->node[id].out_edge_n >= edge_m) { + abg->node[id].out_edge_m = abg->node[id].out_edge_n+1; kroundup32(abg->node[id].out_edge_m); + abg->node[id].out_id = (int*)_err_realloc(abg->node[id].out_id, abg->node[id].out_edge_m * sizeof(int)); + abg->node[id].out_weight = (int*)_err_realloc(abg->node[id].out_weight, abg->node[id].out_edge_m * sizeof(int)); + abg->node[id].read_ids = (uint64_t**)_err_realloc(abg->node[id].read_ids, abg->node[id].out_edge_m * sizeof(uint64_t*)); + if (abg->node[id].read_ids_n > 0) { + int i; + for (i = edge_m; i < abg->node[id].out_edge_m; ++i) { + abg->node[id].read_ids[i] = (uint64_t*)_err_calloc(abg->node[id].read_ids_n, sizeof(uint64_t)); + } + } + } } return abg; } @@ -132,11 +115,56 @@ void abpoa_free_graph(abpoa_graph_t *abg) { free(abg); } +abpoa_cons_t *abpoa_init_cons(void) { + abpoa_cons_t *abc = (abpoa_cons_t*)_err_malloc(sizeof(abpoa_cons_t)); + abc->n_cons = 0; abc->msa_len = 0; + abc->clu_n_seq = NULL; + abc->cons_len = NULL; + abc->cons_node_ids = NULL; + abc->cons_base = NULL; + abc->msa_base = NULL; + abc->cons_cov = NULL; + abc->clu_read_ids = NULL; + abc->cons_phred_score = NULL; + return abc; +} + +void abpoa_free_cons(abpoa_cons_t *abc) { + int i; + if (abc->n_cons > 0) { + if (abc->clu_n_seq != NULL) free(abc->clu_n_seq); + if (abc->cons_len != NULL) free(abc->cons_len); + if (abc->cons_node_ids != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->cons_node_ids[i]); free(abc->cons_node_ids); + } + if (abc->cons_base != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->cons_base[i]); free(abc->cons_base); + } + if (abc->cons_cov != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->cons_cov[i]); free(abc->cons_cov); + } + if (abc->clu_read_ids != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->clu_read_ids[i]); free(abc->clu_read_ids); + } + if (abc->cons_phred_score != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->cons_phred_score[i]); free(abc->cons_phred_score); + } + } + if (abc->msa_len > 0) { + if (abc->msa_base != NULL) { + for (i = 0; i < abc->n_seq+abc->n_cons; ++i) free(abc->msa_base[i]); + free(abc->msa_base); + } + } + free(abc); +} + abpoa_t *abpoa_init(void) { abpoa_t *ab = (abpoa_t*)_err_malloc(sizeof(abpoa_t)); ab->abg = abpoa_init_graph(); ab->abs = abpoa_init_seq(); ab->abm = abpoa_init_simd_matrix(); + ab->abc = abpoa_init_cons(); return ab; } @@ -144,6 +172,7 @@ void abpoa_free(abpoa_t *ab) { abpoa_free_graph(ab->abg); abpoa_free_seq(ab->abs); abpoa_free_simd_matrix(ab->abm); + abpoa_free_cons(ab->abc); free(ab); } @@ -251,7 +280,7 @@ void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt) { // fprintf(stderr, "node_n: %d, index_rank_m: %d\n", node_n, abg->index_rank_m); abg->index_to_node_id = (int*)_err_realloc(abg->index_to_node_id, abg->index_rank_m * sizeof(int)); abg->node_id_to_index = (int*)_err_realloc(abg->node_id_to_index, abg->index_rank_m * sizeof(int)); - if (abpt->out_msa || abpt->cons_agrm == ABPOA_HC || abpt->is_diploid) + if (abpt->out_msa || abpt->max_n_cons > 1) abg->node_id_to_msa_rank = (int*)_err_realloc(abg->node_id_to_msa_rank, abg->index_rank_m * sizeof(int)); if (abpt->wb >= 0) { abg->node_id_to_max_pos_left = (int*)_err_realloc(abg->node_id_to_max_pos_left, abg->index_rank_m * sizeof(int)); @@ -338,685 +367,6 @@ void abpoa_set_msa_rank(abpoa_graph_t *abg, int src_id, int sink_id) { } } -void abpoa_set_row_column_weight(abpoa_graph_t *abg, int **rc_weight, int **msa_node_id) { - int i, k, rank, aligned_id; - uint64_t b; - for (i = 2; i < abg->node_n; ++i) { - // get msa rank - rank = abpoa_graph_node_id_to_msa_rank(abg, i); - for (k = 0; k < abg->node[i].aligned_node_n; ++k) { - aligned_id = abg->node[i].aligned_node_id[k]; - rank = MAX_OF_TWO(rank, abpoa_graph_node_id_to_msa_rank(abg, aligned_id)); - } - // assign seq - for (k = 0; k < abg->node[i].read_ids_n; ++k) { - b = abg->node[i].read_ids[k]; - rc_weight[rank-1][abg->node[i].base] += get_bit_cnt4(ab_bit_table16, b); - } - rc_weight[rank-1][4] -= rc_weight[rank-1][abg->node[i].base]; - msa_node_id[rank-1][abg->node[i].base] = i; - } -} - -void abpoa_set_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids, int **rc_weight, int **msa_node_id, int msa_l, int n_seq, int read_ids_n) { - int i, j, k, rank, aligned_id; - uint64_t b, one = 1, *whole_read_ids = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); - for (i = 0; i < n_seq; ++i) { - j = i / 64; b = i & 0x3f; - whole_read_ids[j] |= (one << b); - } - for (i = 0; i < msa_l; ++i) { - for (j = 0; j < read_ids_n; ++j) { - read_ids[i][4][j] = whole_read_ids[j]; - } - } - for (i = 2; i < abg->node_n; ++i) { - // get msa rank - rank = abpoa_graph_node_id_to_msa_rank(abg, i); - for (k = 0; k < abg->node[i].aligned_node_n; ++k) { - aligned_id = abg->node[i].aligned_node_id[k]; - rank = MAX_OF_TWO(rank, abpoa_graph_node_id_to_msa_rank(abg, aligned_id)); - } - // assign seq - for (k = 0; k < abg->node[i].read_ids_n; ++k) { - b = abg->node[i].read_ids[k]; - rc_weight[rank-1][abg->node[i].base] += get_bit_cnt4(ab_bit_table16, b); - read_ids[rank-1][abg->node[i].base][k] = b; - read_ids[rank-1][4][k] ^= b; - } - rc_weight[rank-1][4] -= rc_weight[rank-1][abg->node[i].base]; - msa_node_id[rank-1][abg->node[i].base] = i; - } - free(whole_read_ids); -} - -void abpoa_heaviest_column_multip_consensus(uint64_t ***read_ids, int **cluster_ids, int *cluster_ids_n, int multip, int read_ids_n, int msa_l, FILE *out_fp, uint8_t ***_cons_seq, int **_cons_l, int *_cons_n) { - int i, j, k, m, w, cnt, max_w, max_base, gap_w; - uint64_t *read_ids_mask = (uint64_t*)_err_malloc(read_ids_n * sizeof(uint64_t)), one=1, b; - uint8_t *cons_seq = (uint8_t*)_err_malloc(sizeof(uint8_t) * msa_l); int cons_l; - int read_id, n; - - if (_cons_n) { - *_cons_n = multip; - (*_cons_l) = (int*)_err_malloc(sizeof(int) * multip); - (*_cons_seq) = (uint8_t **)_err_malloc(sizeof(uint8_t*) * multip); - } - for (i = 0; i < multip; ++i) { - cons_l = 0; - if (out_fp) fprintf(out_fp, ">Consensus_sequence_%d_%d\n", i+1, cluster_ids_n[i]); - for (j = 0; j < read_ids_n; ++j) read_ids_mask[j] = 0; - for (j = 0; j < cluster_ids_n[i]; ++j) { - read_id = cluster_ids[i][j]; - n = read_id / 64; - read_ids_mask[n] |= (one << (read_id & 0x3f)); - } - for (j = 0; j < msa_l; ++j) { - max_w = 0, max_base = 5, gap_w = cluster_ids_n[i]; - for (k = 0; k < 4; ++k) { - w = 0; - for (m = 0; m < read_ids_n; ++m) { - b = read_ids[j][k][m] & read_ids_mask[m]; - cnt = get_bit_cnt4(ab_bit_table16, b); - w += cnt, gap_w -= cnt; - } - if (w > max_w) { - max_w = w; max_base = k; - } - } - if (max_w > gap_w) { - cons_seq[cons_l++] = max_base; - } - } - if (out_fp) { - for (j = 0; j < cons_l; ++j) fprintf(out_fp, "%c", ab_char256_table[cons_seq[j]]); fprintf(out_fp, "\n"); - } - if (_cons_n) { - (*_cons_l)[i] = cons_l; - (*_cons_seq)[i] = (uint8_t*)_err_malloc(sizeof(uint8_t) * cons_l); - for (j = 0; j < cons_l; ++j) (*_cons_seq)[i][j] = cons_seq[j]; - } - } - free(read_ids_mask); free(cons_seq); -} - -int output_consensus(int out_fq, abpoa_graph_t *abg, int src_id, int sink_id, int **cons_cov, int n_seq, FILE *out_fp) { - int cons_l = 0; - if (out_fq) fprintf(out_fp, "@Consensus_sequence\n"); - else fprintf(out_fp, ">Consensus_sequence\n"); - int id = abg->node[src_id].max_out_id; - while (id != sink_id) { - fprintf(out_fp, "%c", ab_char256_table[abg->node[id].base]); - id = abg->node[id].max_out_id; - cons_l++; - } fprintf(out_fp, "\n"); - if (out_fq) { - fprintf(out_fp, "+Consensus_sequence\n"); - if (cons_cov == NULL) err_fatal_simple("No coverage information available."); - int i, phred; double x, p; - for (i = 0; i < cons_l; ++i) { - // max q-score: 33+60=93, min score: 33+0=33 - x = 13.8 * (1.25 * cons_cov[0][i] / n_seq - 0.25); - p = 1 - 1.0 / (1.0 + pow(NAT_E, -1 * x)); - phred = 33 + (int)(-10 * log10(p) + 0.499); - fprintf(out_fp, "%c", phred); - } fprintf(out_fp, "\n"); - } - return cons_l; -} - -int abpoa_store_consensus(abpoa_graph_t *abg, int src_id, int sink_id, uint8_t ***cons_seq, int **cons_l) { - *cons_seq = (uint8_t**)_err_malloc(sizeof(uint8_t*)); - *cons_l = (int*)_err_malloc(sizeof(int)); - (*cons_seq)[0] = (uint8_t*)_err_malloc(sizeof(uint8_t) * abg->node_n); - int id = abg->node[src_id].max_out_id, i = 0; - while (id != sink_id) { - (*cons_seq)[0][i++] = abg->node[id].base; - id = abg->node[id].max_out_id; - } - (*cons_l)[0] = i; - return i; -} - -void abpoa_generate_consensus_cov(abpoa_graph_t *abg, int src_id, int sink_id, int ***cons_cov) { - *cons_cov = (int**)_err_malloc(sizeof(int*)); - (*cons_cov)[0] = (int*)_err_malloc(sizeof(int) * abg->node_n); - int id = abg->node[src_id].max_out_id; - int left_w, right_w; - int i, j, in_id, cons_i = 0; - while (id != sink_id) { - // for each id: get max{left_weigth, right_weight} - left_w = right_w = 0; - for (i = 0; i < abg->node[id].in_edge_n; ++i) { - in_id = abg->node[id].in_id[i]; - for (j = 0; j < abg->node[in_id].out_edge_n; ++j) { - if (abg->node[in_id].out_id[j] == id) { - left_w += abg->node[in_id].out_weight[j]; - } - } - } - for (i = 0; i < abg->node[id].out_edge_n; ++i) { - right_w += abg->node[id].out_weight[i]; - } - (*cons_cov)[0][cons_i++] = MAX_OF_TWO(left_w, right_w); - id = abg->node[id].max_out_id; - } -} - -void abpoa_traverse_min_flow(abpoa_graph_t *abg, int src_id, int sink_id, int *out_degree) { - int *id, i, cur_id, in_id, out_id, max_id; int max_w, max_out_i, min_w; - kdq_int_t *q = kdq_init_int(); kdq_push_int(q, sink_id); - int *max_out_weight = (int*)_err_malloc(abg->node_n * sizeof(int)); - // reverse Breadth-First-Search - while ((id = kdq_shift_int(q)) != 0) { - cur_id = *id; - if (cur_id == sink_id) { - abg->node[sink_id].max_out_id = -1; - max_out_weight[sink_id] = INT32_MAX; - } else { - max_w = INT32_MIN, max_id = -1, max_out_i = -1, min_w = INT32_MAX; - for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { - out_id = abg->node[cur_id].out_id[i]; - min_w = MIN_OF_TWO(max_out_weight[out_id], abg->node[cur_id].out_weight[i]); - if (max_w < min_w) { - max_w = min_w; - max_id = out_id; - max_out_i = i; - } else if (max_w == min_w && abg->node[cur_id].out_weight[max_out_i] <= abg->node[cur_id].out_weight[i]) { - max_id = out_id; - max_out_i = i; - } - } - abg->node[cur_id].max_out_id = max_id; - max_out_weight[cur_id] = max_w; - } - - if (cur_id == src_id) { - kdq_destroy_int(q); - goto MF_CONS; - } - for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) { - in_id = abg->node[cur_id].in_id[i]; - if (--out_degree[in_id] == 0) kdq_push_int(q, in_id); - } - } -MF_CONS: - free(max_out_weight); -} - -// heaviest_bundling -// 1. argmax{cur->weight} -// 2. argmax{out_node->weight} -void abpoa_heaviest_bundling(abpoa_graph_t *abg, int src_id, int sink_id, int *out_degree, int ***cons_cov) { - int *id, i, cur_id, in_id, out_id, max_id; int max_w, out_w; - - int *score = (int*)_err_malloc(abg->node_n * sizeof(int)); - kdq_int_t *q = kdq_init_int(); - kdq_push_int(q, sink_id); - // reverse Breadth-First-Search - while ((id = kdq_shift_int(q)) != 0) { - cur_id = *id; - if (cur_id == sink_id) { - abg->node[sink_id].max_out_id = -1; - score[sink_id] = 0; - } else { - max_id = -1; - if (cur_id == src_id) { - int path_score = -1, path_max_w = -1; - for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { - out_id = abg->node[cur_id].out_id[i]; - out_w = abg->node[cur_id].out_weight[i]; - if (out_w > path_max_w || (out_w == path_max_w && score[out_id] > path_score)) { - max_id = out_id; - path_score = score[out_id]; - path_max_w = out_w; - } - } - abg->node[src_id].max_out_id = max_id; - kdq_destroy_int(q); - goto HB_CONS; - } else { - max_w = INT32_MIN; - for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { - out_id = abg->node[cur_id].out_id[i]; - out_w = abg->node[cur_id].out_weight[i]; - if (max_w < out_w) { - max_w = out_w; - max_id = out_id; - } else if (max_w == out_w && score[max_id] <= score[out_id]) { - max_id = out_id; - } - } - score[cur_id] = max_w + score[max_id]; - abg->node[cur_id].max_out_id = max_id; - } - } - for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) { - in_id = abg->node[cur_id].in_id[i]; - if (--out_degree[in_id] == 0) - kdq_push_int(q, in_id); - } - } -HB_CONS: - if (cons_cov != NULL) abpoa_generate_consensus_cov(abg, src_id, sink_id, cons_cov); - free(score); -} - -void abpoa_heaviest_column_consensus(abpoa_graph_t *abg, int **rc_weight, int **msa_node_id, int src_id, int sink_id, int msa_l, int n_seq, int ***cons_cov) { - int i, j, w, max_base, max_w, gap_w; - int last_id = src_id, cur_id, cons_i = 0; - if (cons_cov != NULL) { - *cons_cov = (int**)_err_malloc(sizeof(int*)); - (*cons_cov)[0] = (int*)_err_malloc(sizeof(int) * abg->node_n); - } - for (i = 0; i < msa_l; ++i) { - max_w = 0, max_base = 5, gap_w = n_seq; - for (j = 0; j < 4; ++j) { - w = rc_weight[i][j]; - if (w > max_w) { - max_base = j, max_w = w; - } - gap_w -= w; - } - if (max_w >= gap_w) { - cur_id = msa_node_id[i][max_base]; - abg->node[last_id].max_out_id = cur_id; - last_id = cur_id; - if (cons_cov != NULL) - (*cons_cov)[0][cons_i++] = max_w; - } - } - abg->node[last_id].max_out_id = sink_id; -} - -void abpoa_heaviest_column(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq, int ***cons_cov) { - abpoa_set_msa_rank(abg, src_id, sink_id); - - int i, msa_l = abg->node_id_to_msa_rank[sink_id] - 1; - int **rc_weight = (int**)_err_malloc(sizeof(int*) * msa_l); - int **msa_node_id = (int**)_err_malloc(sizeof(int*) * msa_l); - for (i = 0; i < msa_l; ++i) { - rc_weight[i] = (int*)_err_calloc(5, sizeof(int)); // ACGT - rc_weight[i][4] = n_seq; - msa_node_id[i] = (int*)_err_calloc(5, sizeof(int)); - } - abpoa_set_row_column_weight(abg, rc_weight, msa_node_id); - - abpoa_heaviest_column_consensus(abg, rc_weight, msa_node_id, src_id, sink_id, msa_l, n_seq, cons_cov); - for (i = 0; i < msa_l; ++i) { - free(rc_weight[i]); free(msa_node_id[i]); - } free(rc_weight); free(msa_node_id); -} - -void add_het_read_ids(int *init, uint64_t **het_read_ids, uint8_t **het_cons_base, uint64_t **read_ids, int het_n, int *multip_i, int read_ids_n) { - int i, j; - if (*init) { - for (i = 0; i < 2; ++i) { - for (j = 0; j < read_ids_n; ++j) het_read_ids[i][j] = read_ids[multip_i[i]][j]; - het_cons_base[het_n][0] = multip_i[0]; - het_cons_base[het_n][1] = multip_i[1]; - } - *init = 0; - } else { - int ovlp_n, max_ovlp_n, max_clu_i; - uint64_t *ids, b; - max_ovlp_n = max_clu_i = -1; - ids = read_ids[multip_i[0]]; - for (i = 0; i < 2; ++i) { // het_read_ids - ovlp_n = 0; - for (j = 0; j < read_ids_n; ++j) { - b = het_read_ids[i][j] & ids[j]; - ovlp_n += get_bit_cnt4(ab_bit_table16, b); - } - if (ovlp_n > max_ovlp_n) { - max_ovlp_n = ovlp_n; - max_clu_i = i; - } - } - // 0 & i, 1 & 1-i - for (i = 0; i < read_ids_n; ++i) { - het_read_ids[max_clu_i][i] |= read_ids[multip_i[0]][i]; - het_read_ids[1-max_clu_i][i] |= read_ids[multip_i[1]][i]; - } - het_cons_base[het_n][max_clu_i] = multip_i[0]; - het_cons_base[het_n][1-max_clu_i] = multip_i[1]; - } -} - -int set_clu_read_ids(int **clu_read_ids, int *clu_read_ids_n, int n_seq, double min_freq, uint64_t ***read_ids, uint8_t **het_cons_base, int *het_pos, int het_n) { - int i, j, seq_i, pos, clu_base; - int **diff = (int**)_err_malloc(sizeof(int*) * n_seq); - for (i = 0; i < n_seq; ++i) { - diff[i] = (int*)_err_malloc(2 * sizeof(int)); - diff[i][0] = het_n, diff[i][1] = het_n; - } - uint64_t seq_b, one = 1; int seq_bit; - for (seq_i = 0; seq_i < n_seq; ++seq_i) { - seq_bit = seq_i / 64; seq_b = one << (seq_i & 0x3f); - for (i = 0; i < het_n; ++i) { - pos = het_pos[i]; - for (j = 0; j < 2; ++j) { - clu_base = het_cons_base[i][j]; - if (seq_b & read_ids[pos][clu_base][seq_bit]) { - --diff[seq_i][j]; break; - } - } - } - } - for (i = 0; i < n_seq; ++i) { - if (diff[i][0] < diff[i][1]) { - clu_read_ids[0][clu_read_ids_n[0]++] = i; - } else if (diff[i][1] < diff[i][0]) { - clu_read_ids[1][clu_read_ids_n[1]++] = i; - } - } - for (i = 0; i < n_seq; ++i) free(diff[i]); free(diff); - - int min_n = MIN_OF_TWO(clu_read_ids_n[0], clu_read_ids_n[1]); - int clu_n = (min_n >= (int)(n_seq * min_freq)) ? 2 : 1; - return clu_n; -} - -int abpoa_diploid_ids(uint64_t ***read_ids, int **rc_weight, int msa_l, int n_seq, double min_freq, int read_ids_n, int **clu_read_ids, int *clu_read_ids_n) { - int i, j; - uint64_t **het_read_ids = (uint64_t**)_err_malloc(sizeof(uint64_t*) * 2 ); - for (i = 0; i < 2; ++i) het_read_ids[i] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); - int *het_pos = (int*)_err_malloc(sizeof(int) * msa_l), het_n = 0; - uint8_t **het_cons_base = (uint8_t**)_err_malloc(sizeof(uint8_t*) * msa_l); - for (i = 0; i < msa_l; ++i) het_cons_base[i] = (uint8_t*)_err_malloc(sizeof(uint8_t) * 2); - int min_w = MAX_OF_TWO(1, n_seq * min_freq); - int init = 1, tmp, multip_i[2], multip_n, w; - // collect het nodes - for (i = 0; i < msa_l; ++i) { - multip_n = 0; - for (j = 0; j < 5; ++j) { - w = rc_weight[i][j]; - if (w >= min_w) { - if (multip_n == 2) { - multip_n = 0; - break; - } - multip_i[multip_n++] = j; - } - } - if (multip_n == 2) { - if (rc_weight[i][multip_i[0]] < rc_weight[i][multip_i[1]]) { - tmp = multip_i[0]; multip_i[0] = multip_i[1]; multip_i[1] = tmp; - } - // iteratively update read_ids and cons-base for each cluster - // read_ids |= - // cons_base1[i++] = ; cons_base2[i++] = ; - add_het_read_ids(&init, het_read_ids, het_cons_base, read_ids[i], het_n, multip_i, read_ids_n); - het_pos[het_n++] = i; - } - } - int clu_n; - if (het_n == 0) clu_n = 1; - else { - // for each read, determine cluster based on diff with two cons-bases - // return two group of read_ids - clu_n = set_clu_read_ids(clu_read_ids, clu_read_ids_n, n_seq, min_freq, read_ids, het_cons_base, het_pos, het_n); - } - - for (i = 0; i < msa_l; ++i) free(het_cons_base[i]); - for (i = 0; i < 2; ++i) free(het_read_ids[i]); - free(het_cons_base); free(het_read_ids); free(het_pos); - return clu_n; -} - -void abpoa_diploid_heaviest_column(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq, double min_freq, FILE *out_fp, uint8_t ***cons_seq, int **cons_l, int *cons_n) { - abpoa_set_msa_rank(abg, src_id, sink_id); - int i, j, msa_l = abg->node_id_to_msa_rank[sink_id] - 1; - int read_ids_n = (n_seq-1)/64+1; - uint64_t ***read_ids = (uint64_t***)_err_malloc(sizeof(uint64_t**) * msa_l); - for (i = 0; i < msa_l; ++i) { - read_ids[i] = (uint64_t**)_err_malloc(sizeof(uint64_t*) * 5); - for (j = 0; j < 5; ++j) read_ids[i][j] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); - } - int **rc_weight = (int**)_err_malloc(msa_l * sizeof(int*)); - int **msa_node_id = (int**)_err_malloc(msa_l * sizeof(int*)); - for (i = 0; i < msa_l; ++i) { - rc_weight[i] = (int*)_err_calloc(5, sizeof(int)); // ACGT - rc_weight[i][4] = n_seq; - msa_node_id[i] = (int*)_err_calloc(5, sizeof(int)); // ACGT - } - abpoa_set_row_column_ids_weight(abg, read_ids, rc_weight, msa_node_id, msa_l, n_seq, read_ids_n); - - int **clu_read_ids = (int**)_err_malloc(sizeof(int*) * 2), *clu_read_ids_n; - for (i = 0; i < 2; ++i) clu_read_ids[i] = (int*)_err_malloc(sizeof(int) * n_seq); - clu_read_ids_n = (int*)_err_calloc(2, sizeof(int)); - int clu_n = abpoa_diploid_ids(read_ids, rc_weight, msa_l, n_seq, min_freq, read_ids_n, clu_read_ids, clu_read_ids_n); - if (clu_n == 1) { - abpoa_heaviest_column_consensus(abg, rc_weight, msa_node_id, src_id, sink_id, msa_l, n_seq, NULL); - if (out_fp) output_consensus(0, abg, src_id, sink_id, NULL, n_seq, out_fp); - if (cons_n) { - *cons_n = 1; abpoa_store_consensus(abg, src_id, sink_id, cons_seq, cons_l); - } - } else abpoa_heaviest_column_multip_consensus(read_ids, clu_read_ids, clu_read_ids_n, clu_n, read_ids_n, msa_l, out_fp, cons_seq, cons_l, cons_n); - - for (i = 0; i < msa_l; ++i) { - for (j = 0; j < 5; ++j) { - free(read_ids[i][j]); - } - free(read_ids[i]); free(rc_weight[i]); free(msa_node_id[i]); - } free(read_ids); free(rc_weight); free(msa_node_id); - for (i = 0; i < 2; ++i) free(clu_read_ids[i]); free(clu_read_ids); free(clu_read_ids_n); -} - -// should always topological sort first, then generate consensus -int abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***cons_seq, int ***cons_cov, int **cons_l, int *cons_n) { - abpoa_graph_t *abg = ab->abg; - if (abg->node_n <= 2) return 0; - int i, _cons_l = 0, *out_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); - int **_cons_cov; - if (abpt->out_fq == 0 && cons_cov == NULL) _cons_cov = NULL; - int n_seq = ab->abs->n_seq; - for (i = 0; i < abg->node_n; ++i) { - out_degree[i] = abg->node[i].out_edge_n; - } - - if (abpt->is_diploid) { - abpoa_diploid_heaviest_column(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, n_seq, abpt->min_freq, out_fp, cons_seq, cons_l, cons_n); - } else { - if (abpt->cons_agrm == ABPOA_HB) abpoa_heaviest_bundling(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, out_degree, &_cons_cov); - else if (abpt->cons_agrm == ABPOA_HC) abpoa_heaviest_column(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, n_seq, &_cons_cov); - else err_fatal(__func__, "Unknown consensus calling algorithm: %d.", abpt->cons_agrm); - if (out_fp) _cons_l = output_consensus(abpt->out_fq, abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, _cons_cov, n_seq, out_fp); - if (cons_n && cons_l && cons_seq) { - *cons_n = 1; - _cons_l = abpoa_store_consensus(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, cons_seq, cons_l); - } - if (cons_cov && _cons_l) { - *cons_cov = (int**)_err_malloc(sizeof(int*)); - (*cons_cov)[0] = (int*)_err_malloc(sizeof(int) * _cons_l); - for (i = 0; i < _cons_l; ++i) (*cons_cov)[0][i] = _cons_cov[0][i]; - } - - if (_cons_cov) { - free(_cons_cov[0]); free(_cons_cov); - } - } - abg->is_called_cons = 1; free(out_degree); - return _cons_l; -} - -void abpoa_set_msa_seq(abpoa_node_t node, int rank, uint8_t **msa_seq) { - int i, b, read_id; uint8_t base = node.base; - uint64_t num, tmp; - b = 0; - for (i = 0; i < node.read_ids_n; ++i) { - num = node.read_ids[i]; - while (num) { - tmp = num & -num; - read_id = ilog2_64(tmp); - // fprintf(stderr, "%d -> %d\n", node.node_id, read_id); - msa_seq[b+read_id][rank-1] = base; - num ^= tmp; - } - b += 64; - } -} - -void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp, uint8_t ***msa_seq, int *msa_l) { - abpoa_graph_t *abg = ab->abg; - if (abg->node_n <= 2) return; - abpoa_set_msa_rank(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID); - - abpoa_seq_t *abs = ab->abs; - int i, j, k, aligned_id, rank, n_seq = abs->n_seq; - uint8_t **_msa_seq = (uint8_t**)_err_malloc(n_seq * sizeof(uint8_t*)); - int _msa_l = abg->node_id_to_msa_rank[ABPOA_SINK_NODE_ID]-1; - - for (i = 0; i < n_seq; ++i) { - _msa_seq[i] = (uint8_t*)_err_malloc(_msa_l * sizeof(uint8_t)); - for (j = 0; j < _msa_l; ++j) - _msa_seq[i][j] = 27; - } - - if (out_fp && abpt->out_msa_header == 0) fprintf(out_fp, ">Multiple_sequence_alignment\n"); - for (i = 2; i < abg->node_n; ++i) { - // get msa rank - rank = abpoa_graph_node_id_to_msa_rank(abg, i); - for (k = 0; k < abg->node[i].aligned_node_n; ++k) { - aligned_id = abg->node[i].aligned_node_id[k]; - rank = MAX_OF_TWO(rank, abpoa_graph_node_id_to_msa_rank(abg, aligned_id)); - } - // assign seq - abpoa_set_msa_seq(abg->node[i], rank, _msa_seq); - } - if (out_fp) { - for (i = 0; i < n_seq; ++i) { - if (abpt->out_msa_header && abs->name[i].l > 0) { - if (abs->is_rc[i]) fprintf(out_fp, ">%s_reverse_complement\n", abs->name[i].s); - else fprintf(out_fp, ">%s\n", abs->name[i].s); - } - for (j = 0; j < _msa_l; ++j) fprintf(out_fp, "%c", ab_char256_table[_msa_seq[i][j]]); - fprintf(out_fp, "\n"); - } - if (abpt->out_cons) { // RC-MSA for consensus sequence - if (abpt->out_msa_header && abs->name[0].l > 0) fprintf(out_fp, ">Consensus_sequence\n"); - - i = abg->node[ABPOA_SRC_NODE_ID].max_out_id; - int last_rank = 1; - while (i != ABPOA_SINK_NODE_ID) { - rank = abpoa_graph_node_id_to_msa_rank(abg, i); - for (k = 0; k < abg->node[i].aligned_node_n; ++k) { - aligned_id = abg->node[i].aligned_node_id[k]; - rank = MAX_OF_TWO(rank, abpoa_graph_node_id_to_msa_rank(abg, aligned_id)); - } - // last_rank -> rank : - - for (k = last_rank; k < rank; ++k) fprintf(out_fp, "-"); - // rank : base - fprintf(out_fp, "%c", ab_char256_table[abg->node[i].base]); - last_rank = rank+1; - i = abg->node[i].max_out_id; - } - // last_rank -> msa_l:- - for (k = last_rank; k <= _msa_l; ++k) fprintf(out_fp, "-"); - fprintf(out_fp, "\n"); - } - } - if (msa_l) { - *msa_l = _msa_l; *msa_seq = _msa_seq; - } else { - for (i = 0; i < n_seq; ++i) free(_msa_seq[i]); free(_msa_seq); - } -} - -void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { - abpoa_seq_t *abs = ab->abs; abpoa_graph_t *abg = ab->abg; - if (abg->node_n <= 2) return; - - // traverse graph - int *in_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); - int n_seq = abs->n_seq; - int **read_paths = (int**)_err_malloc(n_seq * sizeof(int*)), *read_path_i = (int*)_err_calloc(n_seq, sizeof(int)); - int i, j, cur_id, pre_id, out_id, *id; - for (i = 0; i < abg->node_n; ++i) in_degree[i] = abg->node[i].in_edge_n; - for (i = 0; i < n_seq; ++i) read_paths[i] = (int*)_err_malloc(abg->node_n * sizeof(int)); - - // output comment and header - int nl = 0; - for (i = 2; i < abg->node_n; ++i) nl += abg->node[i].in_edge_n; - fprintf(out_fp, "H\tVN:Z:1.0\tNS:i:%d\tNL:i:%d\tNP:i:%d\n", abg->node_n-2, nl - abg->node[ABPOA_SRC_NODE_ID].out_edge_n, n_seq + abpt->out_cons); - - kdq_int_t *q = kdq_init_int(); - - // Breadth-First-Search - kdq_push_int(q, ABPOA_SRC_NODE_ID); - while ((id = kdq_shift_int(q)) != 0) { - cur_id = *id; - if (cur_id == ABPOA_SINK_NODE_ID) { - kdq_destroy_int(q); - break; - } else { - if (cur_id != ABPOA_SRC_NODE_ID) { - // output node - fprintf(out_fp, "S\t%d\t%c\n", cur_id-1, ab_char256_table[abg->node[cur_id].base]); - // output all links based pre_ids - for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) { - pre_id = abg->node[cur_id].in_id[i]; - if (pre_id != ABPOA_SRC_NODE_ID) - fprintf(out_fp, "L\t%d\t+\t%d\t+\t0M\n", pre_id-1, cur_id-1); - } - // add node id to read path - int b, read_id; uint64_t num, tmp; - b = 0; - for (i = 0; i < abg->node[cur_id].read_ids_n; ++i) { - num = abg->node[cur_id].read_ids[i]; - while (num) { - tmp = num & -num; - read_id = ilog2_64(tmp); - read_paths[b+read_id][read_path_i[b+read_id]++] = cur_id-1; - num ^= tmp; - } - b += 64; - } - } - for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { - out_id = abg->node[cur_id].out_id[i]; - if (--in_degree[out_id] == 0) { - kdq_push_int(q, out_id); - } - } - } - } - // output read paths - for (i = 0; i < n_seq; ++i) { - if (abs->name[i].l > 0) fprintf(out_fp, "P\t%s\t", abs->name[i].s); - else fprintf(out_fp, "P\t%d\t", i+1); - if (abs->is_rc[i]) { - for (j = read_path_i[i]-1; j >= 0; --j) { - fprintf(out_fp, "%d-", read_paths[i][j]); - if (j != 0) fprintf(out_fp, ","); - else fprintf(out_fp, "\t*\n"); - } - } else { - for (j = 0; j < read_path_i[i]; ++j) { - fprintf(out_fp, "%d+", read_paths[i][j]); - if (j != read_path_i[i]-1) fprintf(out_fp, ","); - else fprintf(out_fp, "\t*\n"); - } - } - } - if (abpt->out_cons) { - abpoa_generate_consensus(ab, abpt, NULL, NULL, NULL, NULL, NULL); - int id = abg->node[ABPOA_SRC_NODE_ID].max_out_id; - fprintf(out_fp, "P\tConsensus_sequence\t"); - while (1) { - fprintf(out_fp, "%d+", id-1); - id = abg->node[id].max_out_id; - if (id != ABPOA_SINK_NODE_ID) fprintf(out_fp, ","); - else { - fprintf(out_fp, "\t*\n"); - break; - } - } - } - free(in_degree); - for (i = 0; i < n_seq; ++i) free(read_paths[i]); - free(read_paths); free(read_path_i); -} - int abpoa_get_aligned_id(abpoa_graph_t *abg, int node_id, uint8_t base) { int i, aln_id; abpoa_node_t *node = abg->node; @@ -1064,43 +414,55 @@ int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_e int ret = 1; if (from_id < 0 || from_id >= abg->node_n || to_id < 0 || to_id >= abg->node_n) err_fatal(__func__, "node_n: %d\tfrom_id: %d\tto_id: %d.", abg->node_n, from_id, to_id); int out_edge_n = abg->node[from_id].out_edge_n; + int edge_exist = 0; + int out_edge_i = -1; if (check_edge) { int i; for (i = 0; i < out_edge_n; ++i) { if (abg->node[from_id].out_id[i] == to_id) { // edge exists abg->node[from_id].out_weight[i] += w; // update weight on existing edge // update label id - goto ADD_READ_ID; - // return; + edge_exist = 1; + out_edge_i = i; + break; } } } // add edge - /// in edge - abpoa_realloc_graph_edge(abg, 0, to_id); - abg->node[to_id].in_id[abg->node[to_id].in_edge_n] = from_id; - ++abg->node[to_id].in_edge_n; - /// out edge - abpoa_realloc_graph_edge(abg, 1, from_id); - abg->node[from_id].out_id[out_edge_n] = to_id; - abg->node[from_id].out_weight[out_edge_n] = w; // initial weight for new edge - ++abg->node[from_id].out_edge_n; + if (edge_exist == 0) { + /// in edge + abpoa_realloc_graph_edge(abg, 0, to_id, 0); + abg->node[to_id].in_id[abg->node[to_id].in_edge_n] = from_id; + ++abg->node[to_id].in_edge_n; + /// out edge + abpoa_realloc_graph_edge(abg, 1, from_id, add_read_id); + abg->node[from_id].out_id[out_edge_n] = to_id; + abg->node[from_id].out_weight[out_edge_n] = w; // initial weight for new edge + out_edge_i = out_edge_n; + ++abg->node[from_id].out_edge_n; + } // add read_id to out edge -ADD_READ_ID: if (add_read_id) { + if (out_edge_i < 0) err_fatal_simple("No edge found."); + if (read_ids_n <= 0) err_fatal(__func__, "Unexpected read_ids_n: %d.", read_ids_n); + int i, j; abpoa_node_t *from_node = abg->node + from_id; if (from_node->read_ids_n == 0) { - from_node->read_ids = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t*)); + for (i = 0; i < from_node->out_edge_m; ++i) { + from_node->read_ids[i] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); + } from_node->read_ids_n = read_ids_n; } else if (from_node->read_ids_n < read_ids_n) { - from_node->read_ids = (uint64_t*)_err_realloc(from_node->read_ids, read_ids_n * sizeof(uint64_t*)); - int i; - for (i = from_node->read_ids_n; i < read_ids_n; ++i) from_node->read_ids[i] = 0; + // reallocate from_node->read_ids + for (i = 0; i < from_node->out_edge_m; ++i) { + from_node->read_ids[i] = (uint64_t*)_err_realloc(from_node->read_ids[i], read_ids_n * sizeof(uint64_t)); + for (j = from_node->read_ids_n; j < read_ids_n; ++j) from_node->read_ids[i][j] = 0; + } from_node->read_ids_n = read_ids_n; } - abpoa_set_read_id(from_node->read_ids, read_id); + abpoa_set_read_id(from_node->read_ids[out_edge_i], read_id); } return ret; } @@ -1290,16 +652,16 @@ int abpoa_add_graph_alignment(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *seq, int // reset allocated memery everytime init the graph // * node // * index_to_node_id/node_id_to_index/node_id_to_max_remain, max_pos_left/right -void abpoa_reset_graph(abpoa_t *ab, abpoa_para_t *abpt, int qlen) { +void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen) { abpoa_graph_t *abg = ab->abg; - int i, k, node_m; + int i, j, k, node_m; abg->is_topological_sorted = abg->is_called_cons = 0; for (i = 0; i < abg->node_n; ++i) { - abg->node[i].in_edge_n = abg->node[i].out_edge_n = abg->node[i].aligned_node_n = 0; - if (abpt->use_read_ids) { - for (k = 0; k < abg->node[i].read_ids_n; ++k) - abg->node[i].read_ids[k] = 0; + for (j = 0; j < abg->node[i].out_edge_n; ++j) { + for (k = 0; k < abg->node[i].read_ids_n; ++k) abg->node[i].read_ids[j][k] = 0; } + abg->node[i].in_edge_n = abg->node[i].out_edge_n = abg->node[i].aligned_node_n = 0; + } abg->node_n = 2; if (qlen+2 > abg->node_m) { @@ -1310,7 +672,7 @@ void abpoa_reset_graph(abpoa_t *ab, abpoa_para_t *abpt, int qlen) { abg->node_m = abg->index_rank_m = node_m; abg->index_to_node_id = (int*)_err_realloc(abg->index_to_node_id, node_m * sizeof(int)); abg->node_id_to_index = (int*)_err_realloc(abg->node_id_to_index, node_m * sizeof(int)); - if (abpt->out_msa || abpt->cons_agrm == ABPOA_HC || abpt->is_diploid) + if (abpt->out_msa || abpt->max_n_cons > 1) abg->node_id_to_msa_rank = (int*)_err_realloc(abg->node_id_to_msa_rank, node_m * sizeof(int)); if (abpt->wb >= 0) { abg->node_id_to_max_pos_left = (int*)_err_realloc(abg->node_id_to_max_pos_left, node_m * sizeof(int)); @@ -1321,4 +683,34 @@ void abpoa_reset_graph(abpoa_t *ab, abpoa_para_t *abpt, int qlen) { } } // fprintf(stderr, "qlen: %d, node_n: %d, node_m: %d\n", qlen, abg->node_n, abg->node_m); + // reset abs + ab->abs->n_seq = 0; + // reset cons + abpoa_cons_t *abc = ab->abc; + if (abc->n_cons > 0) { + if (abc->clu_n_seq != NULL) free(abc->clu_n_seq); + if (abc->cons_len != NULL) free(abc->cons_len); + if (abc->cons_node_ids != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->cons_node_ids[i]); free(abc->cons_node_ids); + } + if (abc->cons_base != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->cons_base[i]); free(abc->cons_base); + } + if (abc->cons_cov != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->cons_cov[i]); free(abc->cons_cov); + } + if (abc->clu_read_ids != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->clu_read_ids[i]); free(abc->clu_read_ids); + } + if (abc->cons_phred_score != NULL) { + for (i = 0; i < abc->n_cons; ++i) free(abc->cons_phred_score[i]); free(abc->cons_phred_score); + } + } + if (abc->msa_len > 0) { + if (abc->msa_base != NULL) { + for (i = 0; i < abc->n_seq+abc->n_cons; ++i) free(abc->msa_base[i]); + free(abc->msa_base); + } + } + abc->n_seq = abc->n_cons = abc->msa_len = 0; } diff --git a/src/abpoa_graph.h b/src/abpoa_graph.h index da492ce..c565189 100644 --- a/src/abpoa_graph.h +++ b/src/abpoa_graph.h @@ -18,11 +18,9 @@ extern "C" { #endif -void set_65536_table(void); -void set_bit_table16(void); - int abpoa_get_aligned_id(abpoa_graph_t *abg, int node_id, uint8_t base); void abpoa_add_graph_aligned_node(abpoa_graph_t *abg, int node_id, int aligned_id); +void abpoa_set_msa_rank(abpoa_graph_t *abg, int src_id, int sink_id); abpoa_graph_t *abpoa_init_graph(void); void abpoa_free_graph(abpoa_graph_t *graph); diff --git a/src/abpoa_output.c b/src/abpoa_output.c new file mode 100644 index 0000000..c7104d6 --- /dev/null +++ b/src/abpoa_output.c @@ -0,0 +1,899 @@ +#include +#include +#include +#include "abpoa.h" +#include "abpoa_graph.h" +#include "utils.h" +#include "abpoa_seq.h" +#include "kdq.h" + +extern char ab_char256_table[256]; +char ab_LogTable65536[65536]; +char ab_bit_table16[65536]; + +#define NAT_E 2.718281828459045 +static const char ab_LogTable256[256] = { +#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n + -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), + LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) +}; + +static inline int ilog2_32(uint32_t v) +{ + uint32_t t, tt; + if ((tt = v>>16)) return (t = tt>>8) ? 24 + ab_LogTable256[t] : 16 + ab_LogTable256[tt]; + return (t = v>>8) ? 8 + ab_LogTable256[t] : ab_LogTable256[v]; +} + +void set_65536_table(void) { + int i; + for (i = 0; i < 65536; ++i) { + ab_LogTable65536[i] = ilog2_32(i); + } +} + +void set_bit_table16(void) { + int i; ab_bit_table16[0] = 0; + for (i = 0; i != 65536; ++i) ab_bit_table16[i] = (i&1) + ab_bit_table16[i>>1]; +} + +#define get_bit_cnt4(table, b) (table[(b)&0xffff] + table[(b)>>16&0xffff] + table[(b)>>32&0xffff] + table[(b)>>48&0xffff]) + +static inline int ilog2_64(uint64_t v) { + uint64_t t, tt; + if ((tt = v >> 32)) return (t = tt >> 16) ? 48 + ab_LogTable65536[t] : 32 + ab_LogTable65536[tt]; + return (t = v>>16) ? 16 + ab_LogTable65536[t] : ab_LogTable65536[v]; +} + +KDQ_INIT(int) +#define kdq_int_t kdq_t(int) + +static inline int get_read_cnt(uint64_t *read_ids, int read_ids_n) { + int i, c; + for (i = c =0; i < read_ids_n; ++i) { + c += get_bit_cnt4(ab_bit_table16, read_ids[i]); + } + return c; +} + +abpoa_cons_t *abpoa_allocate_rc_msa(abpoa_cons_t *abc, int msa_len, int n_seq, int n_cons) { + int i; + abc->n_seq = n_seq; abc->msa_len = msa_len; + abc->msa_base = (uint8_t**)_err_malloc((n_seq+n_cons) * sizeof(uint8_t*)); + for (i = 0; i < n_seq+n_cons; ++i) { + abc->msa_base[i] = (uint8_t*)_err_malloc(msa_len * sizeof(uint8_t)); + } + return abc; +} + +void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { + if (out_fp == NULL) return; + int i, j; + abpoa_seq_t *abs = ab->abs; abpoa_cons_t *abc = ab->abc; + if (abc->msa_len <= 0) return; + for (i = 0; i < abs->n_seq; ++i) { + if (abs->name[i].l > 0) { + if (abs->is_rc[i]) fprintf(out_fp, ">%s_reverse_complement\n", abs->name[i].s); + else fprintf(out_fp, ">%s\n", abs->name[i].s); + } else { + fprintf(out_fp, ">Seq_%d\n", i+1); + } + for (j = 0; j < abc->msa_len; ++j) fprintf(out_fp, "%c", ab_char256_table[abc->msa_base[i][j]]); + fprintf(out_fp, "\n"); + } + if (abpt->out_cons) { // RC-MSA for consensus sequence + int cons_i; + for (cons_i = 0; cons_i < abc->n_cons; cons_i++) { + fprintf(out_fp, ">Consensus_sequence"); + if (abc->n_cons > 1) fprintf(out_fp, "_%d", cons_i+1); + fprintf(out_fp, "\n"); + for (i = 0; i < abc->msa_len; ++i) fprintf(out_fp, "%c", ab_char256_table[abc->msa_base[abc->n_seq+cons_i][i]]); + fprintf(out_fp, "\n"); + } + } +} + +void abpoa_set_msa_seq(abpoa_node_t node, int rank, uint8_t **msa_base) { + int i, j, b, read_id; uint8_t base = node.base; + uint64_t num, tmp; + + b = 0; + for (i = 0; i < node.read_ids_n; ++i) { + for (j = 0; j < node.out_edge_n; ++j) { + num = node.read_ids[j][i]; + while (num) { + tmp = num & -num; + read_id = ilog2_64(tmp); + msa_base[b+read_id][rank-1] = base; + num ^= tmp; + } + } + b += 64; + } +} + +// only generate rc-msa, output in separated func +void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt) { + abpoa_graph_t *abg = ab->abg; + if (abg->node_n <= 2) return; + abpoa_set_msa_rank(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID); + if (abpt->out_cons) abpoa_generate_consensus(ab, abpt); + + abpoa_seq_t *abs = ab->abs; abpoa_cons_t *abc = ab->abc; + int i, j, aligned_id, n_seq = abs->n_seq; + int msa_len = abg->node_id_to_msa_rank[ABPOA_SINK_NODE_ID]-1; + + abpoa_allocate_rc_msa(abc, msa_len, n_seq, abc->n_cons); + for (i = 0; i < n_seq; ++i) { + for (j = 0; j < abc->msa_len; ++j) + abc->msa_base[i][j] = abpt->m; + } + + int rank; + // if (out_fp && abpt->out_msa_header == 0) fprintf(out_fp, ">Multiple_sequence_alignment\n"); + for (i = 2; i < abg->node_n; ++i) { + // get msa rank + rank = abpoa_graph_node_id_to_msa_rank(abg, i); + for (j = 0; j < abg->node[i].aligned_node_n; ++j) { + aligned_id = abg->node[i].aligned_node_id[j]; + rank = MAX_OF_TWO(rank, abpoa_graph_node_id_to_msa_rank(abg, aligned_id)); + } + // assign seq + abpoa_set_msa_seq(abg->node[i], rank, abc->msa_base); + } + if (abpt->out_cons) { + int cons_i, cur_id; + for (cons_i = 0; cons_i < abc->n_cons; cons_i++) { + for (i = 0; i < msa_len; ++i) abc->msa_base[n_seq+cons_i][i] = abpt->m; + for (i = 0; i < abc->cons_len[cons_i]; ++i) { + cur_id = abc->cons_node_ids[cons_i][i]; + rank = abpoa_graph_node_id_to_msa_rank(abg, cur_id); + for (j = 0; j < abg->node[cur_id].aligned_node_n; ++j) { + aligned_id = abg->node[cur_id].aligned_node_id[j]; + rank = MAX_OF_TWO(rank, abpoa_graph_node_id_to_msa_rank(abg, aligned_id)); + } + abc->msa_base[n_seq+cons_i][rank-1] = abc->cons_base[cons_i][i]; + } + } + } +} + +// generate & output gfa +void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { + if (out_fp == NULL) return; + abpoa_seq_t *abs = ab->abs; abpoa_graph_t *abg = ab->abg; + if (abg->node_n <= 2) return; + + // traverse graph + int *in_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); + int n_seq = abs->n_seq; + int **read_paths = (int**)_err_malloc(n_seq * sizeof(int*)), *read_path_i = (int*)_err_calloc(n_seq, sizeof(int)); + int i, j, cur_id, pre_id, out_id, *id; + for (i = 0; i < abg->node_n; ++i) in_degree[i] = abg->node[i].in_edge_n; + for (i = 0; i < n_seq; ++i) read_paths[i] = (int*)_err_malloc(abg->node_n * sizeof(int)); + + // output comment and header + int nl = 0; + for (i = 2; i < abg->node_n; ++i) nl += abg->node[i].in_edge_n; + fprintf(out_fp, "H\tVN:Z:1.0\tNS:i:%d\tNL:i:%d\tNP:i:%d\n", abg->node_n-2, nl - abg->node[ABPOA_SRC_NODE_ID].out_edge_n, n_seq + abpt->out_cons); + + kdq_int_t *q = kdq_init_int(); + + // Breadth-First-Search + kdq_push_int(q, ABPOA_SRC_NODE_ID); + while ((id = kdq_shift_int(q)) != 0) { + cur_id = *id; + if (cur_id == ABPOA_SINK_NODE_ID) { + kdq_destroy_int(q); + break; + } else { + if (cur_id != ABPOA_SRC_NODE_ID) { + // output node + fprintf(out_fp, "S\t%d\t%c\n", cur_id-1, ab_char256_table[abg->node[cur_id].base]); + // output all links based pre_ids + for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) { + pre_id = abg->node[cur_id].in_id[i]; + if (pre_id != ABPOA_SRC_NODE_ID) + fprintf(out_fp, "L\t%d\t+\t%d\t+\t0M\n", pre_id-1, cur_id-1); + } + // add node id to read path + int b, read_id; uint64_t num, tmp; + b = 0; + for (i = 0; i < abg->node[cur_id].read_ids_n; ++i) { + for (j = 0; j < abg->node[cur_id].out_edge_n; ++j) { + num = abg->node[cur_id].read_ids[j][i]; + while (num) { + tmp = num & -num; + read_id = ilog2_64(tmp); + read_paths[b+read_id][read_path_i[b+read_id]++] = cur_id-1; + num ^= tmp; + } + } + b += 64; + } + } + for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { + out_id = abg->node[cur_id].out_id[i]; + if (--in_degree[out_id] == 0) { + kdq_push_int(q, out_id); + } + } + } + } + // output read paths + for (i = 0; i < n_seq; ++i) { + if (abs->name[i].l > 0) fprintf(out_fp, "P\t%s\t", abs->name[i].s); + else fprintf(out_fp, "P\t%d\t", i+1); + if (abs->is_rc[i]) { + for (j = read_path_i[i]-1; j >= 0; --j) { + fprintf(out_fp, "%d-", read_paths[i][j]); + if (j != 0) fprintf(out_fp, ","); + else fprintf(out_fp, "\t*\n"); + } + } else { + for (j = 0; j < read_path_i[i]; ++j) { + fprintf(out_fp, "%d+", read_paths[i][j]); + if (j != read_path_i[i]-1) fprintf(out_fp, ","); + else fprintf(out_fp, "\t*\n"); + } + } + } + if (abpt->out_cons) { + abpoa_generate_consensus(ab, abpt); + abpoa_cons_t *abc = ab->abc; + int cons_i; + for (cons_i = 0; cons_i < abc->n_cons; ++cons_i) { + fprintf(out_fp, "P\tConsensus_sequence"); + if (abc->n_cons > 1) fprintf(out_fp, "_%d", cons_i+1); + fprintf(out_fp, "\t"); + for (i = 0; i < abc->cons_len[cons_i]; ++i) { + cur_id = abc->cons_node_ids[cons_i][i]; + fprintf(out_fp, "%d+", cur_id-1); + if (i != abc->cons_len[cons_i]-1) fprintf(out_fp, ","); + else fprintf(out_fp, "\t*\n"); + } + + } + } + free(in_degree); + for (i = 0; i < n_seq; ++i) free(read_paths[i]); + free(read_paths); free(read_path_i); +} + +int abpoa_cons_phred_score(int n_cov, int n_seq) { + if (n_cov > n_seq) err_fatal(__func__, "Error: unexpected n_cov/n_seq (%d/%d).", n_cov, n_seq); + double x, p; + x = 13.8 * (1.25 * n_cov / n_seq - 0.25); + p = 1 - 1.0 / (1.0 + pow(NAT_E, -1 * x)); + return (33 + (int)(-10 * log10(p) + 0.499)); +} + +int get_read_ids_clu_weight(uint64_t *cur_read_ids, int read_ids_n, uint64_t *clu_read_ids) { + int w = 0, i; uint64_t b; + for (i = 0; i < read_ids_n; ++i) { + b = cur_read_ids[i] & clu_read_ids[i]; + w += get_bit_cnt4(ab_bit_table16, b); + } + return w; +} + +int abpoa_consensus_cov(abpoa_graph_t *abg, int id, uint64_t *clu_read_ids) { + int i, j, in_id, left_w, right_w; + // for each id: get max{left_weigth, right_weight} + left_w = right_w = 0; + for (i = 0; i < abg->node[id].in_edge_n; ++i) { + in_id = abg->node[id].in_id[i]; + for (j = 0; j < abg->node[in_id].out_edge_n; ++j) { + if (abg->node[in_id].out_id[j] == id) { + left_w += get_read_ids_clu_weight(abg->node[in_id].read_ids[j], abg->node[in_id].read_ids_n, clu_read_ids); + break; + } + } + } + for (i = 0; i < abg->node[id].out_edge_n; ++i) { + right_w += get_read_ids_clu_weight(abg->node[id].read_ids[i], abg->node[id].read_ids_n, clu_read_ids); + } + return MAX_OF_TWO(left_w, right_w); +} + +void abpoa_set_hb_cons(abpoa_graph_t *abg, int **max_out_id, int n_cons, uint64_t **clu_read_ids, int src_id, int sink_id, abpoa_cons_t *abc) { + abc->n_cons = n_cons; + int i, j, cur_id; + for (i = 0; i < n_cons; ++i) { + cur_id = max_out_id[i][src_id]; + j = 0; + while (cur_id != sink_id) { + abc->cons_node_ids[i][j] = cur_id; + abc->cons_base[i][j] = abg->node[cur_id].base; + abc->cons_cov[i][j] = abpoa_consensus_cov(abg, cur_id, clu_read_ids[i]); + abc->cons_phred_score[i][j] = abpoa_cons_phred_score(abc->cons_cov[i][j], abc->clu_n_seq[i]); + ++j; + cur_id = max_out_id[i][cur_id]; + } + abc->cons_len[i] = j; + } +} + +int abpoa_consensus_cov1(abpoa_graph_t *abg, int id) { + int i, j, in_id, left_w, right_w; + // for each id: get max{left_weigth, right_weight} + left_w = right_w = 0; + for (i = 0; i < abg->node[id].in_edge_n; ++i) { + in_id = abg->node[id].in_id[i]; + for (j = 0; j < abg->node[in_id].out_edge_n; ++j) { + if (abg->node[in_id].out_id[j] == id) { + left_w += abg->node[in_id].out_weight[j]; + break; + } + } + } + for (i = 0; i < abg->node[id].out_edge_n; ++i) { + right_w += abg->node[id].out_weight[i]; + } + return MAX_OF_TWO(left_w, right_w); +} + +void abpoa_set_hb_cons1(abpoa_graph_t *abg, int *max_out_id, int src_id, int sink_id, abpoa_cons_t *abc) { + int i = 0, cur_id; + abc->n_cons = 1; + cur_id = max_out_id[src_id]; + while (cur_id != sink_id) { + abc->cons_node_ids[0][i] = cur_id; + abc->cons_base[0][i] = abg->node[cur_id].base; + abc->cons_cov[0][i] = abpoa_consensus_cov1(abg, cur_id); + abc->cons_phred_score[0][i] = abpoa_cons_phred_score(abc->cons_cov[0][i], abc->n_seq); + cur_id = max_out_id[cur_id]; + ++i; + } + abc->cons_len[0] = i; +} + +// heaviest_bundling +// 1. argmax{cur->weight} +// 2. argmax{out_node->weight} +void abpoa_heaviest_bundling(abpoa_graph_t *abg, int src_id, int sink_id, int *out_degree, abpoa_cons_t *abc) { + int *id, i, cur_id, in_id, out_id, max_id; int max_w, out_w; + int *score = (int*)_err_malloc(abg->node_n * sizeof(int)); + int *max_out_id = (int*)_err_malloc(abg->node_n * sizeof(int)); + + kdq_int_t *q = kdq_init_int(); + kdq_push_int(q, sink_id); + // reverse Breadth-First-Search + while ((id = kdq_shift_int(q)) != 0) { + cur_id = *id; + if (cur_id == sink_id) { + max_out_id[cur_id] = -1; + score[cur_id] = 0; + } else { + max_id = -1; + if (cur_id == src_id) { + int path_score = -1, path_max_w = -1; + for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { + out_id = abg->node[cur_id].out_id[i]; + out_w = abg->node[cur_id].out_weight[i]; + if (out_w > path_max_w || (out_w == path_max_w && score[out_id] > path_score)) { + max_id = out_id; + path_score = score[out_id]; + path_max_w = out_w; + } + } + max_out_id[cur_id] = max_id; + kdq_destroy_int(q); + break; + } else { + max_w = INT32_MIN; + for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { + out_id = abg->node[cur_id].out_id[i]; + out_w = abg->node[cur_id].out_weight[i]; + if (max_w < out_w) { + max_w = out_w; max_id = out_id; + } else if (max_w == out_w && score[max_id] <= score[out_id]) { + max_id = out_id; + } + } + score[cur_id] = max_w + score[max_id]; + max_out_id[cur_id] = max_id; + } + } + for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) { + in_id = abg->node[cur_id].in_id[i]; + if (--out_degree[in_id] == 0) kdq_push_int(q, in_id); + } + } + abc->clu_n_seq[0] = abc->n_seq; + // set cons read ids + for (i = 0; i < abc->n_seq; ++i) abc->clu_read_ids[0][i] = i; + abpoa_set_hb_cons1(abg, max_out_id, src_id, sink_id, abc); + free(score); free(max_out_id); +} + +void set_clu_read_ids(abpoa_cons_t *abc, uint64_t **read_ids, int cons_i, int n_seq) { + int n, i, j; uint64_t b, one = 1; + for (i = n = 0; i < n_seq; ++i) { + j = i / 64; b = i & 0x3f; + if (read_ids[cons_i][j] & (one << b)) { + abc->clu_read_ids[cons_i][n++] = i; + } + } + if (n != abc->clu_n_seq[cons_i]) + err_fatal(__func__, "Error in set cluster read ids. (%d, %d)", n, abc->clu_n_seq[cons_i]); +} + +void abpoa_multip_heaviest_bundling(abpoa_graph_t *abg, int src_id, int sink_id, int *out_degree, int n_clu, int read_ids_n, uint64_t **clu_read_ids, abpoa_cons_t *abc) { + int *id, cons_i, i, cur_id, in_id, out_id, max_id; int max_w, out_w; + + int *_out_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); + int *score = (int*)_err_malloc(abg->node_n * sizeof(int)); + int **max_out_id = (int**)_err_malloc(n_clu * sizeof(int*)); + for (i = 0; i < n_clu; ++i) max_out_id[i] = (int*)_err_malloc(abg->node_n * sizeof(int)); + + for (cons_i = 0; cons_i < n_clu; cons_i++) { + for (i = 0; i < abg->node_n; ++i) _out_degree[i] = out_degree[i]; + abc->clu_n_seq[cons_i] = get_read_cnt(clu_read_ids[cons_i], read_ids_n); + set_clu_read_ids(abc, clu_read_ids, cons_i, abc->n_seq); + kdq_int_t *q = kdq_init_int(); + kdq_push_int(q, sink_id); + // reverse Breadth-First-Search + while ((id = kdq_shift_int(q)) != 0) { + cur_id = *id; + if (cur_id == sink_id) { + max_out_id[cons_i][cur_id] = -1; + score[cur_id] = 0; + } else { + max_id = -1; + if (cur_id == src_id) { + int path_score = -1, path_max_w = -1; + for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { + out_id = abg->node[cur_id].out_id[i]; + out_w = get_read_ids_clu_weight(abg->node[cur_id].read_ids[i], abg->node[cur_id].read_ids_n, clu_read_ids[cons_i]); + out_w = abg->node[cur_id].out_weight[i]; + if (out_w > path_max_w || (out_w == path_max_w && score[out_id] > path_score)) { + max_id = out_id; + path_score = score[out_id]; + path_max_w = out_w; + } + } + max_out_id[cons_i][cur_id] = max_id; + kdq_destroy_int(q); + break; + } else { + max_w = INT32_MIN; + for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) { + out_id = abg->node[cur_id].out_id[i]; + out_w = get_read_ids_clu_weight(abg->node[cur_id].read_ids[i], abg->node[cur_id].read_ids_n, clu_read_ids[cons_i]); + if (max_w < out_w) { + max_w = out_w; + max_id = out_id; + } else if (max_w == out_w && score[max_id] <= score[out_id]) { + max_id = out_id; + } + } + score[cur_id] = max_w + score[max_id]; + max_out_id[cons_i][cur_id] = max_id; + } + } + for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) { + in_id = abg->node[cur_id].in_id[i]; + if (--_out_degree[in_id] == 0) + kdq_push_int(q, in_id); + } + } + } + abpoa_set_hb_cons(abg, max_out_id, n_clu, clu_read_ids, src_id, sink_id, abc); + + free(score); free(_out_degree); + for (i = 0; i < n_clu; ++i) free(max_out_id[i]); free(max_out_id); +} + +void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { + if (out_fp == NULL) return; + int cons_i, j; + abpoa_cons_t *abc = ab->abc; + for (cons_i = 0; cons_i < abc->n_cons; ++cons_i) { + if (abpt->out_fq) fprintf(out_fp, "@Consensus_sequence"); + else fprintf(out_fp, ">Consensus_sequence"); + if (abc->n_cons > 1) fprintf(out_fp, "_%d", cons_i+1); + fprintf(out_fp, "\n"); + for (j = 0; j < abc->cons_len[cons_i]; ++j) { + fprintf(out_fp, "%c", ab_char256_table[abc->cons_base[cons_i][j]]); + } fprintf(out_fp, "\n"); + if (abpt->out_fq) { + fprintf(out_fp, "+Consensus_sequence"); + if (abc->n_cons > 1) fprintf(out_fp, "_%d", cons_i+1); + fprintf(out_fp, "\n"); + for (j = 0; j < abc->cons_len[cons_i]; ++j) { + fprintf(out_fp, "%c", abc->cons_phred_score[cons_i][j]); + } fprintf(out_fp, "\n"); + } + } +} + +abpoa_cons_t *abpoa_allocate_cons(abpoa_cons_t *abc, int n_node, int n_seq, int n_cons) { + int i; + abc->n_cons = n_cons, abc->n_seq = n_seq; + abc->clu_n_seq = (int*)_err_calloc(n_cons, sizeof(int)); + abc->cons_len = (int*)_err_calloc(n_cons, sizeof(int)); + abc->cons_node_ids = (int**)_err_malloc(n_cons * sizeof(int*)); + abc->cons_base = (uint8_t**)_err_malloc(n_cons * sizeof(uint8_t*)); + abc->cons_cov = (int**)_err_malloc(n_cons * sizeof(int*)); + abc->clu_read_ids = (int**)_err_malloc(n_cons * sizeof(int*)); + abc->cons_phred_score = (int**)_err_malloc(n_cons * sizeof(int*)); + for (i = 0; i < n_cons; ++i) { + abc->cons_node_ids[i] = (int*)_err_malloc(n_node * sizeof(int)); + abc->cons_base[i] = (uint8_t*)_err_malloc(n_node * sizeof(uint8_t)); + abc->cons_cov[i] = (int*)_err_malloc(n_node * sizeof(int)); + abc->clu_read_ids[i] = (int*)_err_malloc(n_seq * sizeof(int)); + abc->cons_phred_score[i] = (int*)_err_malloc(n_node * sizeof(int)); + } + return abc; +} + +int abpoa_check_iden_read_ids(int **rc_weight, uint64_t ***read_ids, int m, int read_ids_n, int pos1, int pos2) { + int i, j, k, iden = 1; + uint8_t *map = (uint8_t*)_err_calloc(m, sizeof(uint8_t)); + + for (i = 0; i < m ; ++i) { + if (rc_weight[pos1][i] == 0) continue; + int found_iden = 0; + for (j = 0; j < m; ++j) { // find from 0~m that is identical to i'th read_ids + if (map[j] == 1 || rc_weight[pos1][i] != rc_weight[pos2][j]) continue; + // iden rc_weight + int diff = 0; + for (k = 0; k < read_ids_n; ++k) { + if (read_ids[pos1][i][k] != read_ids[pos2][j][k]) { + diff = 1; break; + } + } + if (diff == 0) { // i is identical to j + found_iden = 1; + map[j] = 1; + break; + } + } + if (found_iden == 0) { // no iden for i'th base + iden = 0; break; + } + } + free(map); + return iden; +} + +// return: 1 if redundent else 0 +int check_redundent_hap(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int n_clu, int new_clu_i, int n_het_pos, int read_id_i, uint64_t read_id) { + int i, j, redundent = 0; + for (i = n_clu-1; i >= 0; --i) { + int iden = 1; + for (j = 0; j < n_het_pos; ++j) { + if (clu_haps[i][j] != clu_haps[new_clu_i][j]) { + iden = 0; break; + } + } + if (iden == 1) { + clu_size[i] += 1; + clu_read_ids[i][read_id_i] |= read_id; + redundent = 1; break; + } + } + if (redundent == 0) { + clu_size[new_clu_i] += 1; + clu_read_ids[new_clu_i][read_id_i] |= read_id; + } + return redundent; +} + +int reassign_hap_by_min_w(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int read_ids_n, int n_clu, int min_w, int n_het_pos) { + int i, j, k, n_reassign = 0; + for (i = 0; i < n_clu; ++i) { + if (clu_size[i] >= min_w || clu_size[i] == 0) continue; + int reassign_i = -1, max_iden_pos = 0; + for (j = 0; j < n_het_pos; ++j) { + int n_iden_pos = 0; + if (clu_size[j] < min_w) continue; + // i < min_w, j >= min_w + for (k = 0; k < n_het_pos; ++k) { + if (clu_haps[i][k] == clu_haps[j][k]) n_iden_pos++; + } + if (n_iden_pos > max_iden_pos) { + max_iden_pos = n_iden_pos; + reassign_i = j; + } + } + if (reassign_i >= 0) { + for (j = 0; j < read_ids_n; ++j) { + clu_read_ids[reassign_i][j] |= clu_read_ids[i][j]; + clu_read_ids[i][j] = 0; + } + clu_size[reassign_i] += clu_size[i]; + clu_size[i] = 0; + n_reassign += 1; + } + } + return n_clu - n_reassign; +} + +int reassign_max_n_hap1(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int read_ids_n, int n_clu, int *clu_poss, int max_n_cons, int n_het_pos) { + int i, j, k, n_reassign = 0; + for (i = 0; i < n_clu; ++i) { + int is_clu = 0; + if (clu_size[i] == 0) continue; + for (j = 0; j < max_n_cons; ++j) { + if (i == clu_poss[j]) { + is_clu = 1; + break; + } + } + if (is_clu) continue; + + int reassign_i = -1, max_iden_pos = 0; + for (j = 0; j < max_n_cons; ++j) { + int clu_i = clu_poss[j], n_iden_pos = 0; + // i < min_w, clu_i >= min_w + for (k = 0; k < n_het_pos; ++k) { + if (clu_haps[i][k] == clu_haps[clu_i][k]) n_iden_pos++; + } + if (n_iden_pos > max_iden_pos) { + max_iden_pos = n_iden_pos; + reassign_i = clu_i; + } + } + if (reassign_i >= 0) { + for (j = 0; j < read_ids_n; ++j) { + clu_read_ids[reassign_i][j] |= clu_read_ids[i][j]; + clu_read_ids[i][j] = 0; + } + clu_size[reassign_i] += clu_size[i]; + clu_size[i] = 0; + n_reassign += 1; + } else { + clu_size[i] = 0; + } + } + return n_clu - n_reassign; +} + +typedef struct { + int size, pos; +} clu_hap_tuple_t; + +// descending order +int tup_cmpfunc (const void * a, const void * b) { + return -(((clu_hap_tuple_t*)a)->size - ((clu_hap_tuple_t*)b)->size); +} + +int reassign_max_n_hap(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int read_ids_n, int n_clu, int min_w, int max_n_cons, int n_het_pos) { + int i; + clu_hap_tuple_t *tup = (clu_hap_tuple_t*)_err_malloc(n_clu * sizeof(clu_hap_tuple_t)); + int *clu_poss = (int*)_err_malloc(max_n_cons * sizeof(int)); + + while (n_clu > max_n_cons) { + for (i = 0; i < n_clu; ++i) { + tup[i].size = clu_size[i]; + tup[i].pos = i; + } + qsort(tup, n_clu, sizeof(clu_hap_tuple_t), tup_cmpfunc); + // new min_w + for (i = 0; i < max_n_cons; ++i) clu_poss[i] = tup[i].pos; + int new_n_clu = reassign_max_n_hap1(clu_haps, clu_size, clu_read_ids, read_ids_n, n_clu, clu_poss, max_n_cons, n_het_pos); + if (new_n_clu == n_clu) { // no further reassignment, but still have more than _max_n_cons_ clus + err_func_printf(__func__, "%d small clusters of sequences remain un-assigned.", n_clu-max_n_cons); + break; + } + n_clu = new_n_clu; + } + free(tup); + return n_clu; +} + +int reassign_hap(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int read_ids_n, int n_clu, int min_w, int max_n_cons, int n_het_pos) { + // assign haplotype with reads < min_w to haplotype with reads >= min_w + int new_n_clu = reassign_hap_by_min_w(clu_haps, clu_size, clu_read_ids, read_ids_n, n_clu, min_w, n_het_pos); + if (new_n_clu > max_n_cons) // keep at most _max_n_cons_ + new_n_clu = reassign_max_n_hap(clu_haps, clu_size, clu_read_ids, read_ids_n, n_clu, min_w, n_het_pos, max_n_cons); + // move max_n_cons to the front + int i, j, pos_i; + for (i = pos_i = 0; i < n_clu; ++i) { + if (clu_size[i] == 0) continue; + if (i == pos_i) { + pos_i++; continue; + } + // move i to pos_i + for (j = 0; j < read_ids_n; ++j) { + clu_read_ids[pos_i][j] = clu_read_ids[i][j]; + clu_size[pos_i] = clu_size[i]; + } + pos_i++; + } + if (pos_i > max_n_cons) err_fatal_core(__func__, "Error: collected %d clusters.", pos_i); + return pos_i; +} + +// collect minimized set of het bases +int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids, int *het_poss, int **rc_weight, int msa_l, int n_seq, int m, int min_w, int read_ids_n) { + int i, j, k, n, rank; + uint64_t b, one = 1, *whole_read_ids = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); + for (i = 0; i < n_seq; ++i) { + j = i / 64; b = i & 0x3f; + whole_read_ids[j] |= (one << b); + } + for (i = 0; i < msa_l; ++i) { + for (j = 0; j < read_ids_n; ++j) { + read_ids[i][m-1][j] = whole_read_ids[j]; + } + } free(whole_read_ids); + + uint8_t *node_map = (uint8_t*)_err_calloc(abg->node_n, sizeof(uint8_t)); + int *n_branch = (int*)_err_calloc(msa_l, sizeof(int)), n_het_pos = 0; + for (i = 2; i < abg->node_n; ++i) { + if (abg->node[i].out_edge_n < 2) continue; + + for (j = 0; j < abg->node[i].out_edge_n; ++j) { + int out_id = abg->node[i].out_id[j]; + if (node_map[out_id]) continue; + else node_map[out_id] = 1; + int sum_out_w = 0; + for (k = 0; k < abg->node[out_id].out_edge_n; ++k) + sum_out_w += abg->node[out_id].out_weight[k]; + if (sum_out_w < min_w || sum_out_w > n_seq-min_w) continue; + rank = abpoa_graph_node_id_to_msa_rank(abg, out_id); + n_branch[rank-1] += 1; + // assign seq + for (n = 0; n < abg->node[out_id].out_edge_n; ++n) { + for (k = 0; k < abg->node[out_id].read_ids_n; ++k) { + b = abg->node[out_id].read_ids[n][k]; + rc_weight[rank-1][abg->node[out_id].base] += get_bit_cnt4(ab_bit_table16, b); + read_ids[rank-1][abg->node[out_id].base][k] |= b; + read_ids[rank-1][m-1][k] ^= b; + } + } + rc_weight[rank-1][m-1] -= rc_weight[rank-1][abg->node[out_id].base]; + } + } + for (rank = 0; rank < msa_l; ++rank) { + if (rc_weight[rank][m-1] >= min_w && rc_weight[rank][m-1] <= n_seq-min_w) n_branch[rank]++; + if (n_branch[rank] > 1) { + // filter out identical read_ids + int iden = 0; + for (i = n_het_pos-1; i >= 0; i--) { + int het_pos = het_poss[i]; + // remove het bases that share the identical read groups + iden = abpoa_check_iden_read_ids(rc_weight, read_ids, m, read_ids_n, rank, het_pos); + if (iden == 1) break; + } + if (iden == 1) continue; + + het_poss[n_het_pos++] = rank; +#ifdef __DEBUG__ + fprintf(stderr, "%d\t", rank); + for (j = 0; j < m; ++j) { + fprintf(stderr, "%c: %d\t", "ACGT-"[j], rc_weight[rank][j]); + } fprintf(stderr, "\n"); +#endif + } + } + free(n_branch); free(node_map); + return n_het_pos; +} + +int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***read_ids, int read_ids_n, int n_seq, int m, int min_w, int max_n_cons, uint64_t ***clu_read_ids, int *_m_clu) { + if (n_het_pos == 0) return 1; + int i, j, k, n_clu = 0, m_clu = 2; + int **clu_haps = (int**)_err_malloc(2 * sizeof(int*)); + int *clu_size = (int*)_err_calloc(2, sizeof(int)); + *clu_read_ids = (uint64_t**)_err_malloc(2 * sizeof(uint64_t**)); + for (i = 0; i < 2; ++i) { + clu_haps[i] = (int*)_err_calloc(n_het_pos, sizeof(int)); + (*clu_read_ids)[i] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); + } + + for (i = 0; i < n_seq; ++i) { // collect haplotype for each sequence + int read_id_i = i / 64; uint64_t read_id = 1ULL << (i & 0x3f); + for (j = 0; j < n_het_pos; ++j) { + int het_pos = het_poss[j]; + for (k = 0; k < m; ++k) { + if (read_ids[het_pos][k][read_id_i] & read_id) { + clu_haps[n_clu][j] = k; break; + } + } + } + if (check_redundent_hap(clu_haps, clu_size, *clu_read_ids, n_clu, n_clu, n_het_pos, read_id_i, read_id) == 0) { + if (++n_clu == m_clu) { + m_clu <<= 1; + clu_haps = (int**)_err_realloc(clu_haps, m_clu * sizeof(int*)); + clu_size = (int*)_err_realloc(clu_size, m_clu * sizeof(int)); + (*clu_read_ids) = (uint64_t**)_err_realloc(*clu_read_ids, m_clu * sizeof(uint64_t**)); + for (j = n_clu; j < m_clu; ++j) { + clu_haps[j] = (int*)_err_calloc(n_het_pos, sizeof(int)); + clu_size[j] = 0; + (*clu_read_ids)[j] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); // mem may lost + } + } + } + } + if (n_clu < 2) err_fatal(__func__, "# haplotypes: %d\n", n_clu); +#ifdef __DEBUG__ + fprintf(stderr, "n_clu: %d\n", n_clu); + for (i = 0; i < n_clu; ++i) { + for (j = 0; j < n_het_pos; ++j) { + fprintf(stderr, "%d\t", clu_haps[i][j]); + } + fprintf(stderr, "\tsize: %d\n", clu_size[i]); + } +#endif + + // assign haplotype with reads < min_w to haplotype with reads >= min_w + // keep at most _max_n_cons_ haps and read ids, weight need to >= min_w + n_clu = reassign_hap(clu_haps, clu_size, *clu_read_ids, read_ids_n, n_clu, min_w, max_n_cons, n_het_pos); +#ifdef __DEBUG__ + fprintf(stderr, "After re-assign: n_clu: %d\n", n_clu); + for (i = 0; i < n_clu; ++i) { + fprintf(stderr, "%d:\tsize: %d\n", i, clu_size[i]); + } +#endif + for (i = 0; i < m_clu; ++i) free(clu_haps[i]); free(clu_haps); free(clu_size); + *_m_clu = m_clu; + return n_clu; +} + +// cluster reads into _n_clu_ groups based on heterogeneous bases +int abpoa_multip_read_clu(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq, int m, int max_n_cons, double min_freq, uint64_t ***clu_read_ids, int *_m_clu) { + abpoa_set_msa_rank(abg, src_id, sink_id); + int i, j, n_clu, m_clu, read_ids_n = (n_seq-1)/64+1; + int msa_l = abg->node_id_to_msa_rank[sink_id]-1, min_w = MAX_OF_TWO(1, n_seq * min_freq); + + // read_ids: support reads for each base (A/C/G/T) at each position + uint64_t ***read_ids = (uint64_t***)_err_malloc(sizeof(uint64_t**) * msa_l); + for (i = 0; i < msa_l; ++i) { + read_ids[i] = (uint64_t**)_err_malloc(sizeof(uint64_t*) * m); + for (j = 0; j < m; ++j) read_ids[i][j] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); + } + + // is rc_weight necessary? + int **rc_weight = (int**)_err_malloc(msa_l * sizeof(int*)); + for (i = 0; i < msa_l; ++i) { + rc_weight[i] = (int*)_err_calloc(m, sizeof(int)); // ACGT + rc_weight[i][m-1] = n_seq; + } + // find min set of het nodes + int *het_poss = (int*)_err_calloc(msa_l, sizeof(int)); + int n_het_pos = abpoa_set_het_row_column_ids_weight(abg, read_ids, het_poss, rc_weight, msa_l, n_seq, m, min_w, read_ids_n); + + if (n_het_pos < 1) n_clu = 1; + // collect at most _max_n_cons_ haplotypes and corresponding read ids + else n_clu = abpoa_collect_clu_hap_read_ids(het_poss, n_het_pos, read_ids, read_ids_n, n_seq, m, min_w, max_n_cons, clu_read_ids, &m_clu); + + for (i = 0; i < msa_l; ++i) { + for (j = 0; j < m; ++j) free(read_ids[i][j]); + free(read_ids[i]); free(rc_weight[i]); + } free(read_ids); free(rc_weight); free(het_poss); + + *_m_clu = m_clu; + return n_clu; +} + +// should always do topological sort first, then generate consensus +void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt) { + if (ab->abg->is_called_cons == 1) return; + abpoa_graph_t *abg = ab->abg; + if (abg->node_n <= 2) return; + int i, *out_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); + for (i = 0; i < abg->node_n; ++i) { + out_degree[i] = abg->node[i].out_edge_n; + } + + int n_clu, m_clu, n_seq = ab->abs->n_seq; uint64_t **clu_read_ids; + int read_ids_n = (n_seq-1)/64+1; + + if (abpt->max_n_cons > 1) n_clu = abpoa_multip_read_clu(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, n_seq, abpt->m, abpt->max_n_cons, abpt->min_freq, &clu_read_ids, &m_clu); + else n_clu = 1; + + abpoa_cons_t *abc = ab->abc; + abpoa_allocate_cons(abc, abg->node_n, ab->abs->n_seq, n_clu); + if (n_clu > 1) { + abpoa_multip_heaviest_bundling(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, out_degree, n_clu, read_ids_n, clu_read_ids, abc); + for (i = 0; i < m_clu; ++i) free(clu_read_ids[i]); free(clu_read_ids); + } else { + abpoa_heaviest_bundling(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, out_degree, abc); + } + abg->is_called_cons = 1; free(out_degree); +} \ No newline at end of file diff --git a/src/abpoa_output.h b/src/abpoa_output.h new file mode 100644 index 0000000..9952c56 --- /dev/null +++ b/src/abpoa_output.h @@ -0,0 +1,15 @@ +#ifndef ABPOA_OUTPUT_H +#define ABPOA_OUTPUT_H + +#ifdef __cplusplus +extern "C" { +#endif + +void set_65536_table(void); +void set_bit_table16(void); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/abpoa_plot.c b/src/abpoa_plot.c index cf52be5..d35fbcf 100644 --- a/src/abpoa_plot.c +++ b/src/abpoa_plot.c @@ -30,7 +30,7 @@ extern char ab_nt256_table[256]; // base (index, rank, node_id) // A (1, 1, 2) A: base 1: index 1: rank 2: node_id -int abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt) { +void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt) { char PROG[20] = "abpoa"; int font_size=24; abpoa_graph_t *abg = ab->abg; @@ -118,5 +118,4 @@ int abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt) { sprintf(cmd, "dot %s -T%s > %s", dot_fn, type+1, abpt->out_pog); free(dot_fn); if (system(cmd) != 0) err_fatal(__func__, "Fail to plot %s DAG.", PROG); - return 0; } diff --git a/sub_example.c b/sub_example.c index 5ffec75..ace05c7 100644 --- a/sub_example.c +++ b/sub_example.c @@ -1,7 +1,8 @@ /* sub_example.c libabpoa usage example To compile: - gcc -O2 sub_example.c -I ./include -L ./lib -labpoa -lz -o sub_example - Or: gcc -O2 sub_example.c -I ./include ./lib/libabpoa.a -lz -o sub_example +gcc -g sub_example.c -I ./include -L ./lib -labpoa -lz -lm -o sub_example +or: +gcc -g sub_example.c -I ./include ./lib/libabpoa.a -lz -lm -o sub_example */ #include #include @@ -115,15 +116,8 @@ int main(void) { abpoa_add_subgraph_alignment(ab, abpt, exc_beg, exc_end, bseqs[i], seq_lens[i], NULL, res, i, n_seqs, 0); if (res.n_cigar) free(res.graph_cigar); } - if (abpt->out_cons) { - abpoa_generate_consensus(ab, abpt, stdout, NULL, NULL, NULL, NULL); - if (ab->abg->is_called_cons == 0) - fprintf(stderr, "Warning: no consensus sequence generated.\n"); - } - if (abpt->out_msa) { - abpoa_generate_rc_msa(ab, abpt, stdout, NULL, NULL); - } - // abpoa_reset_graph(ab, abpt, seq_lens[0]); // reset graph before re-use + + abpoa_output(ab, abpt, stdout); /* generate DOT partial order graph plot */ abpt->out_pog = strdup("sub_example.png"); // dump parital order graph to file diff --git a/test_data/heter.fa b/test_data/heter.fa new file mode 100644 index 0000000..fc29729 --- /dev/null +++ b/test_data/heter.fa @@ -0,0 +1,30 @@ +>m64062_200517_230654/46596725/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200517_230654/122620624/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200604_115437/178193244/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200517_230654/73204047/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200604_115437/157813105/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200604_115437/141952744/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCAGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCCTCCACCAACATCCCCACCATCCCCACCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200604_115437/19531191/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTAACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACACCATTCTCACCATCTCCACCAACATCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCAATCCCCACCATCC +>m64062_200604_115437/120652681/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200604_115437/28773203/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200604_115437/75628767/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTCCATCCATTCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCACCACCATTTCCACCATCCCACCATCATCCCCACCACCATCCCCAGTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200517_230654/180226763/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200517_230654/29755012/ccs +CCATTCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200517_230654/154468686/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATGCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCACCACCATCCTCACTACCATCCCACCACCATTCCACCATTCCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCACCACCATCTCCATTACCATCCCCACCACCATCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCACCCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATC +>m64062_200604_115437/146211230/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC +>m64062_200517_230654/85983296/ccs +CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC