Skip to content

Commit

Permalink
experimenting with a partial order alignment
Browse files Browse the repository at this point in the history
It's slower than wfa but more acccurate, less accurate than mafft but
faster
  • Loading branch information
ACEnglish committed Jan 6, 2024
1 parent 0791e3f commit d79bae2
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 8 deletions.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def _get_repo_hash():
"numpy>=1.23.3",
"pytabix>=0.1",
"bwapy>=0.1.4",
"pandas>=1.4.4"
"pandas>=1.4.4",
"pyabpoa>=1.4.3",
],
)
2 changes: 0 additions & 2 deletions truvari/annotations/gccontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ def edit_header(my_vcf):
"""
Add INFO for new fields to vcf
"""
# Update header
# Edit Header
header = my_vcf.header.copy()
header.add_line(('##INFO=<ID=GCPCT,Number=1,Type=Integer,'
'Description="GC Percent of the reference call range or alt sequence (whichever is longer)">'))
Expand Down
2 changes: 1 addition & 1 deletion truvari/comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ def entry_to_key(entry, prefix="", bounds=False):
VCF when consolidating multiple files' calls. If bounds: call entry_boundaries for start/stop.
.. warning::
If a caller redundantly calls a variant exactly the same. It will not have a unique key
If a caller redundantly calls a variant exactly the same it will not have a unique key
:param `entry`: entry to stringify
:type `entry`: :class:`pysam.VariantRecord`
Expand Down
17 changes: 14 additions & 3 deletions truvari/phab.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from collections import defaultdict

import pysam
import pyabpoa
from pysam import samtools
from intervaltree import IntervalTree
from pywfa.align import WavefrontAligner # pylint: disable=no-name-in-module
Expand Down Expand Up @@ -269,13 +270,24 @@ def mafft_to_vars(seq_bytes, params=DEFAULT_MAFFT_PARAM):
fasta[name] = sequence.decode()
return truvari.msa2vcf(fasta)

def poa_to_vars(seq_bytes):
"""
Run partial order alignment to create msa
"""
fasta = {k: v.decode() for k, v in fasta_reader(seq_bytes.decode(), False)}
aligner = pyabpoa.msa_aligner()
aln_result = aligner.msa(fasta.values(), False, True)
return truvari.msa2vcf(dict(zip(fasta.keys(), aln_result.msa_seq)))


def harmonize_variants(harm_jobs, mafft_params, base_vcf, samp_names, output_fn, threads, method="mafft"):
"""
Parallel processing of variants to harmonize. Writes to output
"""
if method == "mafft":
align_method = partial(mafft_to_vars, params=mafft_params)
if method == "poa":
align_method = poa_to_vars
else:
align_method = wfa_to_vars

Expand Down Expand Up @@ -334,7 +346,6 @@ def phab(var_regions, base_vcf, ref_fn, output_fn, bSamples=None, buffer=100,

logging.info("Extracting haplotypes")
ref_haps_fn = extract_reference(region_fn, ref_fn)
# Haplotype jobs is now simplier (1 and 2 are made together)
hap_jobs, samp_names = make_haplotype_jobs(base_vcf, bSamples,
comp_vcf, cSamples,
prefix_comp)
Expand Down Expand Up @@ -374,8 +385,8 @@ def parse_args(args):
help="Subset of samples to MSA from comp-VCF")
parser.add_argument("-m", "--mafft-params", type=str, default=DEFAULT_MAFFT_PARAM,
help="Parameters for mafft, wrap in a single quote (%(default)s)")
parser.add_argument("--align", type=str, choices=["mafft", "wfa"], default="mafft",
help="Alignment method accurate (mafft) or fast (wfa) (%(default)s)")
parser.add_argument("--align", type=str, choices=["mafft", "wfa", "poa"], default="mafft",
help="Alignment method accurate (mafft), fast (wfa), medium (poa) (%(default)s)")
parser.add_argument("-t", "--threads", type=int, default=1,
help="Number of threads (%(default)s)")
parser.add_argument("--debug", action="store_true",
Expand Down
2 changes: 1 addition & 1 deletion truvari/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def parse_args(args):
"those in analyzed regions"))
parser.add_argument("-t", "--threads", default=4, type=int,
help="Number of threads to use (%(default)s)")
parser.add_argument("-a", "--align", type=str, choices=["mafft", "wfa"], default="mafft",
parser.add_argument("-a", "--align", type=str, choices=["mafft", "wfa", "poa"], default="mafft",
help="Alignment method for phab (%(default)s)")
parser.add_argument("-m", "--mafft-params", type=str, default=DEFAULT_MAFFT_PARAM,
help="Parameters for mafft, wrap in a single quote (%(default)s)")
Expand Down

0 comments on commit d79bae2

Please sign in to comment.