Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: distinguish soft and hard AF filter #9

Merged
merged 15 commits into from
Feb 7, 2024
34 changes: 26 additions & 8 deletions .github/workflows/juno_mapping_singularity.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

name: Singularity e2e test

on: [pull_request]
on: workflow_dispatch

env:
KRAKEN_DEFAULT_DB: /home/runner/kraken-database
KRAKEN_DEFAULT_DB: /mnt/kraken-database

jobs:
singularity_end_to_end:
Expand All @@ -18,10 +18,18 @@ jobs:
name: Singularity Juno_mapping pipeline ${{ matrix.config.os }}

steps:
- uses: actions/checkout@v2
- name: check space 1
shell: bash -l {0}
run: |
sudo chmod -R 777 /mnt
echo "Checking space: ."
df -h .
echo "Checking space: /mnt"
df -h /mnt
- uses: actions/checkout@v4
- uses: eWaterCycle/setup-singularity@v7
with:
singularity-version: 3.8.7
singularity-version: 3.8.3
- name: Install Conda environment with Micromamba
uses: mamba-org/setup-micromamba@v1
with:
Expand All @@ -42,10 +50,11 @@ jobs:
curl -k https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20220908.tar.gz > k2_viral_20220908.tar.gz
tar zxvf k2_viral_20220908.tar.gz
ls -lh
rm k2_viral_20220908.tar.gz
- name: Conda list
shell: bash -l {0}
run: conda list
- name: check space
- name: check space 2
shell: bash -l {0}
run: |
which singularity
Expand All @@ -56,7 +65,10 @@ jobs:
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
fi
echo "Checking space: ."
df -h .
echo "Checking space: /mnt"
df -h /mnt
which singularity
singularity --version
- name: Test juno_mapping pipeline using singularity.
Expand All @@ -65,10 +77,16 @@ jobs:
- name: Test the juno_mapping wrapper
shell: bash -l {0}
run: pytest -v tests/test_juno_mapping.py
- name: check space 2
- name: check space 3
if: always()
shell: bash -l {0}
run: |
echo "Checking space: ."
df -h .


echo "Checking space: /mnt"
df -h /mnt
du -sh /mnt/kraken-database
echo "Checking space: $GITHUB_WORKSPACE"
du -sh $GITHUB_WORKSPACE
echo "Checking space: /home/runner"
du -sh /home/runner/*
28 changes: 23 additions & 5 deletions juno_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,20 @@ def _add_args_to_parser(self) -> None:
help="Minimum length for fastq reads to be kept after trimming.",
)
self.add_argument(
"-maf",
"--minimum-allele-frequency",
"-smaf",
"--soft-filter-minimum-allele-frequency",
type=check_number_within_range(minimum=0, maximum=1),
metavar="FLOAT",
default=0.8,
help="Minimum allele frequency to filter variants on.",
help="Minimum allele frequency to soft filter variants on. Soft filtering is done by adding a filter tag to the VCF file.",
)
self.add_argument(
"-hmaf",
"--hard-filter-minimum-allele-frequency",
type=check_number_within_range(minimum=0, maximum=1),
metavar="FLOAT",
default=0.2,
help="Minimum allele frequency to hard filter variants on. Hard filtering is done by removing variants from the VCF file.",
)
self.add_argument(
"--min-reads-per-strand",
Expand All @@ -176,7 +184,12 @@ def _parse_args(self) -> argparse.Namespace:
self.time_limit: int = args.time_limit
self.species: str = args.species
self.minimum_depth_variant: int = args.minimum_depth_variant
self.minimum_allele_frequency: float = args.minimum_allele_frequency
self.hard_filter_minimum_allele_frequency: float = (
args.hard_filter_minimum_allele_frequency
)
self.soft_filter_minimum_allele_frequency: float = (
args.soft_filter_minimum_allele_frequency
)
self.min_reads_per_strand: int = args.min_reads_per_strand

return args
Expand Down Expand Up @@ -258,7 +271,12 @@ def setup(self) -> None:
"use_singularity": str(self.snakemake_args["use_singularity"]),
"species": str(self.species),
"minimum_depth": int(self.minimum_depth_variant),
"minimum_allele_frequency": float(self.minimum_allele_frequency),
"soft_filter_minimum_allele_frequency": float(
self.soft_filter_minimum_allele_frequency
),
"hard_filter_minimum_allele_frequency": float(
self.hard_filter_minimum_allele_frequency
),
"min_reads_per_strand": int(self.min_reads_per_strand),
}

Expand Down
6 changes: 3 additions & 3 deletions tests/test_juno_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def test_junomapping_dryrun_user_params(self) -> None:
expected_user_param_values = {
"species": "mycobacterium_tuberculosis",
"reference": "reference.fasta",
"minimum_allele_frequency": 0.8,
"soft_filter_minimum_allele_frequency": 0.8,
"minimum_depth": 10,
"disable_mask": "False",
"mask_bed": "files/RLC_Farhat.bed",
Expand All @@ -147,7 +147,7 @@ def test_junomapping_dryrun_changed_user_params(self) -> None:
"--reference",
"reference.fasta",
"--disable-mask",
"-maf",
"-smaf",
"0.5",
"-md",
"20",
Expand All @@ -159,7 +159,7 @@ def test_junomapping_dryrun_changed_user_params(self) -> None:
expected_user_param_values = {
"species": "mycobacterium_tuberculosis",
"reference": "reference.fasta",
"minimum_allele_frequency": 0.5,
"soft_filter_minimum_allele_frequency": 0.5,
"minimum_depth": 20,
"disable_mask": "True",
"mask_bed": "None",
Expand Down
6 changes: 3 additions & 3 deletions tests/test_pipeline_singularity.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class TestJunoMappingPipelineSingularity(unittest.TestCase):
46679: {"REF": "C", "ALT": "G"},
}

output_dir = Path("pipeline_test_output_singularity")
output_dir = Path("/mnt/pipeline_test_output_singularity")
input_dir = "tests"

@classmethod
Expand All @@ -61,7 +61,7 @@ def test_010_junomapping_run_in_singularity(self) -> None:
kraken_db = Path.home().joinpath("kraken-database")
assert kraken_db.exists(), "Kraken database not found"
else:
kraken_db = Path("/home/runner/kraken-database")
kraken_db = Path("/mnt/kraken-database")

pipeline = JunoMapping(
argv=[
Expand All @@ -81,7 +81,7 @@ def test_010_junomapping_run_in_singularity(self) -> None:
"--db-dir",
str(kraken_db),
"--prefix",
"/home/runner/sing_containers",
"/mnt/sing_containers",
]
)
pipeline.run()
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/qc_variant_calling.smk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
rule get_filter_status:
input:
vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
vcf=OUT + "/variants/{sample}.vcf",
output:
tsv=OUT + "/qc_variant_calling/get_filter_status/{sample}.tsv",
message:
Expand Down
54 changes: 41 additions & 13 deletions workflow/rules/variant_filtering.smk
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,49 @@ gatk FilterMutectCalls -V {input.vcf} \
"""


rule filter_af:
rule hard_filter_af:
input:
vcf=OUT + "/variants_raw/FMC/{sample}.vcf",
stats=OUT + "/variants_raw/raw/{sample}.vcf.stats",
ref=OUT + "/reference/reference.fasta",
output:
vcf=OUT + "/variants_raw/FMC_af/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard/{sample}.vcf",
message:
"Hard filtering variants with very low allele frequency for {wildcards.sample}"
container:
"docker://staphb/bcftools:1.16"
conda:
"../envs/bcftools.yaml"
params:
min_af=config["hard_filter_minimum_allele_frequency"],
log:
OUT + "/log/hard_filter_af/{sample}.log",
threads: config["threads"]["filter_variants"]
resources:
mem_gb=config["mem_gb"]["filter_variants"],
shell:
"""
bcftools filter \
--exclude \"FORMAT/AF < {params.min_af}\" \
{input.vcf} \
1>{output.vcf} \
2>{log}
"""


rule soft_filter_af:
input:
vcf=OUT + "/variants_raw/FMC_afhard/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
output:
vcf=OUT + "/variants_raw/FMC_afhard_afsoft/{sample}.vcf",
message:
"Marking minority variants for {wildcards.sample}"
container:
"docker://staphb/bcftools:1.16"
conda:
"../envs/bcftools.yaml"
params:
min_af=config["minimum_allele_frequency"],
min_af=config["soft_filter_minimum_allele_frequency"],
log:
OUT + "/log/filter_af/{sample}.log",
threads: config["threads"]["filter_variants"]
Expand All @@ -62,10 +90,10 @@ bcftools filter \

rule filter_depth:
input:
vcf=OUT + "/variants_raw/FMC_af/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard_afsoft/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
output:
vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf",
container:
"docker://staphb/bcftools:1.16"
conda:
Expand Down Expand Up @@ -103,11 +131,11 @@ if config["disable_mask"] == "True":

rule filter_mask:
input:
vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
mask=OUT + "/variants_raw/no_mask.bed",
output:
vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
vcf=OUT + "/variants/{sample}.vcf",
log:
OUT + "/log/filter_depth/{sample}.log",
threads: config["threads"]["filter_variants"]
Expand Down Expand Up @@ -138,11 +166,11 @@ else:

rule filter_mask:
input:
vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
mask=OUT + "/variants_raw/mask.bed",
output:
vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
vcf=OUT + "/variants/{sample}.vcf",
container:
"docker://staphb/bcftools:1.16"
conda:
Expand All @@ -165,10 +193,10 @@ bcftools filter \

rule remove_low_confidence_variants:
input:
vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
vcf=OUT + "/variants/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
output:
vcf=OUT + "/variants/{sample}.vcf",
vcf=OUT + "/variants_clean/{sample}.vcf",
message:
"Remove low confidence variants for {wildcards.sample}"
conda:
Expand All @@ -193,7 +221,7 @@ gatk SelectVariants \
rule select_snps:
input:
ref=OUT + "/reference/reference.fasta",
vcf=OUT + "/variants/{sample}.vcf",
vcf=OUT + "/variants_clean/{sample}.vcf",
output:
vcf=OUT + "/variants_snps_only/{sample}.snps.vcf",
container:
Expand Down
Loading