Merge pull request #9 from RIVM-bioinformatics/soft_filter
fix: distinguish soft and hard AF filter
boasvdp authored Feb 7, 2024
2 parents 799156a + e563cab commit 2e6f620
Showing 6 changed files with 97 additions and 33 deletions.
34 changes: 26 additions & 8 deletions .github/workflows/juno_mapping_singularity.yaml
@@ -2,10 +2,10 @@

name: Singularity e2e test

on: [pull_request]
on: workflow_dispatch

env:
KRAKEN_DEFAULT_DB: /home/runner/kraken-database
KRAKEN_DEFAULT_DB: /mnt/kraken-database

jobs:
singularity_end_to_end:
@@ -18,10 +18,18 @@ jobs:
name: Singularity Juno_mapping pipeline ${{ matrix.config.os }}

steps:
- uses: actions/checkout@v2
- name: check space 1
shell: bash -l {0}
run: |
sudo chmod -R 777 /mnt
echo "Checking space: ."
df -h .
echo "Checking space: /mnt"
df -h /mnt
- uses: actions/checkout@v4
- uses: eWaterCycle/setup-singularity@v7
with:
singularity-version: 3.8.7
singularity-version: 3.8.3
- name: Install Conda environment with Micromamba
uses: mamba-org/setup-micromamba@v1
with:
@@ -42,10 +50,11 @@
curl -k https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20220908.tar.gz > k2_viral_20220908.tar.gz
tar zxvf k2_viral_20220908.tar.gz
ls -lh
rm k2_viral_20220908.tar.gz
- name: Conda list
shell: bash -l {0}
run: conda list
- name: check space
- name: check space 2
shell: bash -l {0}
run: |
which singularity
@@ -56,7 +65,10 @@
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
fi
echo "Checking space: ."
df -h .
echo "Checking space: /mnt"
df -h /mnt
which singularity
singularity --version
- name: Test juno_mapping pipeline using singularity.
@@ -65,10 +77,16 @@
- name: Test the juno_mapping wrapper
shell: bash -l {0}
run: pytest -v tests/test_juno_mapping.py
- name: check space 2
- name: check space 3
if: always()
shell: bash -l {0}
run: |
echo "Checking space: ."
df -h .
echo "Checking space: /mnt"
df -h /mnt
du -sh /mnt/kraken-database
echo "Checking space: $GITHUB_WORKSPACE"
du -sh $GITHUB_WORKSPACE
echo "Checking space: /home/runner"
du -sh /home/runner/*
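
Switching on: from [pull_request] to workflow_dispatch means this Singularity e2e test no longer runs on every pull request and has to be started by hand. A minimal sketch of how that could be done with the GitHub CLI; the branch name main is an assumption, not part of this change:

# Start the Singularity e2e test manually (the workflow now uses workflow_dispatch).
gh workflow run juno_mapping_singularity.yaml --ref main

# Check the status of the most recent runs of this workflow.
gh run list --workflow=juno_mapping_singularity.yaml --limit 5
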
28 changes: 23 additions & 5 deletions juno_mapping.py
@@ -145,12 +145,20 @@ def _add_args_to_parser(self) -> None:
help="Minimum length for fastq reads to be kept after trimming.",
)
self.add_argument(
"-maf",
"--minimum-allele-frequency",
"-smaf",
"--soft-filter-minimum-allele-frequency",
type=check_number_within_range(minimum=0, maximum=1),
metavar="FLOAT",
default=0.8,
help="Minimum allele frequency to filter variants on.",
help="Minimum allele frequency to soft filter variants on. Soft filtering is done by adding a filter tag to the VCF file.",
)
self.add_argument(
"-hmaf",
"--hard-filter-minimum-allele-frequency",
type=check_number_within_range(minimum=0, maximum=1),
metavar="FLOAT",
default=0.2,
help="Minimum allele frequency to hard filter variants on. Hard filtering is done by removing variants from the VCF file.",
)
self.add_argument(
"--min-reads-per-strand",
@@ -176,7 +184,12 @@ def _parse_args(self) -> argparse.Namespace:
self.time_limit: int = args.time_limit
self.species: str = args.species
self.minimum_depth_variant: int = args.minimum_depth_variant
self.minimum_allele_frequency: float = args.minimum_allele_frequency
self.hard_filter_minimum_allele_frequency: float = (
args.hard_filter_minimum_allele_frequency
)
self.soft_filter_minimum_allele_frequency: float = (
args.soft_filter_minimum_allele_frequency
)
self.min_reads_per_strand: int = args.min_reads_per_strand

return args
@@ -258,7 +271,12 @@ def setup(self) -> None:
"use_singularity": str(self.snakemake_args["use_singularity"]),
"species": str(self.species),
"minimum_depth": int(self.minimum_depth_variant),
"minimum_allele_frequency": float(self.minimum_allele_frequency),
"soft_filter_minimum_allele_frequency": float(
self.soft_filter_minimum_allele_frequency
),
"hard_filter_minimum_allele_frequency": float(
self.hard_filter_minimum_allele_frequency
),
"min_reads_per_strand": int(self.min_reads_per_strand),
}

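
With the single -maf flag split into -smaf and -hmaf, minority variants below the soft threshold are only tagged in the VCF, while variants below the hard threshold are dropped. A hypothetical invocation that spells out the new defaults; the -i/-o input and output flags are assumptions, only -smaf, -hmaf and -md come from this change:

# Hypothetical run: variants with AF < 0.8 get a FILTER tag (soft filter),
# variants with AF < 0.2 are removed from the VCF entirely (hard filter).
# The -i/-o flags are assumptions; -smaf, -hmaf and -md appear in this diff.
python juno_mapping.py \
    -i tests \
    -o output_dir \
    -smaf 0.8 \
    -hmaf 0.2 \
    -md 10
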
6 changes: 3 additions & 3 deletions tests/test_juno_mapping.py
@@ -122,7 +122,7 @@ def test_junomapping_dryrun_user_params(self) -> None:
expected_user_param_values = {
"species": "mycobacterium_tuberculosis",
"reference": "reference.fasta",
"minimum_allele_frequency": 0.8,
"soft_filter_minimum_allele_frequency": 0.8,
"minimum_depth": 10,
"disable_mask": "False",
"mask_bed": "files/RLC_Farhat.bed",
@@ -147,7 +147,7 @@ def test_junomapping_dryrun_changed_user_params(self) -> None:
"--reference",
"reference.fasta",
"--disable-mask",
"-maf",
"-smaf",
"0.5",
"-md",
"20",
@@ -159,7 +159,7 @@ def test_junomapping_dryrun_changed_user_params(self) -> None:
expected_user_param_values = {
"species": "mycobacterium_tuberculosis",
"reference": "reference.fasta",
"minimum_allele_frequency": 0.5,
"soft_filter_minimum_allele_frequency": 0.5,
"minimum_depth": 20,
"disable_mask": "True",
"mask_bed": "None",
6 changes: 3 additions & 3 deletions tests/test_pipeline_singularity.py
@@ -43,7 +43,7 @@ class TestJunoMappingPipelineSingularity(unittest.TestCase):
46679: {"REF": "C", "ALT": "G"},
}

output_dir = Path("pipeline_test_output_singularity")
output_dir = Path("/mnt/pipeline_test_output_singularity")
input_dir = "tests"

@classmethod
Expand All @@ -61,7 +61,7 @@ def test_010_junomapping_run_in_singularity(self) -> None:
kraken_db = Path.home().joinpath("kraken-database")
assert kraken_db.exists(), "Kraken database not found"
else:
kraken_db = Path("/home/runner/kraken-database")
kraken_db = Path("/mnt/kraken-database")

pipeline = JunoMapping(
argv=[
Expand All @@ -81,7 +81,7 @@ def test_010_junomapping_run_in_singularity(self) -> None:
"--db-dir",
str(kraken_db),
"--prefix",
"/home/runner/sing_containers",
"/mnt/sing_containers",
]
)
pipeline.run()
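
The Singularity test now writes its output, containers and the Kraken database under /mnt on the CI runner, while the other branch of the check still looks for the database in the home directory, presumably for local runs. A sketch of a local run under that assumption, with the pipeline's conda environment active and ~/kraken-database in place:

# Run the Singularity end-to-end test locally; outside CI the test presumably
# falls back to ~/kraken-database and asserts that it exists.
pytest -v tests/test_pipeline_singularity.py
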
2 changes: 1 addition & 1 deletion workflow/rules/qc_variant_calling.smk
@@ -1,6 +1,6 @@
rule get_filter_status:
input:
vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
vcf=OUT + "/variants/{sample}.vcf",
output:
tsv=OUT + "/qc_variant_calling/get_filter_status/{sample}.tsv",
message:
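
get_filter_status now reads the final masked VCF from variants/ instead of the old variants_raw/FMC_af_depth_masked/ path, so the soft-filter tags end up in the FILTER column of the file it summarises. A sketch of inspecting those tags by hand; the output directory and sample name are placeholders:

# Tabulate the FILTER status per variant from the final VCF.
# Paths are placeholders for the pipeline's output directory and a sample name.
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%FILTER\n' \
    output_dir/variants/sample1.vcf > sample1_filter_status.tsv
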
54 changes: 41 additions & 13 deletions workflow/rules/variant_filtering.smk
@@ -29,21 +29,49 @@ gatk FilterMutectCalls -V {input.vcf} \
"""


rule filter_af:
rule hard_filter_af:
input:
vcf=OUT + "/variants_raw/FMC/{sample}.vcf",
stats=OUT + "/variants_raw/raw/{sample}.vcf.stats",
ref=OUT + "/reference/reference.fasta",
output:
vcf=OUT + "/variants_raw/FMC_af/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard/{sample}.vcf",
message:
"Hard filtering variants with very low allele frequency for {wildcards.sample}"
container:
"docker://staphb/bcftools:1.16"
conda:
"../envs/bcftools.yaml"
params:
min_af=config["hard_filter_minimum_allele_frequency"],
log:
OUT + "/log/hard_filter_af/{sample}.log",
threads: config["threads"]["filter_variants"]
resources:
mem_gb=config["mem_gb"]["filter_variants"],
shell:
"""
bcftools filter \
--exclude \"FORMAT/AF < {params.min_af}\" \
{input.vcf} \
1>{output.vcf} \
2>{log}
"""


rule soft_filter_af:
input:
vcf=OUT + "/variants_raw/FMC_afhard/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
output:
vcf=OUT + "/variants_raw/FMC_afhard_afsoft/{sample}.vcf",
message:
"Marking minority variants for {wildcards.sample}"
container:
"docker://staphb/bcftools:1.16"
conda:
"../envs/bcftools.yaml"
params:
min_af=config["minimum_allele_frequency"],
min_af=config["soft_filter_minimum_allele_frequency"],
log:
OUT + "/log/filter_af/{sample}.log",
threads: config["threads"]["filter_variants"]
@@ -62,10 +90,10 @@

rule filter_depth:
input:
vcf=OUT + "/variants_raw/FMC_af/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard_afsoft/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
output:
vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf",
container:
"docker://staphb/bcftools:1.16"
conda:
@@ -103,11 +131,11 @@ if config["disable_mask"] == "True":

rule filter_mask:
input:
vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
mask=OUT + "/variants_raw/no_mask.bed",
output:
vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
vcf=OUT + "/variants/{sample}.vcf",
log:
OUT + "/log/filter_depth/{sample}.log",
threads: config["threads"]["filter_variants"]
@@ -138,11 +166,11 @@ else:

rule filter_mask:
input:
vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf",
vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
mask=OUT + "/variants_raw/mask.bed",
output:
vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
vcf=OUT + "/variants/{sample}.vcf",
container:
"docker://staphb/bcftools:1.16"
conda:
@@ -165,10 +193,10 @@

rule remove_low_confidence_variants:
input:
vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
vcf=OUT + "/variants/{sample}.vcf",
ref=OUT + "/reference/reference.fasta",
output:
vcf=OUT + "/variants/{sample}.vcf",
vcf=OUT + "/variants_clean/{sample}.vcf",
message:
"Remove low confidence variants for {wildcards.sample}"
conda:
@@ -193,7 +221,7 @@
rule select_snps:
input:
ref=OUT + "/reference/reference.fasta",
vcf=OUT + "/variants/{sample}.vcf",
vcf=OUT + "/variants_clean/{sample}.vcf",
output:
vcf=OUT + "/variants_snps_only/{sample}.snps.vcf",
container:
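
The shell block of the new hard_filter_af rule is shown above, but the soft_filter_af shell block sits outside the changed hunks. A sketch of the difference between the two bcftools calls, using the defaults from juno_mapping.py; the --soft-filter usage and the filter name low_af are assumptions about how the soft filter is implemented, not taken from this diff:

# Hard filter: records with AF below the hard threshold are removed,
# matching the shell block of rule hard_filter_af above.
bcftools filter \
    --exclude "FORMAT/AF < 0.2" \
    input.vcf \
    1> hard_filtered.vcf

# Soft filter: records below the soft threshold are kept but tagged in the
# FILTER column instead of being removed. The filter name "low_af" and the
# use of --soft-filter/--mode are assumptions, not taken from this diff.
bcftools filter \
    --soft-filter low_af \
    --mode + \
    --exclude "FORMAT/AF < 0.8" \
    input.vcf \
    1> soft_filtered.vcf
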
