diff --git a/.github/workflows/juno_mapping_singularity.yaml b/.github/workflows/juno_mapping_singularity.yaml
index bb8a563..854afb6 100644
--- a/.github/workflows/juno_mapping_singularity.yaml
+++ b/.github/workflows/juno_mapping_singularity.yaml
@@ -2,10 +2,10 @@ name: Singularity e2e test
 
-on: [pull_request]
+on: workflow_dispatch
 
 env:
-  KRAKEN_DEFAULT_DB: /home/runner/kraken-database
+  KRAKEN_DEFAULT_DB: /mnt/kraken-database
 
 jobs:
   singularity_end_to_end:
@@ -18,10 +18,18 @@ jobs:
     name: Singularity Juno_mapping pipeline ${{ matrix.config.os }}
     steps:
-      - uses: actions/checkout@v2
+      - name: check space 1
+        shell: bash -l {0}
+        run: |
+          sudo chmod -R 777 /mnt
+          echo "Checking space: ."
+          df -h .
+          echo "Checking space: /mnt"
+          df -h /mnt
+      - uses: actions/checkout@v4
       - uses: eWaterCycle/setup-singularity@v7
         with:
-          singularity-version: 3.8.7
+          singularity-version: 3.8.3
       - name: Install Conda environment with Micromamba
         uses: mamba-org/setup-micromamba@v1
         with:
@@ -42,10 +50,11 @@ jobs:
           curl -k https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20220908.tar.gz > k2_viral_20220908.tar.gz
           tar zxvf k2_viral_20220908.tar.gz
           ls -lh
+          rm k2_viral_20220908.tar.gz
       - name: Conda list
         shell: bash -l {0}
         run: conda list
-      - name: check space
+      - name: check space 2
         shell: bash -l {0}
         run: |
           which singularity
@@ -56,7 +65,10 @@ jobs:
             sudo rm -rf /opt/ghc
             sudo rm -rf "/usr/local/share/boost"
           fi
+          echo "Checking space: ."
           df -h .
+          echo "Checking space: /mnt"
+          df -h /mnt
           which singularity
           singularity --version
       - name: Test juno_mapping pipeline using singularity.
@@ -65,10 +77,16 @@ jobs:
       - name: Test the juno_mapping wrapper
         shell: bash -l {0}
         run: pytest -v tests/test_juno_mapping.py
-      - name: check space 2
+      - name: check space 3
         if: always()
         shell: bash -l {0}
         run: |
+          echo "Checking space: ."
           df -h .
-
-
\ No newline at end of file
+          echo "Checking space: /mnt"
+          df -h /mnt
+          du -sh /mnt/kraken-database
+          echo "Checking space: $GITHUB_WORKSPACE"
+          du -sh $GITHUB_WORKSPACE
+          echo "Checking space: /home/runner"
+          du -sh /home/runner/*
\ No newline at end of file
diff --git a/juno_mapping.py b/juno_mapping.py
index 7317c93..e3f013a 100644
--- a/juno_mapping.py
+++ b/juno_mapping.py
@@ -145,12 +145,20 @@ def _add_args_to_parser(self) -> None:
             help="Minimum length for fastq reads to be kept after trimming.",
         )
         self.add_argument(
-            "-maf",
-            "--minimum-allele-frequency",
+            "-smaf",
+            "--soft-filter-minimum-allele-frequency",
             type=check_number_within_range(minimum=0, maximum=1),
             metavar="FLOAT",
             default=0.8,
-            help="Minimum allele frequency to filter variants on.",
+            help="Minimum allele frequency to soft filter variants on. Soft filtering is done by adding a filter tag to the VCF file.",
+        )
+        self.add_argument(
+            "-hmaf",
+            "--hard-filter-minimum-allele-frequency",
+            type=check_number_within_range(minimum=0, maximum=1),
+            metavar="FLOAT",
+            default=0.2,
+            help="Minimum allele frequency to hard filter variants on. Hard filtering is done by removing variants from the VCF file.",
         )
         self.add_argument(
             "--min-reads-per-strand",
@@ -176,7 +184,12 @@ def _parse_args(self) -> argparse.Namespace:
         self.time_limit: int = args.time_limit
         self.species: str = args.species
         self.minimum_depth_variant: int = args.minimum_depth_variant
-        self.minimum_allele_frequency: float = args.minimum_allele_frequency
+        self.hard_filter_minimum_allele_frequency: float = (
+            args.hard_filter_minimum_allele_frequency
+        )
+        self.soft_filter_minimum_allele_frequency: float = (
+            args.soft_filter_minimum_allele_frequency
+        )
         self.min_reads_per_strand: int = args.min_reads_per_strand
         return args
 
@@ -258,7 +271,12 @@ def setup(self) -> None:
             "use_singularity": str(self.snakemake_args["use_singularity"]),
             "species": str(self.species),
             "minimum_depth": int(self.minimum_depth_variant),
-            "minimum_allele_frequency": float(self.minimum_allele_frequency),
+            "soft_filter_minimum_allele_frequency": float(
+                self.soft_filter_minimum_allele_frequency
+            ),
+            "hard_filter_minimum_allele_frequency": float(
+                self.hard_filter_minimum_allele_frequency
+            ),
             "min_reads_per_strand": int(self.min_reads_per_strand),
         }
 
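Both new options are validated through the `check_number_within_range(minimum=0, maximum=1)` callable referenced in the hunk above. As a rough sketch of how that argparse `type=` pattern keeps `-smaf`/`-hmaf` inside the 0-1 range (the helper below is a hypothetical stand-in written for illustration, not the actual implementation imported by `juno_mapping.py`):

```python
# Hypothetical stand-in for check_number_within_range, shown only to illustrate
# how the -smaf/-hmaf values are constrained to the 0-1 range at parse time.
import argparse


def check_number_within_range(minimum: float = 0, maximum: float = 1):
    def _check(value: str) -> str:
        number = float(value)
        if number < minimum or number > maximum:
            raise argparse.ArgumentTypeError(
                f"{value} is not a number within the range {minimum}-{maximum}."
            )
        return value  # returned unchanged; casting happens later in setup()

    return _check


parser = argparse.ArgumentParser()
parser.add_argument(
    "-hmaf",
    "--hard-filter-minimum-allele-frequency",
    type=check_number_within_range(minimum=0, maximum=1),
    metavar="FLOAT",
    default=0.2,
)
print(parser.parse_args(["-hmaf", "0.3"]))  # accepted
# parser.parse_args(["-hmaf", "1.5"]) would make argparse exit with a usage error
```

If the real validator also returns the raw string, that would be consistent with `setup()` casting the stored values with `float(...)` before writing them into the Snakemake config.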
diff --git a/tests/test_juno_mapping.py b/tests/test_juno_mapping.py
index 041ccbf..fdaf8a7 100644
--- a/tests/test_juno_mapping.py
+++ b/tests/test_juno_mapping.py
@@ -122,7 +122,7 @@ def test_junomapping_dryrun_user_params(self) -> None:
         expected_user_param_values = {
             "species": "mycobacterium_tuberculosis",
             "reference": "reference.fasta",
-            "minimum_allele_frequency": 0.8,
+            "soft_filter_minimum_allele_frequency": 0.8,
             "minimum_depth": 10,
             "disable_mask": "False",
             "mask_bed": "files/RLC_Farhat.bed",
@@ -147,7 +147,7 @@ def test_junomapping_dryrun_changed_user_params(self) -> None:
                 "--reference",
                 "reference.fasta",
                 "--disable-mask",
-                "-maf",
+                "-smaf",
                 "0.5",
                 "-md",
                 "20",
@@ -159,7 +159,7 @@
         expected_user_param_values = {
             "species": "mycobacterium_tuberculosis",
             "reference": "reference.fasta",
-            "minimum_allele_frequency": 0.5,
+            "soft_filter_minimum_allele_frequency": 0.5,
             "minimum_depth": 20,
             "disable_mask": "True",
             "mask_bed": "None",
diff --git a/tests/test_pipeline_singularity.py b/tests/test_pipeline_singularity.py
index ef0e2e3..92cf9c5 100644
--- a/tests/test_pipeline_singularity.py
+++ b/tests/test_pipeline_singularity.py
@@ -43,7 +43,7 @@ class TestJunoMappingPipelineSingularity(unittest.TestCase):
         46679: {"REF": "C", "ALT": "G"},
     }
 
-    output_dir = Path("pipeline_test_output_singularity")
+    output_dir = Path("/mnt/pipeline_test_output_singularity")
     input_dir = "tests"
 
     @classmethod
@@ -61,7 +61,7 @@ def test_010_junomapping_run_in_singularity(self) -> None:
            kraken_db = Path.home().joinpath("kraken-database")
            assert kraken_db.exists(), "Kraken database not found"
        else:
-           kraken_db = Path("/home/runner/kraken-database")
+           kraken_db = Path("/mnt/kraken-database")
 
        pipeline = JunoMapping(
            argv=[
@@ -81,7 +81,7 @@ def test_010_junomapping_run_in_singularity(self) -> None:
                "--db-dir",
                str(kraken_db),
                "--prefix",
-               "/home/runner/sing_containers",
+               "/mnt/sing_containers",
            ]
        )
        pipeline.run()
diff --git a/workflow/rules/qc_variant_calling.smk b/workflow/rules/qc_variant_calling.smk
index 5c17575..ae18f0c 100644
--- a/workflow/rules/qc_variant_calling.smk
+++ b/workflow/rules/qc_variant_calling.smk
@@ -1,6 +1,6 @@
 rule get_filter_status:
     input:
-        vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf",
+        vcf=OUT + "/variants/{sample}.vcf",
     output:
"/qc_variant_calling/get_filter_status/{sample}.tsv", message: diff --git a/workflow/rules/variant_filtering.smk b/workflow/rules/variant_filtering.smk index af6a84c..e84366a 100644 --- a/workflow/rules/variant_filtering.smk +++ b/workflow/rules/variant_filtering.smk @@ -29,13 +29,41 @@ gatk FilterMutectCalls -V {input.vcf} \ """ -rule filter_af: +rule hard_filter_af: input: vcf=OUT + "/variants_raw/FMC/{sample}.vcf", - stats=OUT + "/variants_raw/raw/{sample}.vcf.stats", ref=OUT + "/reference/reference.fasta", output: - vcf=OUT + "/variants_raw/FMC_af/{sample}.vcf", + vcf=OUT + "/variants_raw/FMC_afhard/{sample}.vcf", + message: + "Hard filtering variants with very low allele frequency for {wildcards.sample}" + container: + "docker://staphb/bcftools:1.16" + conda: + "../envs/bcftools.yaml" + params: + min_af=config["hard_filter_minimum_allele_frequency"], + log: + OUT + "/log/hard_filter_af/{sample}.log", + threads: config["threads"]["filter_variants"] + resources: + mem_gb=config["mem_gb"]["filter_variants"], + shell: + """ +bcftools filter \ +--exclude \"FORMAT/AF < {params.min_af}\" \ +{input.vcf} \ +1>{output.vcf} \ +2>{log} + """ + + +rule soft_filter_af: + input: + vcf=OUT + "/variants_raw/FMC_afhard/{sample}.vcf", + ref=OUT + "/reference/reference.fasta", + output: + vcf=OUT + "/variants_raw/FMC_afhard_afsoft/{sample}.vcf", message: "Marking minority variants for {wildcards.sample}" container: @@ -43,7 +71,7 @@ rule filter_af: conda: "../envs/bcftools.yaml" params: - min_af=config["minimum_allele_frequency"], + min_af=config["soft_filter_minimum_allele_frequency"], log: OUT + "/log/filter_af/{sample}.log", threads: config["threads"]["filter_variants"] @@ -62,10 +90,10 @@ bcftools filter \ rule filter_depth: input: - vcf=OUT + "/variants_raw/FMC_af/{sample}.vcf", + vcf=OUT + "/variants_raw/FMC_afhard_afsoft/{sample}.vcf", ref=OUT + "/reference/reference.fasta", output: - vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf", + vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf", container: "docker://staphb/bcftools:1.16" conda: @@ -103,11 +131,11 @@ if config["disable_mask"] == "True": rule filter_mask: input: - vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf", + vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf", ref=OUT + "/reference/reference.fasta", mask=OUT + "/variants_raw/no_mask.bed", output: - vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf", + vcf=OUT + "/variants/{sample}.vcf", log: OUT + "/log/filter_depth/{sample}.log", threads: config["threads"]["filter_variants"] @@ -138,11 +166,11 @@ else: rule filter_mask: input: - vcf=OUT + "/variants_raw/FMC_af_depth/{sample}.vcf", + vcf=OUT + "/variants_raw/FMC_afhard_afsoft_depth/{sample}.vcf", ref=OUT + "/reference/reference.fasta", mask=OUT + "/variants_raw/mask.bed", output: - vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf", + vcf=OUT + "/variants/{sample}.vcf", container: "docker://staphb/bcftools:1.16" conda: @@ -165,10 +193,10 @@ bcftools filter \ rule remove_low_confidence_variants: input: - vcf=OUT + "/variants_raw/FMC_af_depth_masked/{sample}.vcf", + vcf=OUT + "/variants/{sample}.vcf", ref=OUT + "/reference/reference.fasta", output: - vcf=OUT + "/variants/{sample}.vcf", + vcf=OUT + "/variants_clean/{sample}.vcf", message: "Remove low confidence variants for {wildcards.sample}" conda: @@ -193,7 +221,7 @@ gatk SelectVariants \ rule select_snps: input: ref=OUT + "/reference/reference.fasta", - vcf=OUT + "/variants/{sample}.vcf", + vcf=OUT + "/variants_clean/{sample}.vcf", output: 
vcf=OUT + "/variants_snps_only/{sample}.snps.vcf", container: