snakemake-workflows · famosab · Jan 9, 2025 · Jan 9, 2025 · coderabbitai · Jan 9, 2025
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,5 @@ resources/**
 logs
 logs/**
 data/**
-report/**
+report/**
+test/**
diff --git a/workflow/envs/cyvcf.yaml b/workflow/envs/cyvcf.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - cyvcf2
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -455,6 +455,8 @@ def get_vaf_status(wildcards):
     vaf_benchmark = benchmarks[wildcards.benchmark].get("vaf-field")
     if vaf_benchmark is None:
         return False
+    if vaf_benchmark == "tbc":
+        return True
     else:
         callsets = get_benchmark_callsets(wildcards.benchmark)
         vaf_callsets = [

diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
@@ -91,6 +91,19 @@ rule remove_non_pass:
         "v3.3.6/bio/bcftools/view"
 
 
+rule calculate_vaf:
+    input:
+        "results/filtered-variants/{callset}.bcf",
+    output:
+        "results/calculate-vaf/{callset}.added-vaf.bcf",
+    log:
+        "logs/calculate-vaf/{callset}.log",  # one file per callset; log paths must carry the output's wildcards
+    conda:
+        "../envs/cyvcf.yaml"  # environment that provides cyvcf2
+    script:
+        "../scripts/calc-vaf.py"
+
+
 rule intersect_calls_with_target_regions:
     input:
         bcf="results/filtered-variants/{callset}.bcf",

diff --git a/workflow/scripts/calc-vaf.py b/workflow/scripts/calc-vaf.py
@@ -0,0 +1,36 @@
+from cyvcf2 import VCF
+
+#vcf = "/Users/famke/01-pm4onco/osf-download/pipeline-results-of-imgag-data/qbic/strelka/tumor_5perc_vs_normal_5perc.strelka.somatic_snvs_VEP.ann.vcf" #snakemake.input
+#indel = "/Users/famke/01-pm4onco/osf-download/pipeline-results-of-imgag-data/qbic/strelka/tumor_5perc_vs_normal_5perc.strelka.somatic_indels_VEP.ann.vcf.gz"
+
+#
+# SNVs
+#
+
+def get_snv_allele_freq(vcf):
+    """Print the variant allele frequency (VAF) of every SNV record.
+
+    For each record the REF and first ALT base are used to look up the
+    FORMAT fields named "<base>U" (e.g. "AU"), and the VAF is computed as
+    alt / (alt + ref) from the first value of each field.
+
+    Args:
+        vcf: Path to the VCF file containing SNV variants.
+
+    Returns:
+        None. Each VAF is written to stdout, one value per line. A record
+        whose ref and alt counts sum to zero is reported as 0.0.
+    """
+    for variant in VCF(vcf):
+        ref_counts = variant.format(variant.REF + "U")
+        alt_counts = variant.format(variant.ALT[0] + "U")
+
+        # TODO: check which value is the correct one from the matrix (this leads to many zero VAF)
+        # NOTE(review): [0, 0] reads the first sample row. If this is a
+        # Strelka somatic VCF the first sample is presumably NORMAL, which
+        # would explain the many zero VAFs -- confirm the intended sample.
+        tier1_ref_counts = ref_counts[0, 0]
+        tier1_alt_counts = alt_counts[0, 0]
+
+        # Without any supporting reads the ratio is undefined; report 0.0
+        # instead of dividing by zero.
+        total_counts = tier1_alt_counts + tier1_ref_counts
+        if total_counts == 0:
+            vaf = 0.0
+        else:
+            vaf = tier1_alt_counts / total_counts
+
+        print(vaf)
-def get_snv_allele_freq(vcf):
-    for variant in VCF(vcf):
-        refCounts = variant.format(variant.REF + "U")
-        altCounts = variant.format(variant.ALT[0] + "U")
-
-        # TODO: check which value is the correct one from the matrix (this leads to many zero VAF)
-        tier1RefCounts = refCounts[0, 0]
-        tier1AltCounts = altCounts[0, 0]
-
-        vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
-
-        print(vaf)
+def get_snv_allele_freq(vcf):
+    """Compute the variant allele frequency (VAF) of each SNV record.
+
+    Args:
+        vcf: Path to the VCF file holding the SNV calls.
+
+    Returns:
+        A list of ``(variant_id, vaf)`` tuples in file order. The identifier
+        is the record ID, or ``"CHROM:POS"`` when the ID is empty. A record
+        whose ref and alt counts sum to zero gets a VAF of 0.0.
+    """
+    vafs = []
+    for variant in VCF(vcf):
+        # The count fields are named after the allele base, e.g. "AU" for "A".
+        ref_field = variant.format(variant.REF + "U")
+        alt_field = variant.format(variant.ALT[0] + "U")
+
+        # First row, first value of each count matrix.
+        ref_depth, alt_depth = ref_field[0, 0], alt_field[0, 0]
+
+        depth = alt_depth + ref_depth
+        vaf = 0.0 if depth == 0 else alt_depth / depth
+
+        label = variant.ID or f"{variant.CHROM}:{variant.POS}"
+        vafs.append((label, vaf))
+    return vafs
-def get_snv_allele_freq(vcf):
-    for variant in VCF(vcf):
-        refCounts = variant.format(variant.REF + "U")
-        altCounts = variant.format(variant.ALT[0] + "U")
-
-        # TODO: check which value is the correct one from the matrix (this leads to many zero VAF)
-        tier1RefCounts = refCounts[0, 0]
-        tier1AltCounts = altCounts[0, 0]
-
-        vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
-
-        print(vaf)
+def get_snv_allele_freq(vcf):
+    """Compute the variant allele frequency (VAF) of each SNV record.
+
+    Args:
+        vcf: Path to the VCF file holding the SNV calls.
+
+    Returns:
+        A list of ``(variant_id, vaf)`` tuples in file order. The identifier
+        is the record ID, or ``"CHROM:POS"`` when the ID is empty. A record
+        whose ref and alt counts sum to zero gets a VAF of 0.0.
+    """
+    vafs = []
+    for variant in VCF(vcf):
+        # The count fields are named after the allele base, e.g. "AU" for "A".
+        ref_field = variant.format(variant.REF + "U")
+        alt_field = variant.format(variant.ALT[0] + "U")
+
+        # First row, first value of each count matrix.
+        ref_depth, alt_depth = ref_field[0, 0], alt_field[0, 0]
+
+        depth = alt_depth + ref_depth
+        vaf = 0.0 if depth == 0 else alt_depth / depth
+
+        label = variant.ID or f"{variant.CHROM}:{variant.POS}"
+        vafs.append((label, vaf))
+    return vafs
+
+
+
+#
+# INDELs
+#
+
+def get_indel_allele_freq(vcf):
+    """Print the variant allele frequency (VAF) of every indel record.
+
+    The VAF is computed as TIR / (TIR + TAR), using the first value of the
+    "TAR" and "TIR" FORMAT fields of the first sample row.
+
+    Args:
+        vcf: Path to the VCF file containing INDEL variants.
+
+    Returns:
+        None. Each VAF is written to stdout, one value per line. A record
+        whose TAR and TIR counts sum to zero is reported as 0.0.
+    """
+    for variant in VCF(vcf):
+        # NOTE(review): [0, 0] reads the first sample row; confirm that this
+        # is the sample the VAF should be computed for.
+        tier1_ref_counts = variant.format("TAR")[0, 0]
+        tier1_alt_counts = variant.format("TIR")[0, 0]
+
+        # Without any supporting reads the ratio is undefined; report 0.0
+        # instead of dividing by zero.
+        total_counts = tier1_alt_counts + tier1_ref_counts
+        if total_counts == 0:
+            vaf = 0.0
+        else:
+            vaf = tier1_alt_counts / total_counts
+
+        print(vaf)
-def get_indel_allele_freq(vcf):
-    for variant in VCF(vcf):
-        tier1RefCounts = variant.format("TAR")[0,0]
-        tier1AltCounts = variant.format("TIR")[0,0]
-
-        vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
-
-        print(vaf)
+def get_indel_allele_freq(vcf):
+    """Compute the variant allele frequency (VAF) of each indel record.
+
+    Args:
+        vcf: Path to the VCF file holding the indel calls.
+
+    Returns:
+        A list of ``(variant_id, vaf)`` tuples in file order. The identifier
+        is the record ID, or ``"CHROM:POS"`` when the ID is empty. A record
+        whose TAR and TIR counts sum to zero gets a VAF of 0.0.
+    """
+    vafs = []
+    for variant in VCF(vcf):
+        # First row, first value of the "TAR" (ref) and "TIR" (alt) fields.
+        ref_depth = variant.format("TAR")[0, 0]
+        alt_depth = variant.format("TIR")[0, 0]
+
+        depth = alt_depth + ref_depth
+        vaf = 0.0 if depth == 0 else alt_depth / depth
+
+        label = variant.ID or f"{variant.CHROM}:{variant.POS}"
+        vafs.append((label, vaf))
+    return vafs
-def get_indel_allele_freq(vcf):
-    for variant in VCF(vcf):
-        tier1RefCounts = variant.format("TAR")[0,0]
-        tier1AltCounts = variant.format("TIR")[0,0]
-
-        vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
-
-        print(vaf)
+def get_indel_allele_freq(vcf):
+    """Compute the variant allele frequency (VAF) of each indel record.
+
+    Args:
+        vcf: Path to the VCF file holding the indel calls.
+
+    Returns:
+        A list of ``(variant_id, vaf)`` tuples in file order. The identifier
+        is the record ID, or ``"CHROM:POS"`` when the ID is empty. A record
+        whose TAR and TIR counts sum to zero gets a VAF of 0.0.
+    """
+    vafs = []
+    for variant in VCF(vcf):
+        # First row, first value of the "TAR" (ref) and "TIR" (alt) fields.
+        ref_depth = variant.format("TAR")[0, 0]
+        alt_depth = variant.format("TIR")[0, 0]
+
+        depth = alt_depth + ref_depth
+        vaf = 0.0 if depth == 0 else alt_depth / depth
+
+        label = variant.ID or f"{variant.CHROM}:{variant.POS}"
+        vafs.append((label, vaf))
+    return vafs
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,4 +6,5 @@ resources/** @@
     logs
     logs/**
     data/**
-    report/**
+    report/**
+    test/**