Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add vaf calculation for strelka results #109

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ resources/**
logs
logs/**
data/**
report/**
report/**
test/**
5 changes: 5 additions & 0 deletions workflow/envs/cyvcf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- conda-forge
- bioconda
dependencies:
- cyvcf2
2 changes: 2 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,8 @@ def get_vaf_status(wildcards):
vaf_benchmark = benchmarks[wildcards.benchmark].get("vaf-field")
if vaf_benchmark is None:
return False
if vaf_benchmark == "tbc":
return True
else:
callsets = get_benchmark_callsets(wildcards.benchmark)
vaf_callsets = [
Expand Down
13 changes: 13 additions & 0 deletions workflow/rules/eval.smk
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,19 @@ rule remove_non_pass:
"v3.3.6/bio/bcftools/view"


rule calculate_vaf:
input:
"results/filtered-variants/{callset}.bcf",
output:
"results/calculate-vaf/{callset}.added-vaf.bcf",
log:
"logs/calculate-vaf/",
conda:
"../envs/cyvcf.yaml"
script:
"../scripts/calc-vaf.py"


rule intersect_calls_with_target_regions:
input:
bcf="results/filtered-variants/{callset}.bcf",
Expand Down
36 changes: 36 additions & 0 deletions workflow/scripts/calc-vaf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from cyvcf2 import VCF

#vcf = "/Users/famke/01-pm4onco/osf-download/pipeline-results-of-imgag-data/qbic/strelka/tumor_5perc_vs_normal_5perc.strelka.somatic_snvs_VEP.ann.vcf" #snakemake.input
#indel = "/Users/famke/01-pm4onco/osf-download/pipeline-results-of-imgag-data/qbic/strelka/tumor_5perc_vs_normal_5perc.strelka.somatic_indels_VEP.ann.vcf.gz"

#
# SNVS
#

def get_snv_allele_freq(vcf):
for variant in VCF(vcf):
refCounts = variant.format(variant.REF + "U")
altCounts = variant.format(variant.ALT[0] + "U")

# TODO: check which value is the correct one from the matrix (this leads to many zero VAF)
tier1RefCounts = refCounts[0, 0]
tier1AltCounts = altCounts[0, 0]

vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)

print(vaf)
Comment on lines +10 to +21
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Enhance the SNV VAF calculation implementation.

The current implementation needs improvements:

  1. Add error handling for division by zero
  2. Add docstring explaining the function
  3. Store results instead of just printing
  4. Validate the matrix values as noted in TODO

Here's a suggested implementation:

 def get_snv_allele_freq(vcf):
+    """Calculate Variant Allele Frequency (VAF) for SNVs.
+    
+    Args:
+        vcf: Path to the VCF file containing SNV variants
+        
+    Returns:
+        List of tuples containing (variant_id, vaf)
+    """
+    results = []
     for variant in VCF(vcf):
         refCounts = variant.format(variant.REF + "U")
         altCounts = variant.format(variant.ALT[0] + "U")
 
-        # TODO: check which value is the correct one from the matrix (this leads to many zero VAF)
         tier1RefCounts = refCounts[0, 0]
         tier1AltCounts = altCounts[0, 0]
 
-        vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
+        total_counts = tier1AltCounts + tier1RefCounts
+        if total_counts == 0:
+            vaf = 0.0
+        else:
+            vaf = tier1AltCounts / total_counts
 
-        print(vaf)
+        results.append((variant.ID or f"{variant.CHROM}:{variant.POS}", vaf))
+    return results
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def get_snv_allele_freq(vcf):
for variant in VCF(vcf):
refCounts = variant.format(variant.REF + "U")
altCounts = variant.format(variant.ALT[0] + "U")
# TODO: check which value is the correct one from the matrix (this leads to many zero VAF)
tier1RefCounts = refCounts[0, 0]
tier1AltCounts = altCounts[0, 0]
vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
print(vaf)
def get_snv_allele_freq(vcf):
"""Calculate Variant Allele Frequency (VAF) for SNVs.
Args:
vcf: Path to the VCF file containing SNV variants
Returns:
List of tuples containing (variant_id, vaf)
"""
results = []
for variant in VCF(vcf):
refCounts = variant.format(variant.REF + "U")
altCounts = variant.format(variant.ALT[0] + "U")
tier1RefCounts = refCounts[0, 0]
tier1AltCounts = altCounts[0, 0]
total_counts = tier1AltCounts + tier1RefCounts
if total_counts == 0:
vaf = 0.0
else:
vaf = tier1AltCounts / total_counts
results.append((variant.ID or f"{variant.CHROM}:{variant.POS}", vaf))
return results




#
# INDELs
#

def get_indel_allele_freq(vcf):
for variant in VCF(vcf):
tier1RefCounts = variant.format("TAR")[0,0]
tier1AltCounts = variant.format("TIR")[0,0]

vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)

print(vaf)
Comment on lines +29 to +36
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Apply similar improvements to INDEL VAF calculation.

The INDEL function needs the same enhancements as the SNV function.

Here's a suggested implementation:

 def get_indel_allele_freq(vcf):
+    """Calculate Variant Allele Frequency (VAF) for INDELs.
+    
+    Args:
+        vcf: Path to the VCF file containing INDEL variants
+        
+    Returns:
+        List of tuples containing (variant_id, vaf)
+    """
+    results = []
     for variant in VCF(vcf):
         tier1RefCounts = variant.format("TAR")[0,0]
         tier1AltCounts = variant.format("TIR")[0,0]
 
-        vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
+        total_counts = tier1AltCounts + tier1RefCounts
+        if total_counts == 0:
+            vaf = 0.0
+        else:
+            vaf = tier1AltCounts / total_counts
 
-        print(vaf)
+        results.append((variant.ID or f"{variant.CHROM}:{variant.POS}", vaf))
+    return results
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def get_indel_allele_freq(vcf):
for variant in VCF(vcf):
tier1RefCounts = variant.format("TAR")[0,0]
tier1AltCounts = variant.format("TIR")[0,0]
vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
print(vaf)
def get_indel_allele_freq(vcf):
"""Calculate Variant Allele Frequency (VAF) for INDELs.
Args:
vcf: Path to the VCF file containing INDEL variants
Returns:
List of tuples containing (variant_id, vaf)
"""
results = []
for variant in VCF(vcf):
tier1RefCounts = variant.format("TAR")[0,0]
tier1AltCounts = variant.format("TIR")[0,0]
total_counts = tier1AltCounts + tier1RefCounts
if total_counts == 0:
vaf = 0.0
else:
vaf = tier1AltCounts / total_counts
results.append((variant.ID or f"{variant.CHROM}:{variant.POS}", vaf))
return results

Loading