
Commit 02a821a

fix: properly solve DAG workflow for nonsegmented matched-ref samples
refactor: add extra safeguards to amplicon_covs script to bridge time for full refactor
florianzwagemaker committed Oct 8, 2024
1 parent 86f5093 commit 02a821a
Showing 2 changed files with 55 additions and 36 deletions.
72 changes: 41 additions & 31 deletions ViroConstrictor/workflow/scripts/amplicon_covs.py
@@ -147,37 +147,39 @@ def remove_alt_primer_r(df):
 
 
 def Find_NonOverlap(df):
-    dd = df.to_dict(orient="records")
-    startingpoint = {}
-    endingpoint = {}
-    lastindex = list(enumerate(dd))[-1][0]
-    firstindex = list(enumerate(dd))[0][0]
-    for x, v in enumerate(dd):
-        t_end = v.get("rightstart")
-        s = dd[x - 1].get("rightstart") if x != firstindex else v.get("leftend")
-        end_override = dd[x + 1].get("leftend") if x != lastindex else None
-        primerstart = s
-        if end_override is not None and end_override in range(primerstart, t_end):
-            primerend = end_override
-        else:
-            primerend = t_end
-        startingpoint[primerstart] = v.get("name")
-        endingpoint[primerend] = v.get("name")
-
-    startdf = (
-        pd.DataFrame.from_dict(startingpoint, orient="index")
-        .reset_index()
-        .rename(columns={0: "name", "index": "unique_start"})
-    )
-    enddf = (
-        pd.DataFrame.from_dict(endingpoint, orient="index")
-        .reset_index()
-        .rename(columns={0: "name", "index": "unique_end"})
-    )
-    df = pd.merge(df, startdf, on="name", how="inner")
-    df = pd.merge(df, enddf, on="name", how="inner")
-
-    return df
+    if not df.empty:
+        dd = df.to_dict(orient="records")
+        startingpoint = {}
+        endingpoint = {}
+        lastindex = list(enumerate(dd))[-1][0]
+        firstindex = list(enumerate(dd))[0][0]
+        for x, v in enumerate(dd):
+            t_end = v.get("rightstart")
+            s = dd[x - 1].get("rightstart") if x != firstindex else v.get("leftend")
+            end_override = dd[x + 1].get("leftend") if x != lastindex else None
+            primerstart = s
+            if end_override is not None and end_override in range(primerstart, t_end):
+                primerend = end_override
+            else:
+                primerend = t_end
+            startingpoint[primerstart] = v.get("name")
+            endingpoint[primerend] = v.get("name")
+
+        startdf = (
+            pd.DataFrame.from_dict(startingpoint, orient="index")
+            .reset_index()
+            .rename(columns={0: "name", "index": "unique_start"})
+        )
+        enddf = (
+            pd.DataFrame.from_dict(endingpoint, orient="index")
+            .reset_index()
+            .rename(columns={0: "name", "index": "unique_end"})
+        )
+        df = pd.merge(df, startdf, on="name", how="inner")
+        df = pd.merge(df, enddf, on="name", how="inner")
+        return df
+    else:
+        return pd.DataFrame(columns=["name", "leftstart", "leftend", "rightstart", "rightend", "unique_start", "unique_end"])
 
 
 def avg(lst):
@@ -251,6 +253,14 @@ def pad_name(name):
     lf = remove_alt_primer_l(remove_alt_keyword(lf))
     rf = remove_alt_primer_r(remove_alt_keyword(rf))
 
+    # if either lf or rf is empty, write empty csv and exit
+    # csv will have one row with index "flags.key" and an empty value, no column name
+    if len(lf) == 0 or len(rf) == 0:
+        df = pd.DataFrame({flags.key: [None]})
+        print(df)
+        df.to_csv(flags.output, sep=",", index=False, header=False)
+        sys.exit(0)
+
     non_overlapping_points = Find_NonOverlap(
         pd.merge(lf, rf, on="name", how="inner")
         .rename(
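
The Find_NonOverlap change above adds a guard for an empty primer table. A minimal sketch of that guard, assuming pandas and using placeholder names (find_nonoverlap_stub, EXPECTED_COLS) rather than the real function:

import pandas as pd

# Columns mirrored from the safeguard added in amplicon_covs.py.
EXPECTED_COLS = [
    "name", "leftstart", "leftend", "rightstart", "rightend",
    "unique_start", "unique_end",
]

def find_nonoverlap_stub(df: pd.DataFrame) -> pd.DataFrame:
    # Illustrative stand-in: the real function walks the amplicons when the
    # table is non-empty; an empty table now returns an empty frame with the
    # expected columns instead of raising while indexing the record list.
    if not df.empty:
        return df
    return pd.DataFrame(columns=EXPECTED_COLS)

print(find_nonoverlap_stub(pd.DataFrame()).columns.tolist())
# ['name', 'leftstart', 'leftend', 'rightstart', 'rightend', 'unique_start', 'unique_end']
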
19 changes: 14 additions & 5 deletions ViroConstrictor/workflow/workflow.smk
@@ -655,19 +655,28 @@ def group_aminoacids_inputs(wildcards):
         select_samples = list(
             samples_df.loc[samples_df["Virus"] == i]["sample"].unique()
         )
-        select_refIDs = list(samples_df.loc[samples_df["Virus"] == i]["RefID"].unique())
+        # for x in select_samples:
+        #     y = samples_df.loc[(samples_df["Virus"] == i) & (samples_df["sample"] == x)]["RefID"].unique()
+        #     print(y)
+        # select_refIDs = list(samples_df.loc[samples_df["Virus"] == i]["RefID"].unique())
+        # print(select_refIDs)
 
-        # create a dictionary of dictionaries for each virus, with 'i' as the primary key and sample as the secondary key having a list of refIDs as the value
-        struct[i] = {sample: select_refIDs for sample in select_samples}
-
+        struct[i] = {
+            sample: list(
+                samples_df.loc[
+                    (samples_df["Virus"] == i) & (samples_df["sample"] == sample)
+                ]["RefID"].unique()
+            )
+            for sample in select_samples
+        }
     file_list = []
     for virus, sample in struct.items():
         for sample, refid in sample.items():
             for ref in refid:
                 file_list.append(
                     f"{datadir}Virus~{virus}/RefID~{ref}/{amino}{sample}/aa.faa"
                 )
 
     return file_list
 
 
@@ -772,7 +781,7 @@ rule concat_boc:
 
 rule calculate_amplicon_cov:
     input:
-        pr=f"{datadir}{wc_folder}{prim}" "{sample}_removedprimers.bed",
+        pr=f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed",
         cov=rules.trueconsense.output.cov,
     output:
         f"{datadir}{wc_folder}{prim}" "{sample}_ampliconcoverage.csv",
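
The group_aminoacids_inputs change above is the core of the DAG fix: RefIDs are now collected per sample rather than per virus, so each sample only requests amino-acid outputs for the reference it was actually matched against. A minimal sketch with a made-up sample sheet (virusX, sample01/sample02, refA/refB are hypothetical names):

import pandas as pd

# Hypothetical sample sheet: two samples of the same virus, each matched to a
# different (nonsegmented) reference.
samples_df = pd.DataFrame(
    {
        "Virus": ["virusX", "virusX"],
        "sample": ["sample01", "sample02"],
        "RefID": ["refA", "refB"],
    }
)

struct = {}
for i in samples_df["Virus"].unique():
    select_samples = list(samples_df.loc[samples_df["Virus"] == i]["sample"].unique())
    # Per-sample RefID lookup, mirroring the new dict comprehension in the diff.
    struct[i] = {
        sample: list(
            samples_df.loc[
                (samples_df["Virus"] == i) & (samples_df["sample"] == sample)
            ]["RefID"].unique()
        )
        for sample in select_samples
    }

print(struct)
# {'virusX': {'sample01': ['refA'], 'sample02': ['refB']}}
# The old per-virus lookup gave every sample every RefID, so the workflow
# expected aa.faa files under reference folders that are never produced for
# these samples, which is what broke the DAG for nonsegmented matched-ref runs.
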
