RIVM-bioinformatics · florianzwagemaker · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## [1.4.6](https://github.com/RIVM-bioinformatics/ViroConstrictor/compare/v1.4.5...v1.4.6) (2024-10-08)
+
+
+### Bug Fixes
+
+* properly solve DAG workflow for nonsegmented matched-ref samples ([02a821a](https://github.com/RIVM-bioinformatics/ViroConstrictor/commit/02a821a44c3ed3741c65825789ef25ad3e2093c1))
+
 ## [1.4.5](https://github.com/RIVM-bioinformatics/ViroConstrictor/compare/v1.4.4...v1.4.5) (2024-09-25)
 
 

diff --git a/CITATION.cff b/CITATION.cff
@@ -24,7 +24,7 @@ authors:
       National Institute for Public Health and the
       Environment (RIVM)
   - name: "The RIVM-IDS Bioinformatics team"
-version: 1.4.5 #x-release-please-version
+version: 1.4.6 #x-release-please-version
 doi: 10.5281/zenodo.7688035
 identifiers:
   - type: doi

diff --git a/ViroConstrictor/__init__.py b/ViroConstrictor/__init__.py
@@ -1,2 +1,2 @@
-__version__ = "1.4.5"
+__version__ = "1.4.6"
 __prog__ = "ViroConstrictor"
diff --git a/ViroConstrictor/workflow/scripts/amplicon_covs.py b/ViroConstrictor/workflow/scripts/amplicon_covs.py
@@ -147,37 +147,39 @@ def remove_alt_primer_r(df):
 
 
 def Find_NonOverlap(df):
-    dd = df.to_dict(orient="records")
-    startingpoint = {}
-    endingpoint = {}
-    lastindex = list(enumerate(dd))[-1][0]
-    firstindex = list(enumerate(dd))[0][0]
-    for x, v in enumerate(dd):
-        t_end = v.get("rightstart")
-        s = dd[x - 1].get("rightstart") if x != firstindex else v.get("leftend")
-        end_override = dd[x + 1].get("leftend") if x != lastindex else None
-        primerstart = s
-        if end_override is not None and end_override in range(primerstart, t_end):
-            primerend = end_override
-        else:
-            primerend = t_end
-        startingpoint[primerstart] = v.get("name")
-        endingpoint[primerend] = v.get("name")
-
-    startdf = (
-        pd.DataFrame.from_dict(startingpoint, orient="index")
-        .reset_index()
-        .rename(columns={0: "name", "index": "unique_start"})
-    )
-    enddf = (
-        pd.DataFrame.from_dict(endingpoint, orient="index")
-        .reset_index()
-        .rename(columns={0: "name", "index": "unique_end"})
-    )
-    df = pd.merge(df, startdf, on="name", how="inner")
-    df = pd.merge(df, enddf, on="name", how="inner")
-
-    return df
+    if not df.empty:
+        dd = df.to_dict(orient="records")
+        startingpoint = {}
+        endingpoint = {}
+        lastindex = list(enumerate(dd))[-1][0]
+        firstindex = list(enumerate(dd))[0][0]
+        for x, v in enumerate(dd):
+            t_end = v.get("rightstart")
+            s = dd[x - 1].get("rightstart") if x != firstindex else v.get("leftend")
+            end_override = dd[x + 1].get("leftend") if x != lastindex else None
+            primerstart = s
+            if end_override is not None and end_override in range(primerstart, t_end):
+                primerend = end_override
+            else:
+                primerend = t_end
+            startingpoint[primerstart] = v.get("name")
+            endingpoint[primerend] = v.get("name")
+
+        startdf = (
+            pd.DataFrame.from_dict(startingpoint, orient="index")
+            .reset_index()
+            .rename(columns={0: "name", "index": "unique_start"})
+        )
+        enddf = (
+            pd.DataFrame.from_dict(endingpoint, orient="index")
+            .reset_index()
+            .rename(columns={0: "name", "index": "unique_end"})
+        )
+        df = pd.merge(df, startdf, on="name", how="inner")
+        df = pd.merge(df, enddf, on="name", how="inner")
+        return df
+    else:
+        return pd.DataFrame(columns=["name", "leftstart", "leftend", "rightstart", "rightend", "unique_start", "unique_end"])
 
 
 def avg(lst):
@@ -251,6 +253,14 @@ def pad_name(name):
     lf = remove_alt_primer_l(remove_alt_keyword(lf))
     rf = remove_alt_primer_r(remove_alt_keyword(rf))
 
+    # if either lf or rf is empty, write empty csv and exit
+    # csv will hold a single row with one empty value; neither the header (flags.key) nor the index is written
+    if len(lf) == 0 or len(rf) == 0:
+        df = pd.DataFrame({flags.key: [None]})
+        print(df)
+        df.to_csv(flags.output, sep=",", index=False, header=False)
+        sys.exit(0)
+
     non_overlapping_points = Find_NonOverlap(
         pd.merge(lf, rf, on="name", how="inner")
         .rename(

diff --git a/ViroConstrictor/workflow/workflow.smk b/ViroConstrictor/workflow/workflow.smk
@@ -655,19 +655,28 @@ def group_aminoacids_inputs(wildcards):
         select_samples = list(
             samples_df.loc[samples_df["Virus"] == i]["sample"].unique()
         )
-        select_refIDs = list(samples_df.loc[samples_df["Virus"] == i]["RefID"].unique())
+        # for x in select_samples:
+        #     y = samples_df.loc[(samples_df["Virus"] == i) & (samples_df["sample"] == x)]["RefID"].unique()
+        #     print(y)
+        # select_refIDs = list(samples_df.loc[samples_df["Virus"] == i]["RefID"].unique())
+        # print(select_refIDs)
 
         # create a dictionary of dictionaries for each virus, with 'i' as the primary key and sample as the secondary key having a list of refIDs as the value
-        struct[i] = {sample: select_refIDs for sample in select_samples}
-
+        struct[i] = {
+            sample: list(
+                samples_df.loc[
+                    (samples_df["Virus"] == i) & (samples_df["sample"] == sample)
+                ]["RefID"].unique()
+            )
+            for sample in select_samples
+        }
     file_list = []
     for virus, sample in struct.items():
         for sample, refid in sample.items():
             for ref in refid:
                 file_list.append(
                     f"{datadir}Virus~{virus}/RefID~{ref}/{amino}{sample}/aa.faa"
                 )
-
     return file_list
 
 
@@ -772,7 +781,7 @@ rule concat_boc:
 
 rule calculate_amplicon_cov:
     input:
-        pr=f"{datadir}{wc_folder}{prim}" "{sample}_removedprimers.bed",
+        pr=f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed",
         cov=rules.trueconsense.output.cov,
     output:
         f"{datadir}{wc_folder}{prim}" "{sample}_ampliconcoverage.csv",