diff --git a/CHANGELOG.md b/CHANGELOG.md index b485b8b..547d108 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [1.4.6](https://github.com/RIVM-bioinformatics/ViroConstrictor/compare/v1.4.5...v1.4.6) (2024-10-08) + + +### Bug Fixes + +* properly solve DAG workflow for nonsegmented matched-ref samples ([02a821a](https://github.com/RIVM-bioinformatics/ViroConstrictor/commit/02a821a44c3ed3741c65825789ef25ad3e2093c1)) + ## [1.4.5](https://github.com/RIVM-bioinformatics/ViroConstrictor/compare/v1.4.4...v1.4.5) (2024-09-25) diff --git a/CITATION.cff b/CITATION.cff index b0ab20b..43176da 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -24,7 +24,7 @@ authors: National Institute for Public Health and the Environment (RIVM) - name: "The RIVM-IDS Bioinformatics team" -version: 1.4.5 #x-release-please-version +version: 1.4.6 #x-release-please-version doi: 10.5281/zenodo.7688035 identifiers: - type: doi diff --git a/ViroConstrictor/__init__.py b/ViroConstrictor/__init__.py index e6ac66f..d6ad981 100644 --- a/ViroConstrictor/__init__.py +++ b/ViroConstrictor/__init__.py @@ -1,2 +1,2 @@ -__version__ = "1.4.5" +__version__ = "1.4.6" __prog__ = "ViroConstrictor" diff --git a/ViroConstrictor/workflow/scripts/amplicon_covs.py b/ViroConstrictor/workflow/scripts/amplicon_covs.py index b7f015e..718dee7 100644 --- a/ViroConstrictor/workflow/scripts/amplicon_covs.py +++ b/ViroConstrictor/workflow/scripts/amplicon_covs.py @@ -147,37 +147,39 @@ def remove_alt_primer_r(df): def Find_NonOverlap(df): - dd = df.to_dict(orient="records") - startingpoint = {} - endingpoint = {} - lastindex = list(enumerate(dd))[-1][0] - firstindex = list(enumerate(dd))[0][0] - for x, v in enumerate(dd): - t_end = v.get("rightstart") - s = dd[x - 1].get("rightstart") if x != firstindex else v.get("leftend") - end_override = dd[x + 1].get("leftend") if x != lastindex else None - primerstart = s - if end_override is not None and end_override in range(primerstart, t_end): - primerend = 
end_override - else: - primerend = t_end - startingpoint[primerstart] = v.get("name") - endingpoint[primerend] = v.get("name") - - startdf = ( - pd.DataFrame.from_dict(startingpoint, orient="index") - .reset_index() - .rename(columns={0: "name", "index": "unique_start"}) - ) - enddf = ( - pd.DataFrame.from_dict(endingpoint, orient="index") - .reset_index() - .rename(columns={0: "name", "index": "unique_end"}) - ) - df = pd.merge(df, startdf, on="name", how="inner") - df = pd.merge(df, enddf, on="name", how="inner") - - return df + if not df.empty: + dd = df.to_dict(orient="records") + startingpoint = {} + endingpoint = {} + lastindex = list(enumerate(dd))[-1][0] + firstindex = list(enumerate(dd))[0][0] + for x, v in enumerate(dd): + t_end = v.get("rightstart") + s = dd[x - 1].get("rightstart") if x != firstindex else v.get("leftend") + end_override = dd[x + 1].get("leftend") if x != lastindex else None + primerstart = s + if end_override is not None and end_override in range(primerstart, t_end): + primerend = end_override + else: + primerend = t_end + startingpoint[primerstart] = v.get("name") + endingpoint[primerend] = v.get("name") + + startdf = ( + pd.DataFrame.from_dict(startingpoint, orient="index") + .reset_index() + .rename(columns={0: "name", "index": "unique_start"}) + ) + enddf = ( + pd.DataFrame.from_dict(endingpoint, orient="index") + .reset_index() + .rename(columns={0: "name", "index": "unique_end"}) + ) + df = pd.merge(df, startdf, on="name", how="inner") + df = pd.merge(df, enddf, on="name", how="inner") + return df + else: + return pd.DataFrame(columns=["name", "leftstart", "leftend", "rightstart", "rightend", "unique_start", "unique_end"]) def avg(lst): @@ -251,6 +253,14 @@ def pad_name(name): lf = remove_alt_primer_l(remove_alt_keyword(lf)) rf = remove_alt_primer_r(remove_alt_keyword(rf)) + # if either lf or rf is empty, write a placeholder csv and exit + # NOTE(review): with index=False and header=False the csv holds one row with a single empty value; flags.key itself is not written (confirm this is intended) + if len(lf) == 0 
or len(rf) == 0: + df = pd.DataFrame({flags.key: [None]}) + print(df) + df.to_csv(flags.output, sep=",", index=False, header=False) + sys.exit(0) + non_overlapping_points = Find_NonOverlap( pd.merge(lf, rf, on="name", how="inner") .rename( diff --git a/ViroConstrictor/workflow/workflow.smk b/ViroConstrictor/workflow/workflow.smk index 744e055..658ec35 100644 --- a/ViroConstrictor/workflow/workflow.smk +++ b/ViroConstrictor/workflow/workflow.smk @@ -655,11 +655,21 @@ def group_aminoacids_inputs(wildcards): select_samples = list( samples_df.loc[samples_df["Virus"] == i]["sample"].unique() ) - select_refIDs = list(samples_df.loc[samples_df["Virus"] == i]["RefID"].unique()) + # for x in select_samples: + # y = samples_df.loc[(samples_df["Virus"] == i) & (samples_df["sample"] == x)]["RefID"].unique() + # print(y) + # select_refIDs = list(samples_df.loc[samples_df["Virus"] == i]["RefID"].unique()) + # print(select_refIDs) # create a dictionary of dictionaries for each virus, with 'i' as the primary key and sample as the secondary key having a list of refIDs as the value - struct[i] = {sample: select_refIDs for sample in select_samples} - + struct[i] = { + sample: list( + samples_df.loc[ + (samples_df["Virus"] == i) & (samples_df["sample"] == sample) + ]["RefID"].unique() + ) + for sample in select_samples + } file_list = [] for virus, sample in struct.items(): for sample, refid in sample.items(): @@ -667,7 +677,6 @@ def group_aminoacids_inputs(wildcards): file_list.append( f"{datadir}Virus~{virus}/RefID~{ref}/{amino}{sample}/aa.faa" ) - return file_list @@ -772,7 +781,7 @@ rule concat_boc: rule calculate_amplicon_cov: input: - pr=f"{datadir}{wc_folder}{prim}" "{sample}_removedprimers.bed", + pr=f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed", cov=rules.trueconsense.output.cov, output: f"{datadir}{wc_folder}{prim}" "{sample}_ampliconcoverage.csv",