
Commit 02a821a

fix: properly solve DAG workflow for nonsegmented matched-ref samples
refactor: add extra safeguards to amplicon_covs script to bridge time for full refactor
florianzwagemaker committed Oct 8, 2024
1 parent 86f5093 commit 02a821a
Showing 2 changed files with 55 additions and 36 deletions.
72 changes: 41 additions & 31 deletions ViroConstrictor/workflow/scripts/amplicon_covs.py
@@ -147,37 +147,39 @@ def remove_alt_primer_r(df):
 
 
 def Find_NonOverlap(df):
-    dd = df.to_dict(orient="records")
-    startingpoint = {}
-    endingpoint = {}
-    lastindex = list(enumerate(dd))[-1][0]
-    firstindex = list(enumerate(dd))[0][0]
-    for x, v in enumerate(dd):
-        t_end = v.get("rightstart")
-        s = dd[x - 1].get("rightstart") if x != firstindex else v.get("leftend")
-        end_override = dd[x + 1].get("leftend") if x != lastindex else None
-        primerstart = s
-        if end_override is not None and end_override in range(primerstart, t_end):
-            primerend = end_override
-        else:
-            primerend = t_end
-        startingpoint[primerstart] = v.get("name")
-        endingpoint[primerend] = v.get("name")
-
-    startdf = (
-        pd.DataFrame.from_dict(startingpoint, orient="index")
-        .reset_index()
-        .rename(columns={0: "name", "index": "unique_start"})
-    )
-    enddf = (
-        pd.DataFrame.from_dict(endingpoint, orient="index")
-        .reset_index()
-        .rename(columns={0: "name", "index": "unique_end"})
-    )
-    df = pd.merge(df, startdf, on="name", how="inner")
-    df = pd.merge(df, enddf, on="name", how="inner")
-
-    return df
+    if not df.empty:
+        dd = df.to_dict(orient="records")
+        startingpoint = {}
+        endingpoint = {}
+        lastindex = list(enumerate(dd))[-1][0]
+        firstindex = list(enumerate(dd))[0][0]
+        for x, v in enumerate(dd):
+            t_end = v.get("rightstart")
+            s = dd[x - 1].get("rightstart") if x != firstindex else v.get("leftend")
+            end_override = dd[x + 1].get("leftend") if x != lastindex else None
+            primerstart = s
+            if end_override is not None and end_override in range(primerstart, t_end):
+                primerend = end_override
+            else:
+                primerend = t_end
+            startingpoint[primerstart] = v.get("name")
+            endingpoint[primerend] = v.get("name")
+
+        startdf = (
+            pd.DataFrame.from_dict(startingpoint, orient="index")
+            .reset_index()
+            .rename(columns={0: "name", "index": "unique_start"})
+        )
+        enddf = (
+            pd.DataFrame.from_dict(endingpoint, orient="index")
+            .reset_index()
+            .rename(columns={0: "name", "index": "unique_end"})
+        )
+        df = pd.merge(df, startdf, on="name", how="inner")
+        df = pd.merge(df, enddf, on="name", how="inner")
+        return df
+    else:
+        return pd.DataFrame(columns=["name", "leftstart", "leftend", "rightstart", "rightend", "unique_start", "unique_end"])
 
 
 def avg(lst):
@@ -251,6 +253,14 @@ def pad_name(name):
     lf = remove_alt_primer_l(remove_alt_keyword(lf))
     rf = remove_alt_primer_r(remove_alt_keyword(rf))
 
+    # if either lf or rf is empty, write empty csv and exit
+    # csv will have one row with index "flags.key" and an empty value, no column name
+    if len(lf) == 0 or len(rf) == 0:
+        df = pd.DataFrame({flags.key: [None]})
+        print(df)
+        df.to_csv(flags.output, sep=",", index=False, header=False)
+        sys.exit(0)
+
     non_overlapping_points = Find_NonOverlap(
         pd.merge(lf, rf, on="name", how="inner")
         .rename(
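
The Find_NonOverlap change above adds a guard for an empty primer table. A minimal sketch of that guard, assuming pandas and using placeholder names (find_nonoverlap_stub, EXPECTED_COLS) rather than the real function:

import pandas as pd

# Columns mirrored from the safeguard added in amplicon_covs.py.
EXPECTED_COLS = [
    "name", "leftstart", "leftend", "rightstart", "rightend",
    "unique_start", "unique_end",
]

def find_nonoverlap_stub(df: pd.DataFrame) -> pd.DataFrame:
    # Illustrative stand-in: the real function walks the amplicons when the
    # table is non-empty; an empty table now returns an empty frame with the
    # expected columns instead of raising while indexing the record list.
    if not df.empty:
        return df
    return pd.DataFrame(columns=EXPECTED_COLS)

print(find_nonoverlap_stub(pd.DataFrame()).columns.tolist())
# ['name', 'leftstart', 'leftend', 'rightstart', 'rightend', 'unique_start', 'unique_end']
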
19 changes: 14 additions & 5 deletions ViroConstrictor/workflow/workflow.smk
@@ -655,19 +655,28 @@ def group_aminoacids_inputs(wildcards):
         select_samples = list(
             samples_df.loc[samples_df["Virus"] == i]["sample"].unique()
         )
-        select_refIDs = list(samples_df.loc[samples_df["Virus"] == i]["RefID"].unique())
+        # for x in select_samples:
+        #     y = samples_df.loc[(samples_df["Virus"] == i) & (samples_df["sample"] == x)]["RefID"].unique()
+        #     print(y)
+        # select_refIDs = list(samples_df.loc[samples_df["Virus"] == i]["RefID"].unique())
+        # print(select_refIDs)
 
-        # create a dictionary of dictionaries for each virus, with 'i' as the primary key and sample as the secondary key having a list of refIDs as the value
-        struct[i] = {sample: select_refIDs for sample in select_samples}
-
+        struct[i] = {
+            sample: list(
+                samples_df.loc[
+                    (samples_df["Virus"] == i) & (samples_df["sample"] == sample)
+                ]["RefID"].unique()
+            )
+            for sample in select_samples
+        }
     file_list = []
     for virus, sample in struct.items():
         for sample, refid in sample.items():
             for ref in refid:
                 file_list.append(
                     f"{datadir}Virus~{virus}/RefID~{ref}/{amino}{sample}/aa.faa"
                 )
 
     return file_list
 
 
@@ -772,7 +781,7 @@ rule concat_boc:
 
 rule calculate_amplicon_cov:
     input:
-        pr=f"{datadir}{wc_folder}{prim}" "{sample}_removedprimers.bed",
+        pr=f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed",
         cov=rules.trueconsense.output.cov,
     output:
         f"{datadir}{wc_folder}{prim}" "{sample}_ampliconcoverage.csv",
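
The group_aminoacids_inputs change above is the core of the DAG fix: RefIDs are now collected per sample rather than per virus, so each sample only requests amino-acid outputs for the reference it was actually matched against. A minimal sketch with a made-up sample sheet (virusX, sample01/sample02, refA/refB are hypothetical names):

import pandas as pd

# Hypothetical sample sheet: two samples of the same virus, each matched to a
# different (nonsegmented) reference.
samples_df = pd.DataFrame(
    {
        "Virus": ["virusX", "virusX"],
        "sample": ["sample01", "sample02"],
        "RefID": ["refA", "refB"],
    }
)

struct = {}
for i in samples_df["Virus"].unique():
    select_samples = list(samples_df.loc[samples_df["Virus"] == i]["sample"].unique())
    # Per-sample RefID lookup, mirroring the new dict comprehension in the diff.
    struct[i] = {
        sample: list(
            samples_df.loc[
                (samples_df["Virus"] == i) & (samples_df["sample"] == sample)
            ]["RefID"].unique()
        )
        for sample in select_samples
    }

print(struct)
# {'virusX': {'sample01': ['refA'], 'sample02': ['refB']}}
# The old per-virus lookup gave every sample every RefID, so the workflow
# expected aa.faa files under reference folders that are never produced for
# these samples, which is what broke the DAG for nonsegmented matched-ref runs.
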
