From 70b58956400e1ae9729e7de5b2a6be1a1219b01d Mon Sep 17 00:00:00 2001
From: boasvdp
Date: Fri, 26 Jul 2024 12:48:44 +0200
Subject: [PATCH] fix: working exclusion

---
 Snakefile                     |  1 +
 workflow/rules/clustering.smk |  4 ++--
 workflow/scripts/cluster.py   | 28 ++++++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 7aac42b..51496b9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -10,6 +10,7 @@ for param in ["threads", "mem_gb"]:
         config[param][k] = int(config[param][k])
 
 OUT = config["output_dir"]
+INPUT = config["input_dir"]
 
 # find collection using collfinder
 # iget collection and save to a path passed to cli
diff --git a/workflow/rules/clustering.smk b/workflow/rules/clustering.smk
index 90fe269..f9db155 100644
--- a/workflow/rules/clustering.smk
+++ b/workflow/rules/clustering.smk
@@ -97,7 +97,7 @@ else:
         input:
             distances=OUT + "/distances.tsv",
             previous_clustering=PREVIOUS_CLUSTERING + "/clusters.csv",
-            exclude_list=OUT + "/list_excluded_samples.txt",
+            exclude_list=OUT + "/list_excluded_samples.tsv",
         output:
             OUT + "/clusters.csv",
         log:
@@ -124,6 +124,6 @@ python workflow/scripts/cluster.py \
 --log {log} \
 --verbose \
 --merged-cluster-separator {params.merged_cluster_separator:q} \
---exclude {input.exclude_list}
+--exclude {input.exclude_list} \
 --output {output}
         """
diff --git a/workflow/scripts/cluster.py b/workflow/scripts/cluster.py
index 646f5dc..93a7b4b 100644
--- a/workflow/scripts/cluster.py
+++ b/workflow/scripts/cluster.py
@@ -66,6 +66,32 @@ def read_data(distances, previous_clustering):
     )
     return df_distances, df_previous_clustering
 
+@timing
+def clean_sample_columns(df, cols, fixed_string):
+    """
+    Remove a fixed string from the values of the given columns, in place
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with distances
+    cols : list
+        Columns to clean
+    fixed_string : str
+        Literal substring to remove from every value (not a regex)
+
+    Returns
+    -------
+    df : pd.DataFrame
+        Dataframe with cleaned sample names
+
+    """
+    for col in cols:
+        # regex=False: match fixed_string literally on every pandas version
+        df[col] = df[col].str.replace(fixed_string, "", regex=False)
+    return df
+
+
 @timing
 def exclude_samples(df_distances, exclude_list):
     """
@@ -429,6 +455,8 @@ def main(args):
         args.distances, args.previous_clustering
     )
 
+    df_distances = clean_sample_columns(df_distances, ["sample1", "sample2"], "_contig1")
+
     if args.exclude_list:
         df_distances = exclude_samples(df_distances, args.exclude_list)
 