From 70b58956400e1ae9729e7de5b2a6be1a1219b01d Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Fri, 26 Jul 2024 12:48:44 +0200
Subject: [PATCH] fix: working exclusion

---
 Snakefile                     |  1 +
 workflow/rules/clustering.smk |  4 ++--
 workflow/scripts/cluster.py   | 25 +++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 7aac42b..51496b9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -10,6 +10,7 @@ for param in ["threads", "mem_gb"]:
         config[param][k] = int(config[param][k])
 
 OUT = config["output_dir"]
+INPUT = config["input_dir"]
 
 # find collection using collfinder
 # iget collection and save to a path passed to cli
diff --git a/workflow/rules/clustering.smk b/workflow/rules/clustering.smk
index 90fe269..f9db155 100644
--- a/workflow/rules/clustering.smk
+++ b/workflow/rules/clustering.smk
@@ -97,7 +97,7 @@ else:
         input:
             distances=OUT + "/distances.tsv",
             previous_clustering=PREVIOUS_CLUSTERING + "/clusters.csv",
-            exclude_list=OUT + "/list_excluded_samples.txt",
+            exclude_list=OUT + "/list_excluded_samples.tsv",
         output:
             OUT + "/clusters.csv",
         log:
@@ -124,6 +124,6 @@ python workflow/scripts/cluster.py \
 --log {log} \
 --verbose \
 --merged-cluster-separator {params.merged_cluster_separator:q} \
---exclude {input.exclude_list}
+--exclude {input.exclude_list} \
 --output {output}
             """
diff --git a/workflow/scripts/cluster.py b/workflow/scripts/cluster.py
index 646f5dc..93a7b4b 100644
--- a/workflow/scripts/cluster.py
+++ b/workflow/scripts/cluster.py
@@ -66,6 +66,29 @@ def read_data(distances, previous_clustering):
         )
     return df_distances, df_previous_clustering
 
+@timing
+def clean_sample_columns(df, cols, fixed_string):
+    """Remove a fixed (literal) string from the given columns.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe whose string columns are cleaned; modified in place
+    cols : list
+        Columns to clean
+    fixed_string : str
+        Literal substring to remove (never interpreted as a regex)
+
+    Returns
+    -------
+    df : pd.DataFrame
+        The same dataframe with fixed_string removed from cols
+    """
+    for col in cols:
+        df[col] = df[col].str.replace(fixed_string, "", regex=False)
+    return df
+
+
 @timing
 def exclude_samples(df_distances, exclude_list):
     """
@@ -429,6 +452,8 @@ def main(args):
         args.distances, args.previous_clustering
     )
 
+    df_distances = clean_sample_columns(df_distances, ["sample1", "sample2"], "_contig1")
+
     if args.exclude_list:
         df_distances = exclude_samples(df_distances, args.exclude_list)