Merge pull request #65 from BackofenLab/update_eval

Update eval
BackofenLab · Jan 24, 2024 · 4369dde · 4369dde
2 parents 1a7c743 + e0774e1
commit 4369dde
Show file tree

Hide file tree

Showing 6 changed files with 132 additions and 574 deletions.
diff --git a/README.md b/README.md
diff --git a/bin/cherri b/bin/cherri
@@ -8,6 +8,8 @@ import sys
 import rrieval.lib as rl
 from scipy.sparse import csr_matrix, vstack, hstack, load_npz, save_npz
 from ubergauss.tools import loadfile, dumpfile
+import csv
+import sys
 #import logging
 ## DeBug, INFO
 # import subprocess
@@ -96,7 +98,7 @@ def setup_argument_parser():
     # Optional arguments for evaluate.
     p_ex.add_argument("-i2", "--occupied_regions",
                       default="non",
-                      help= "Path to occupied regions python object file containing a dictionary")
+                      help= "Path to occupied regions python object. This file should be used if there are regions which that should be blocked from interactions. One can create this file with the find_occupied_regions.py")
     p_ex.add_argument("-c", "--context",
                       nargs='?',
                       type=int,
@@ -231,6 +233,53 @@ def setup_argument_parser():
 
 ################################################################################
 
+def test_eval_input(input):
+    # check header line
+    issue = 'no'
+    header = ['chrom1','start1','stop1','strand1','chrom2','start2','stop2','strand2']
+    if not check_header_line(input, header):
+        print(f'Input ERROR:\nplease proved the headerline:\n{header}\n')
+        issue = 'yes'
+    else:
+        print('You provided the corret input header line')
+
+    # check that each provided start position is not larger than its stop position
+    if not check_positive_difference(input, 'stop1', 'start1'):
+        issue = 'yes'
+        print(f'Input ERROR:\nPlease provied a start1 smaller then stop1')
+    if not check_positive_difference(input, 'stop2', 'start2'):
+        issue = 'yes'
+        print(f'Input ERROR:\Please provied a start2 smaller then stop2')
+
+
+    if issue == 'yes':
+        sys.exit(1)
+
+
+def check_header_line(input,required_headers):
+
+    with open(input, newline='') as csvfile:
+        reader = csv.reader(csvfile)
+        # read the first line into headers
+        headers = next(reader, None)  
+
+        if headers:
+            return all(header in headers for header in required_headers)
+        else:
+            return False
+
+def check_positive_difference(csv_file_path, pos_end, pos_start):
+    # Read the CSV file into a DataFrame
+    df = pd.read_csv(csv_file_path)
+
+    # Calculate the difference
+    difference = df[pos_end] - df[pos_start]
+
+    # Check if there is a negative value
+    has_negatives = (difference < 0).any()
+
+    # returns True if no difference is negative and False if at least one is negative
+    return not has_negatives
 
 def read_RRI_table(file):
     """
@@ -296,9 +345,6 @@ def main_eval(args):
     Output files:
     ├── date_Cherri_evaluation_mode
     |   ├── evaluate_RRIs.csv
-    |   ├── date_occ_out
-    |       ├── occupied_regions.obj
-    |       ├── rri_occupied_regions_overlapTH_0.3_scoreTH_1.csv
     |   ├── positive_instance
     |       ├── {name}_context_{context}pos.csv
     |       ├── {name}_context_{context}_block_ends_0_RRI_dataset.csv
@@ -342,6 +388,9 @@ def main_eval(args):
     if not os.path.exists(model_params):
         print('Error: please set the path to your feature file of your model')
 
+    # test input data: RRIs_table
+    test_eval_input(RRIs_table)
+
 
     # define output folder
     timestr = time.strftime("%Y%m%d")

diff --git a/bin/generate_pos_neg_with_context.py b/bin/generate_pos_neg_with_context.py
@@ -228,7 +228,7 @@ def decode_Intarna_output(out):
 
     # out, err = process.communicate()
     out = out.decode('utf-8').strip().split('\n')
-    # print(f'IntaRNAout:\n{out}')
+    #print(f'IntaRNAout:\n{out}')
     for idx, line in enumerate(out):
         #print(idx)
         line = line.strip().split(';')
@@ -478,6 +478,9 @@ def decode_IntaRNA_call(call, lost_inst, row, list_rows_add, df_data, no_sub_opt
         """
     # print(f'####\nIntRNA call: \n{call}####\n')
     out = rl.call_script(call,reprot_stdout=True)
+    #print(out.decode('utf-8').strip().split('\n'))
+    if 'ERROR' in out.decode('utf-8'):
+        print(f'\n####\nIntaRNA is complining:\n{out}\nFor call:\n{call}\n####')
     #print(call)
     df = decode_Intarna_output(out)
     #print(df)
@@ -526,6 +529,7 @@ def get_context_added(input_rris, output_path, genome_file, context,
 
     # adding context by including infors into the df
     df_RRIs = extention_df(df_RRIs)
+    print(f'output dataframe:\ndf_RRIs')
     df_target = rl.get_context('target', df_RRIs, output_path,
                                 genome_file, context, chrom_len_file)
         #print(df_target)

diff --git a/source/docs/documentation.md b/source/docs/documentation.md
@@ -12,12 +12,12 @@ Here we search for trusted RRIs, so RRIs which can be found in all replicates. I
 | ID | name | description |
 |---|---|-----|
 | `-i` | `--input_path` | Path to folder storing input data (containing all replicates) |
-|`-r`| `--list_of_replicats` | List of file names for all replicates |
+|`-r`| `--list_of_replicates` | List of file names for all replicates |
 | `-o` | `--overlap_th` | Overlap threshold to find trusted RRIs |
 | `-d` | `--output_path` | Path where output folder should be stored |
 |`-n` | `--experiment_name` | Name of the data source of positive trusted RRIs |
 | `-s` | `--score_th` | Threshold for EM score from ChiRA |
-| `-fh` | `--filter_hybrid` | Filter the data for hyprids alrady detected by ChiRA |
+| `-fh` | `--filter_hybrid` | Filter the data for hybrids already detected by ChiRA |
 
 #### Output of find_trusted_RRI.py
 The filtered set of trusted RRI sites in tabular format. 
@@ -91,7 +91,7 @@ To generate the current features IntaRNA parameters by default are set to:
 | intLoopMax |  3  | number of unpaired bases between inter molecular base pairs |
 
 
-IntaRNA parameters can be changed by specifying a custom IntaRNA parameter file.
+IntaRNA parameters can be changed by specifying a custom IntaRNA parameter file. CheRRI's default parameter set can be found [here](https://github.com/BackofenLab/Cherri/tree/master/rrieval/IntaRNA_param).