diff --git a/README.md b/README.md index 1d945fd..2655974 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,41 @@ All **file path** statements have to be replaced by the path to your benchmark f #### Excluding a prediction tool: Currently, to remove a tool from the analysis simply remove the line from the input and shell part of the mergeConditions rule. -You can find it in under `rules/postprocessing.smk`: +You can find it in under `rules/postprocessing.smk`. + +To remove IRSOM change: +~~~~ +rule mergeConditions: + input: + ribotish="tracks/{condition}.ribotish.gff", + reparation="tracks/{condition}.reparation.gff", + deepribo="tracks/{condition}.deepribo.gff", + irsom="tracks/{condition}.irsom.gff", + spectre="tracks/{condition}.spectre.gff", + smorfer="tracks/{condition}.smorfer.gff", + ribotricer="tracks/{condition}.ribotricer.gff", + price="tracks/{condition}.price.gff" + output: + "tracks/{condition, [a-zA-Z]+}.merged.gff" + conda: + "../envs/bedtools.yaml" + threads: 1 + shell: + """ + mkdir -p tracks; + cat {input.spectre} > {output}.unsorted; + cat {input.ribotish} >> {output}.unsorted; + cat {input.reparation} >> {output}.unsorted; + cat {input.deepribo} >> {output}.unsorted; + cat {input.irsom} >> {output}.unsorted; + cat {input.smorfer} >> {output}.unsorted; + cat {input.ribotricer} >> {output}.unsorted; + cat {input.price} >> {output}.unsorted; + bedtools sort -i {output}.unsorted > {output}; + """ +~~~~ + +to ~~~~ rule mergeConditions: @@ -155,7 +189,6 @@ rule mergeConditions: ribotish="tracks/{condition}.ribotish.gff", reparation="tracks/{condition}.reparation.gff", deepribo="tracks/{condition}.deepribo.gff", - ~~irsom="tracks/{condition}.irsom.gff",~~ spectre="tracks/{condition}.spectre.gff", smorfer="tracks/{condition}.smorfer.gff", ribotricer="tracks/{condition}.ribotricer.gff", @@ -172,7 +205,6 @@ rule mergeConditions: cat {input.ribotish} >> {output}.unsorted; cat {input.reparation} >> {output}.unsorted; cat {input.deepribo} >> {output}.unsorted; - ~~cat {input.irsom} >> {output}.unsorted;~~ cat {input.smorfer} >> {output}.unsorted; cat {input.ribotricer} >> {output}.unsorted; cat {input.price} >> {output}.unsorted; diff --git a/evaluation/prc_plotting.py b/evaluation/prc_plotting.py index 8839b7d..693166b 100644 --- a/evaluation/prc_plotting.py +++ b/evaluation/prc_plotting.py @@ -22,34 +22,24 @@ def compute_prc(labels, scores): def get_list(saved_dir, tool): dir_score_list = saved_dir + '/' + tool + '_score_list' - #dir_lable_list = saved_dir + '/' + tool + '_label_list' if not os.path.isfile(dir_score_list): sys.exit('not existing file (PRC): %s' % (dir_score_list)) with open(dir_score_list, 'rb') as handle: score_overlap_label_list = pickle.load(handle) - #with open(dir_lable_list, 'rb') as handle: - #label_list = pickle.load(handle) - #print(len(label_list)) - #print(len(score_overlap_label_list)) return score_overlap_label_list def get_ranks(score_overlap_label_list, tool): #first_indexes = len(score_list) rank_list = list(range(len(score_overlap_label_list))) + #print('score list:\n',len(score_overlap_label_list)) + #print('rank list:\n',rank_list) list.reverse(rank_list) #diff = len(score_overlap_label_list)-(len(set(score_overlap_label_list))) - sorted_list = sorted(score_overlap_label_list, key=itemgetter(0,1), reverse=True) - #print ('#####Ranked List 1:#############') - #print (count) - #print ('#####END#############') - #print ('#####sorted List#############') - #print (sorted_list) - #print ('#####END#############') last_score = 0 last_overlap = 0 @@ -65,9 +55,9 @@ def get_ranks(score_overlap_label_list, tool): if trippel[2] == '1': count_pos += 1 #if tool == 'deepribo' and trippel[0]<0: - elif trippel[2] == '0': count_neg += 1 + # check if the current score the same as the last score if trippel[0] == last_score: #check if overlaps are the same @@ -96,14 +86,15 @@ def comput_roc_for_tool(saved_dir, tool): score_overlap_label_list = get_list(saved_dir, tool) #print('score list of %s'%(tool)) + #print(score_overlap_label_list) sort_list = sorted(score_overlap_label_list, key=itemgetter(0), reverse=True) - #print(sort_list[0:100]) + #print(sort_list) score_list, label_list, base = get_ranks(score_overlap_label_list, tool) #print('score ranked list of %s'%(tool)) - #print(score_list[0:100]) + #print(score_list) #print('lable ranked list of %s'%(tool)) - #print(label_list[0:100]) + #print(label_list) precision, recall, thresholds, auc_prc = compute_prc(label_list, score_list) @@ -132,48 +123,31 @@ def main(): overlap = coverage_percent.replace(".", "") - # 'deepribo', 'ribotish', 'reparation', 'irsom' - + # 'deepribo', 'ribotish', 'reparation', 'irsom', 'spectre' + #print('tool: deepribo\n') precision_deepribo, recall_deepribo, base, auc_prc_deepribo = comput_roc_for_tool(experiment_dict_path, 'deepribo') + #print('tool: ribotish\n' ) precision_ribotish, recall_ribotish, base, auc_prc_ribotish = comput_roc_for_tool(experiment_dict_path, 'ribotish',) + #print('tool: reparation\n' ) precision_reparation, recall_reparation, base, auc_prc_reparation = comput_roc_for_tool(experiment_dict_path, 'reparation') + #print('tool: irsom\n' ) precision_irsom, recall_irsom, base, auc_prc_irsom = comput_roc_for_tool(experiment_dict_path, 'irsom') + #print('tool: spectre\n' ) + precision_spectre, recall_spectre, base, auc_prc_spectre = comput_roc_for_tool(experiment_dict_path, 'spectre') + #print('tool: price\n' ) + precision_price, recall_price, base, auc_prc_price = comput_roc_for_tool(experiment_dict_path, 'price') + #print('tool: ribotricer\n' ) + precision_ribotricer, recall_ribotricer, base, auc_prc_ribotricer = comput_roc_for_tool(experiment_dict_path, 'ribotricer') + #print('tool: smorfer\n' ) + precision_smorfer, recall_smorfer, base, auc_prc_smorfer = comput_roc_for_tool(experiment_dict_path, 'smorfer') + #print('deepribo AUC: %f' % (auc_prc_deepribo)) + #print('ribotish AUC: %f' % (auc_prc_ribotish)) + #print('reparation AUC: %f' %(auc_prc_reparation)) + #print('irsom AUC: %f' % (auc_prc_irsom)) + #print('spectre AUC: %f' % (auc_prc_spectre)) + - #print('deepribo AUC: %f' % (auc_deepribo)) - #print('ribotish AUC: %f' % (auc_ribotish)) - #print('reparation AUC: %f' %(auc_reparation)) - #print('irsom AUC: %f' % (auc_irsom)) - - -# label_deepribo = 'deepribo AUC: %f' % (auc_deepribo) -# label_ribotish = ('ribotish AUC: %f' % (auc_ribotish)) -# label_reparation = ('reparation AUC: %f' %(auc_reparation)) -# label_irsom = ('irsom AUC: %f' % (auc_irsom)) -# -# print('++++\nfpr deepribo:') -# print(fpr_deepribo) -# print('+++++++++++++++++++++') -# print('++++\ntpr deepribo:') -# print(tpr_deepribo) -# print('+++++++++++++++++++++') -# plt.plot(fpr_deepribo, tpr_deepribo, color='orange', label=label_deepribo) -# plt.plot(fpr_ribotish, tpr_ribotish, color='blue', label=label_ribotish) -# plt.plot(fpr_reparation, tpr_reparation, color='red', label=label_reparation) -# plt.plot(fpr_irsom, tpr_irsom, color='green', label=label_irsom) -# -# plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--') -# plt.xlabel('False Positive Rate') -# plt.ylabel('True Positive Rate') -# plt.title('Receiver Operating Characteristic (ROC) Curve ' + experiment) -# plt.legend() -# #plt.show() -# save_roc_diag = plot_dir + experiment + '_roc.pdf' -# plt.savefig(save_roc_diag, format='pdf', dpi=300, bbox_inches='tight') - - # - - # plot title Escherichia if species == 'EC': title = 'E. coli' elif species == 'LM': @@ -182,22 +156,38 @@ def main(): title = 'P. aeruginosa' elif species == 'ST': title = 'S. Typhimurium' + elif species == 'HV': + title = 'Haloferax volcanii' else: print('Error: unknown species label') title = 'unknown species label' + # generat the legend information label_deepribo = 'DeepRibo AUC: %.2f' % (auc_prc_deepribo) label_ribotish = ('Ribo-TISH AUC: %.2f' % (auc_prc_ribotish)) label_reparation = ('Reparation AUC: %.2f' %(auc_prc_reparation)) label_irsom = ('IRSOM AUC: %.2f' % (auc_prc_irsom)) - - plt.plot(recall_deepribo, precision_deepribo, color='green', label=label_deepribo) - plt.plot(recall_reparation, precision_reparation, color='blue', label=label_reparation) - plt.plot(recall_ribotish, precision_ribotish, color='yellow', label=label_ribotish) - plt.plot(recall_irsom, precision_irsom, color='red', label=label_irsom) - - plt.plot([0, 1], [base, base], color='grey', linestyle='--') + label_spectre = ('SPECtre AUC: %.2f' % (auc_prc_spectre)) + label_price = ('price AUC: %.2f' % (auc_prc_price)) + label_ribotricer = ('ribotricer AUC: %.2f' % (auc_prc_ribotricer)) + label_smorfer = ('smorfer AUC: %.2f' % (auc_prc_smorfer)) + + # choosing colure + + # plot the PRC into one plot + plt.plot(recall_deepribo, precision_deepribo, color='#009E73', label=label_deepribo) # green + plt.plot(recall_reparation, precision_reparation, color='#0072B2', label=label_reparation) #blue + plt.plot(recall_ribotish, precision_ribotish, color='#F0E442', label=label_ribotish) #yellow + plt.plot(recall_irsom, precision_irsom, color='#D55E00', label=label_irsom) #red + plt.plot(recall_spectre, precision_spectre, color='#E69F00', label=label_spectre) #orange + plt.plot(recall_price, precision_price, color='#000000', label=label_price) #black + plt.plot(recall_ribotricer, precision_ribotricer, color='#56B4E9', label=label_ribotricer) #sky blue + plt.plot(recall_smorfer, precision_smorfer, color='#CC79A7', label=label_smorfer) #reddish purple + + # plot the baseline + plt.plot([0, 1], [base, base], color='#BBBBBB', linestyle='--') #grey #808080 + # set axix labels and title plt.xlabel('Recall', fontsize=14) plt.ylabel('Precision', fontsize=14) plt.xticks(fontsize=12, rotation=30) @@ -206,10 +196,12 @@ def main(): plt.title(title) #fontsize=20 - plt.legend(prop={'size': 14}) + plt.legend(prop={'size': 12}) plt.ylim(-0.02, 1.02) #plt.show() - save_prc_diag = plot_dir + species + '_prc_' + overlap + '_' + experiment+'_.pdf' + + # save plot + save_prc_diag = plot_dir + species + '_prc_' + overlap + '_' + experiment + '_.pdf' plt.savefig(save_prc_diag, format='pdf', dpi=300, bbox_inches='tight') if __name__ == '__main__': diff --git a/evaluation/print_calls.py b/evaluation/print_calls.py index 502c953..189d7bd 100644 --- a/evaluation/print_calls.py +++ b/evaluation/print_calls.py @@ -26,8 +26,8 @@ def main(): help="path to the negative reference data.") parser.add_argument("-s", "--python_script_dir", action="store", dest="python_script_dir", required=True, - default="/home/muellert/Dokumente/benchmark_ribo_seq/ribo_benchmark/evaluation/", - help="path to your '/ribo_benchmark/evaluation/'' forder where all evaluation scripts are stored.") + default="/home/teresa/Dokumente/benchmark_ribo_seq/RiboReport/evaluation/", + help="path to your '/RiboReport/evaluation/' forder where all evaluation scripts are stored.") @@ -52,21 +52,28 @@ def main(): shell_script = result_dir + '/evaluation_calls.sh' - python_script_dir = '/home/muellert/Dokumente/benchmark_ribo_seq/ribo_benchmark/evaluation/' + python_script_dir = '/home/teresa/Dokumente/benchmark_ribo_seq/RiboReport/evaluation/' f= open(shell_script,"w+") f.write("#!/usr/bin/env bash\n\ntrap ctrl_c INT\n\nfunction ctrl_c() {\necho \"** Trapped CTRL-C\"\nexit\n}\n\n") #f.write('test') # what to investigat test: - #dataset_list = ['bm01', 'bm06'] - #experiment_list = ['CDS_labels'] + #dataset_list = ['bm_01', 'bm_03', 'bm_06', 'bm_12'] + #dataset_list = ['bm_14'] + #experiment_list = ['smallORFs_labels'] + + #experiment_list = ['CDS_labels', 'smallORFs_labels', 'operons_intersect_labels', 'operons_complement_labels'] #overlap_list = ['0.01', '0.5'] #bar_plot_call_dict = {} # full data: - dataset_list = ['bm_01', 'bm_03', 'bm_06', 'bm_12'] - experiment_list = ['CDS_labels', 'smallORFs_labels', 'operons_intersect_labels', 'operons_complement_labels', 'ncRNAs_labels', 'pseudogenes_labels'] + #dataset_list = ['bm_01', 'bm_03', 'bm_06', 'bm_12'] + dataset_list = ['bm_01'] + #dataset_list = ['bm_06', 'bm_12'] + #dataset_list = ['bm_03'] + + experiment_list = ['CDS_labels', 'smallORFs_labels', 'operons_intersect_labels', 'operons_complement_labels'] overlap_list = ['0.01', '0.9', '0.7'] for dataset in dataset_list: bar_plot_call_dict = {} @@ -139,7 +146,6 @@ def main(): #print(experiment) experiment_dict[experiment]=[] for dataset in dataset_list: - #print(dataset) if dataset == 'bm_01': lable = 'EC' elif dataset == 'bm_03': @@ -148,8 +154,10 @@ def main(): lable = 'PA' elif dataset == 'bm_12': lable = 'ST' + elif dataset == 'bm_14': + lable = 'HV' for overlap in overlap_list: - #print(overlap) + #print(lable) stat_path = result_dir + '/' + dataset + '/' + experiment + '/' + overlap+ '/df_stat.csv' hue_symbel = lable + '_' + overlap experiment_dict[experiment].append((stat_path,hue_symbel, dataset, overlap)) @@ -188,7 +196,7 @@ def main(): call_script = ('python3 %s/experiment_barplots.py '%(python_script_dir)) call_exp_barplots = ('%s -e %s -o %s '%(call_script, save_experiment_dict, plot_dir)) #print(call_exp_barplots) - f.write('\n#######################\n%s\n'%(call_exp_barplots)) + # f.write('\n#######################\n%s\n'%(call_exp_barplots)) diff --git a/evaluation/statistics_pos_neg.py b/evaluation/statistics_pos_neg.py index eea7c05..e7dc4ee 100644 --- a/evaluation/statistics_pos_neg.py +++ b/evaluation/statistics_pos_neg.py @@ -1,6 +1,5 @@ #!/usr/bin/env python import pandas as pd -#import mathe import matplotlib as mpl import seaborn import os @@ -25,17 +24,13 @@ def get_dict_with_empty_val(file, tool='RefSeq'): Parameters ---------- tool : string - annotaion or either of the tool names - - Raises - ------ - not jet ... + annotation or either of the tool names Returns ------- tool_dict - dictionay with ID as key and value is the string parameter tool - id of the start:stop:strand:asseionnumber + dictionary with ID as key and value is the string parameter tool + id of the start:stop:strand:assertionnumber """ tool_dict = {} f = open(file, 'r') @@ -56,7 +51,7 @@ def get_dict_with_empty_val(file, tool='RefSeq'): def get_associated_genes(gene_dict, pred_prediction_dict, label): - """function assosiates just one prediction to a gene. + """function associates just one prediction to a gene. Parameters ---------- @@ -65,18 +60,14 @@ def get_associated_genes(gene_dict, pred_prediction_dict, label): pred_prediction_dict : dictionary predictionIds with lists of genes - Raises - ------ - not jet ... - Returns ------- associated_genes_dict - assositation of geneId -> predictionId (one to one relation) + associations of geneId -> predictionId (one to one relation) pred_prediction_sub_dict containing predictionsIds that are not associated to a gene associated_predictions_dict - assositation of predictionId -> geneId (one to one relation) + associations of predictionId -> geneId (one to one relation) """ associated_genes_dict = {} associated_predictions_dict = {} @@ -105,7 +96,7 @@ def get_associated_genes(gene_dict, pred_prediction_dict, label): pass associated_genes_dict[k] = id_pred associated_predictions_dict[id_pred]= k - score_list.append((float(score), overlap, label)) + score_list.append((float(score), float(overlap), label)) sorted_list = [] elif sorted_list == []: @@ -120,22 +111,18 @@ def get_associated_genes(gene_dict, pred_prediction_dict, label): def get_genes(gene_dict, df_overlap_score, tool): - """funciton selects for a saves tool all predicted genes (key) and there - associated predictions (value) in a dictionay + """function selects for a saves tool all predicted genes (key) and there + associated predictions (value) in a dictionary Parameters ---------- gene_dict : dictionary containing all gene IDs (not needed at the moment) df_overlap_score : datafame - conaining overlaps between the reference and prediction + containing overlaps between the reference and prediction tool : string tells for what tool the overlap should be found - Raises - ------ - not jet ... - Returns ------- gene_prediction_dict @@ -144,10 +131,10 @@ def get_genes(gene_dict, df_overlap_score, tool): [['predictionId3', 'score', 'overlap'],...]...} Id: start:stop:strand """ - # get all assosiation of genes with a specific tool + # get all association of genes with a specific tool df_overlap = df_overlap_score[df_overlap_score['id_pred'] == tool] - #prepare the value of the dict in an extra colum of the dataframe + #prepare the value of the dict in an extra column of the dataframe df_overlap_test = df_overlap.astype({"start_pred": str, "stop_pred": str, "score_pred": str, "overlap_percent": str, @@ -160,7 +147,7 @@ def get_genes(gene_dict, df_overlap_score, tool): df_overlap_test['overlap_percent']) #print(df_overlap_test) - # dict key: gene_id; value: list of overlaping prediction (predID$score$overlap) + # dict key: gene_id; value: list of overlapping prediction (predID$score$overlap) gene_prediction_temp_dict = {k: list(v) for k, v in df_overlap_test.groupby( by=['start_ref', 'stop_ref', 'strand_ref', 'chr']) @@ -185,20 +172,17 @@ def get_genes(gene_dict, df_overlap_score, tool): def get_prediction(tool_dict, df_overlap_score, tool): """This fuction selects for a given tool all predictions (key) and there - associated genes (value) and stores them in a dictionay + associated genes (value) and stores them in a dictionary Parameters ---------- tool_dict : dictionary (not needed at the moment) keys containing all predictions ids but jet no values df_overlap_score : datafame - conaining overlaps between the reference and prediction + containing overlaps between the reference and prediction tool : string tells for what tool the overlap should be found - Raises - ------ - not jet ... Returns ------- @@ -208,10 +192,10 @@ def get_prediction(tool_dict, df_overlap_score, tool): [['geneId3', 'overlap'],...]...} Id: start:stop:strand """ - # get all assosiation of genes with a specific tool + # get all association of genes with a specific tool df_overlap_tools = df_overlap_score[df_overlap_score['id_pred'] == tool] - # prepare the value of the dict in an extra colum of the dataframe + # prepare the value of the dict in an extra column of the dataframe df_overlap_test = df_overlap_tools.astype({ "start_ref": str, "stop_ref": str, "overlap_percent": str}) @@ -226,14 +210,14 @@ def get_prediction(tool_dict, df_overlap_score, tool): df_overlap_test.groupby(by=['start_pred', 'stop_pred', 'strand_pred', 'chr']) ['tuple']} - # convertd pred_id tuple into one string + # convert pred_id tuple into one string pred_prediction_dict = {str(k[0]) + ':' + str(k[1]) + ':' + k[2] + ':' + k[3]: pred_prediction_temp_dict[k] for k in pred_prediction_temp_dict} for k in pred_prediction_dict: - # sprlit gene informatio stroed in values into gene_id and overlap_percent + # split gene information stored in values into gene_id and overlap_percent pred_prediction_dict[k] = [val.split('-') for val in pred_prediction_dict[k]] @@ -244,12 +228,12 @@ def get_prediction(tool_dict, df_overlap_score, tool): def assort_genes_with_predictions(ref_file, tools_file, df_overlap_score, tool, label): - """This fuction computes TP, FN and FP for a given tool. + """This function computes TP, FN and FP for a given tool. Parameters ---------- ref_file : string - file name of the referece file (.gtf) + file name of the reference file (.gtf) tools_file : string file name of the tools prediction (.gtf) df_overlap_score : dataframe @@ -257,27 +241,24 @@ def assort_genes_with_predictions(ref_file, tools_file, df_overlap_score, tool, tool : string tool name label: string - specifies if positive or negative data is analysed + specifies if positive or negative data is analyzed - Raises - ------ - nothing Returns ------- assorted_genes - number of genes assoiated with one prediction + number of genes associated with one prediction not_assorted_genes - number of genes without predictiontools_file + number of genes without prediction tools_file suboptimals number of suboptimal predictions associated_gene_list list of all genes found by the currently investigated tool score_list - list contining for the current tool (score, overlap, label) for each associated gene + list containing for the current tool (score, overlap, label) for each associated gene """ - # put all genes of the refernece inside a dictionary + # put all genes of the reference inside a dictionary gene_dict = get_dict_with_empty_val(ref_file) print('number of genes for %s: %i' % (label, len(gene_dict))) @@ -287,10 +268,10 @@ def assort_genes_with_predictions(ref_file, tools_file, df_overlap_score, tool, # get all genes which are found by the tools prediction and the genes that are not found! gene_predicted_dict = get_genes(gene_dict, df_overlap_score, tool) - # get all predicitons of the tools that have a overlap with a gene of the reference + # get all predictions of the tools that have a overlap with a gene of the reference prediction_overlap_dict = get_prediction(tool_dict, df_overlap_score, tool) - # assosiat each gene with just one prediciton. All othere genes + # associate each gene with just one prediction. All other genes associated_genes_dict, pred_prediction_sub_list, associated_predictions_dict, no_genes_lost_prediction, score_list = get_associated_genes(gene_predicted_dict, prediction_overlap_dict,label) @@ -320,7 +301,7 @@ def get_stat_for_tool(ref_file, tools_file, df_overlap_score, ref_neg_file, df_n Parameters ---------- ref_file : string - file name of the referece file (.gtf) + file name of the reference file (.gtf) tools_file : string file name of the tools prediction (.gtf) df_overlap_score : dataframe @@ -330,23 +311,14 @@ def get_stat_for_tool(ref_file, tools_file, df_overlap_score, ref_neg_file, df_n save_dir: directory where the list of scores overlaps and labels can be stores at - Raises - ------ - nothing Returns ------- - gene_list + associated_TP_list list of genes that are found (TP) - prediction_list - list of predictions that are assoiated with a gene stat_list - contains all statistical measuments: + contains all statistical measurements: TP, FP, FN, recall, FNR, precision, FDR, F1, #subopt, tool] - FP_prediction_list - list of all predictions not overlaping with any gene - FN_prediction_list - list of all not predicted genes """ TP, FN, suboptimals_TP, associated_TP_list, score_list_pos = assort_genes_with_predictions(ref_file, tools_file, df_overlap_score, tool, '1') FP, TN, suboptimals_FP, associated_FP_list, score_list_neg = assort_genes_with_predictions(ref_neg_file, tools_file, df_neg_overlap, tool, '0') @@ -360,6 +332,9 @@ def get_stat_for_tool(ref_file, tools_file, df_overlap_score, ref_neg_file, df_n if tool == 'deepribo': FN_pred_ROC = [(-999999,0,'1')]*FN TN_pred_ROC = [(-999999,0,'0')]*TN + elif tool == 'price': + FN_pred_ROC = [(1,0,'1')]*FN + TN_pred_ROC = [(1,0,'0')]*TN else: FN_pred_ROC = [(0,0,'1')]*FN TN_pred_ROC = [(0,0,'0')]*TN @@ -379,7 +354,6 @@ def get_stat_for_tool(ref_file, tools_file, df_overlap_score, ref_neg_file, df_n no_prediction_without_genes = len(df_fp_overlap_tool.index) if(flag_subopt == 1 and flag_no_gene == 1): - #print('\nhallo\n') FP = FP + int(suboptimals_TP) + int(no_prediction_without_genes) elif (flag_subopt == 0 and flag_no_gene == 1): FP = FP + int(no_prediction_without_genes) @@ -441,16 +415,14 @@ def get_stat_for_tool(ref_file, tools_file, df_overlap_score, ref_neg_file, df_n stat_list = [TP, TN, FP, FN, recall, specificity, FNR, precision, FDR, F1, accuracy, suboptimals_TP, suboptimals_FP, no_prediction_without_genes, tool] return stat_list, associated_TP_list - # return (gene_list, prediction_list, stat_list, - # FP_prediction_list, FN_prediction_list) def call_bedtools_overlap(reference_path, tools_path, overlap_cutoff, overlap_file, score_cutoff): - """This fuction computes TP, FN and FP for a given tool. + """This function computes TP, FN and FP for a given tool. Parameters ---------- reference_path : string - file name of the referece file (.gtf) + file name of the reference file (.gtf) tools_path : string file name of the tools prediction (.gtf) overlap_file : string @@ -458,27 +430,27 @@ def call_bedtools_overlap(reference_path, tools_path, overlap_cutoff, overlap_fi score_cutoff : float gives - Raises - ------ - nothing Returns ------- df_overlap_score - dataframe containing overlapping regoins + dataframe containing overlapping regions """ if os.stat(reference_path).st_size == 0: sys.exit('empty file: %s' % (reference_path)) #print(which_set + '\n') cmd = ('bedtools intersect -a ' + reference_path + ' -b ' + tools_path + ' -wo -f ' + str(overlap_cutoff) + ' -r -s > ' + overlap_file) + #print('##########') + #print(cmd) + #print('##########') os.system(cmd) # read gtf overlap file if os.stat(overlap_file).st_size == 0: - sys.exit('no overlap was found for %s' % (overlap_file)) + sys.exit('no overlap (th %f) was found for %s \n and tool %s' % (overlap_cutoff, overlap_file, tools_path)) df_temp = pd.read_csv(overlap_file, header=None, sep="\t", comment="#") df_overlap = pd.DataFrame(df_temp.values, columns=['chr', 'id_ref', @@ -498,7 +470,7 @@ def call_bedtools_overlap(reference_path, tools_path, overlap_cutoff, overlap_fi df_overlap['start_pred'] + 1)) #df_overlap['overlap_percent'] = min((df_overlap['overlap'] /df_overlap['ref_length']),(df_overlap['overlap'] /df_overlap['ref_length'])) df_overlap['overlap_percent'] = df_overlap[['ref_overlap_percent','pred_overlap_percent']].min(axis =1) - + #print(df_overlap['score_pred']) df_overlap = df_overlap.astype({"start_ref": int, "stop_ref": int, "start_pred": int, "stop_pred": int, "score_pred": float, "overlap": int}) @@ -517,13 +489,13 @@ def call_bedtools_overlap(reference_path, tools_path, overlap_cutoff, overlap_fi def call_bedtools_not_overlap(reference_path, tools_path, overlap_cutoff, overlap_file, score_cutoff): - """This fuction computes all genes or predictions (A) which are not overlapping with predictions or genes (B). Given a minmal overlap percentage. + """This function computes all genes or predictions (A) which are not overlapping with predictions or genes (B). Given a minimal overlap percentage. The result is computed using bedtools and the result is saved in a gtf format and a dictionary Parameters ---------- reference_path : string - file name of the referece file (.gtf) + file name of the reference file (.gtf) tools_path : string file name of the tools prediction (.gtf) overlap_file : string @@ -531,14 +503,11 @@ def call_bedtools_not_overlap(reference_path, tools_path, overlap_cutoff, overla score_cutoff : float gives - Raises - ------ - nothing Returns ------- df_overlap_score - dataframe containing not overlapping regoins + dataframe containing not overlapping regions """ #bedtools parameter @@ -554,7 +523,7 @@ def call_bedtools_not_overlap(reference_path, tools_path, overlap_cutoff, overla # test if gtf file is emty: if os.stat(overlap_file).st_size == 0: - print('**********************\nno prediction exitst that could not be associated with any gene\n**********************\n') + print('**********************\nno prediction exits that could not be associated with any gene\n**********************\n') df_overlap_score = score_cutoff else: df_temp = pd.read_csv(overlap_file, header=None, sep="\t", comment="#") @@ -571,26 +540,138 @@ def call_bedtools_not_overlap(reference_path, tools_path, overlap_cutoff, overla return df_overlap_score + +def get_times_gene_value(no_pred_tool_val): + times_gene = (max([len(gene_deepribo_list), + len(gene_ribotish_list), + len(gene_reparation_list), + len(gene_irsom_list), + len(gene_spectre_list), + len(gene_price_list), + len(gene_ribotricer_list), + len(gene_smorfer_list)]) - no_pred_tool_val) + return times_gene + + +def venn_prepataions(gene_deepribo_list, gene_ribotish_list, gene_reparation_list, gene_irsom_list, gene_spectre_list, gene_price_list, gene_ribotricer_list, gene_smorfer_list, save_dir): + max_no_of_genes = max([len(gene_deepribo_list), + len(gene_ribotish_list), + len(gene_reparation_list), + len(gene_irsom_list), + len(gene_spectre_list), + len(gene_price_list), + len(gene_ribotricer_list), + len(gene_smorfer_list)]) + tims_deepribo_gen = max_no_of_genes - len(gene_deepribo_list) + tims_ribotish_gen = max_no_of_genes - len(gene_ribotish_list) + tims_reparation_gen = max_no_of_genes - len(gene_reparation_list) + tims_irsom_gen = max_no_of_genes - len(gene_irsom_list) + tims_spectre_gen = max_no_of_genes - len(gene_spectre_list) + tims_price_gen = max_no_of_genes - len(gene_price_list) + tims_ribotricer_gen =max_no_of_genes - len(gene_ribotricer_list) + tims_smorfer_gen = max_no_of_genes - len(gene_smorfer_list) + + venn_genes_dict = {'deepribo': list(gene_deepribo_list) + ['NA'] * + tims_deepribo_gen, + 'spectre': list(gene_spectre_list) + ['NA'] * + tims_spectre_gen, + 'ribotish': list(gene_ribotish_list) + ['NA'] * + tims_ribotish_gen, + 'reparation': list(gene_reparation_list) + ['NA'] * + tims_reparation_gen, + 'irsom': list(gene_irsom_list) + ['NA']*tims_irsom_gen, + 'spectre': list(gene_spectre_list) + ['NA']* + tims_spectre_gen, + 'price': list(gene_price_list) + ['NA']* + tims_price_gen, + 'ribotricer': list(gene_ribotricer_list) + ['NA']* + tims_ribotricer_gen, + 'smorfer': list(gene_smorfer_list) + ['NA']* + tims_smorfer_gen} + df_venn_genes = pd.DataFrame(venn_genes_dict) + df_venn_genes.to_csv(path_or_buf=save_dir+'df_venn_genes.csv', + index=False, sep='\t') + + + +#def venn_prepataions_bm03(gene_deepribo_list, gene_ribotish_list, +# gene_reparation_list, gene_irsom_list, save_dir): +# max_no_of_genes = max([len(gene_deepribo_list), +# len(gene_ribotish_list), +# len(gene_reparation_list), +# len(gene_irsom_list)]) +# tims_deepribo_gen = max_no_of_genes - len(gene_deepribo_list) +# tims_ribotish_gen = max_no_of_genes - len(gene_ribotish_list) +# tims_reparation_gen = max_no_of_genes - len(gene_reparation_list) +# tims_irsom_gen = max_no_of_genes - len(gene_irsom_list) +# +# venn_genes_dict = {'deepribo': list(gene_deepribo_list) + ['NA'] * +# tims_deepribo_gen, +# 'ribotish': list(gene_ribotish_list) + ['NA'] * +# tims_ribotish_gen, +# 'reparation': list(gene_reparation_list) + ['NA'] * +# tims_reparation_gen, +# 'irsom': list(gene_irsom_list) + ['NA'] * tims_irsom_gen} +# df_venn_genes = pd.DataFrame(venn_genes_dict) +# df_venn_genes.to_csv(path_or_buf=save_dir+'df_venn_genes.csv', +# index=False, sep='\t') + + +def venn_prepataions_bm03(gene_deepribo_list, gene_ribotish_list, + gene_reparation_list, gene_irsom_list, + gene_price_list, gene_ribotricer_list, save_dir): + max_no_of_genes = max([len(gene_deepribo_list), + len(gene_ribotish_list), + len(gene_reparation_list), + len(gene_irsom_list), + len(gene_price_list), + len(gene_ribotricer_list)]) + tims_deepribo_gen = max_no_of_genes - len(gene_deepribo_list) + tims_ribotish_gen = max_no_of_genes - len(gene_ribotish_list) + tims_reparation_gen = max_no_of_genes - len(gene_reparation_list) + tims_irsom_gen = max_no_of_genes - len(gene_irsom_list) + tims_price_gen = max_no_of_genes - len(gene_price_list) + tims_ribotricer_gen =max_no_of_genes - len(gene_ribotricer_list) + + venn_genes_dict = {'deepribo': list(gene_deepribo_list) + ['NA'] * + tims_deepribo_gen, + 'ribotish': list(gene_ribotish_list) + ['NA'] * + tims_ribotish_gen, + 'reparation': list(gene_reparation_list) + ['NA'] * + tims_reparation_gen, + 'irsom': list(gene_irsom_list) + ['NA'] * tims_irsom_gen, + 'price': list(gene_price_list) + ['NA']* + tims_price_gen, + 'ribotricer': list(gene_ribotricer_list) + ['NA']* + tims_ribotricer_gen} + df_venn_genes = pd.DataFrame(venn_genes_dict) + df_venn_genes.to_csv(path_or_buf=save_dir+'df_venn_genes.csv', + index=False, sep='\t') + + + + + def main(): # store commandline args parser = argparse.ArgumentParser(description='') parser.add_argument("-p", "--reference_pos_data", action="store", dest="reference_pos_path", required=True, - default="./data/all_filtered_massspec.gtf", + default="./data/smallORFs_labels_pos.gff", help="path to the positive reference data.") parser.add_argument("-n", "--reference_neg_data", action="store", dest="reference_neg_path", required=True, - default="./data/all_filtered_massspec.gtf", + default="../data/smallORFs_labels_neg.gff", help="path to the negative reference data.") parser.add_argument("-t", "--tool_data", action="store", dest="tools_path", - required=True, default="./data/combined.gtf", + required=True, default="./data/predictions.gff", help="path to the tools data.") parser.add_argument("-o", "--save_path", action="store", dest="save_path", required=True, default="./result_dfs/", help="path to save data to.") parser.add_argument("-c", "--overlap_cutoff", action="store", dest="overlap_cutoff", required=True, - default="0", type=float, help="path to save data to.") + default="0", type=float, help="overlap used to accept if prediction is covering labeled ORF") parser.add_argument("-s", "--flag_subopt", action="store", dest="flag_subopt", required=True, default="0", type=int, help="set to 1 to enable flag. If not 0.") @@ -630,7 +711,6 @@ def main(): "_overlap_fp.gtf") - # bed tools intersect parameter: # -wo Write the original A and B entries plus the number of base pairs of overlap between the two features. # Only A features with overlap are reported. Restricted by -f (min overlap) and -r (min overlap couts for A and B). @@ -643,50 +723,33 @@ def main(): temp_overlap_gtf = (save_dir + temp_overlap_gtf) fp_overlap_gtf = (save_dir + fp_overlap_gtf) - # get fp that are not overlaping with any gene + # get fp that are not overlapping with any gene df_temp_overlap = call_bedtools_not_overlap(tools_file, ref_pos_file, overlap_cutoff, temp_overlap_gtf, score_cutoff) # all predictions which are not overlapping with any gene for the given cutoff df_fp_overlap = call_bedtools_not_overlap(temp_overlap_gtf, ref_neg_file, overlap_cutoff, fp_overlap_gtf, score_cutoff) #print(df_fp_overlap.info) + + df_filterd = df_fp_overlap[df_fp_overlap.start_pred > df_fp_overlap.stop_pred].copy() + print('#########filterd df') + print(len(df_filterd)) + assert len(df_filterd) == 0, "error: some prediction have a greater start than end postion" # # find statistics values stat_list_ribotish, gene_ribotish_list = get_stat_for_tool(ref_pos_file, tools_file, df_pos_overlap, ref_neg_file, df_neg_overlap, df_fp_overlap, flag_subopt, flag_no_gene, 'ribotish', save_dir) stat_list_reparation, gene_reparation_list = get_stat_for_tool(ref_pos_file, tools_file, df_pos_overlap, ref_neg_file, df_neg_overlap, df_fp_overlap, flag_subopt, flag_no_gene, 'reparation', save_dir) stat_list_deepribo, gene_deepribo_list = get_stat_for_tool(ref_pos_file, tools_file, df_pos_overlap, ref_neg_file, df_neg_overlap, df_fp_overlap, flag_subopt, flag_no_gene, 'deepribo', save_dir) stat_list_irsom, gene_irsom_list = get_stat_for_tool(ref_pos_file, tools_file, df_pos_overlap, ref_neg_file, df_neg_overlap, df_fp_overlap, flag_subopt, flag_no_gene, 'irsom', save_dir) + stat_list_spectre, gene_spectre_list = get_stat_for_tool(ref_pos_file, tools_file, df_pos_overlap, ref_neg_file, df_neg_overlap, df_fp_overlap, flag_subopt, flag_no_gene, 'spectre', save_dir) + stat_list_price, gene_price_list = get_stat_for_tool(ref_pos_file, tools_file, df_pos_overlap, ref_neg_file, df_neg_overlap, df_fp_overlap, flag_subopt, flag_no_gene, 'price', save_dir) + stat_list_ribotricer, gene_ribotricer_list = get_stat_for_tool(ref_pos_file, tools_file, df_pos_overlap, ref_neg_file, df_neg_overlap, df_fp_overlap, flag_subopt, flag_no_gene, 'ribotricer', save_dir) + stat_list_smorfer, gene_smorfer_list = get_stat_for_tool(ref_pos_file, tools_file, df_pos_overlap, ref_neg_file, df_neg_overlap, df_fp_overlap, flag_subopt, flag_no_gene, 'smorfer', save_dir) + # print('\n!!!!!!!!!\ngene list deepribo:\n', gene_deepribo_list) - tims_deepribo_gen = (max([len(gene_deepribo_list), - len(gene_ribotish_list), - len(gene_reparation_list), - len(gene_irsom_list)]) - len(gene_deepribo_list)) - tims_ribotish_gen = max([len(gene_deepribo_list), - len(gene_ribotish_list), - len(gene_reparation_list), - len(gene_irsom_list)]) - len(gene_ribotish_list) - tims_reparation_gen = max([len(gene_deepribo_list), - len(gene_ribotish_list), - len(gene_reparation_list), - len(gene_irsom_list)]) - len(gene_reparation_list) - tims_irsom_gen = max([len(gene_deepribo_list), - len(gene_ribotish_list), - len(gene_reparation_list), - len(gene_irsom_list)]) - len(gene_irsom_list) - - - - venn_genes_dict = {'deepribo': list(gene_deepribo_list) + ['NA'] * - tims_deepribo_gen, - 'ribotish': list(gene_ribotish_list) + ['NA'] * - tims_ribotish_gen, - 'reparation': list(gene_reparation_list) + ['NA'] * - tims_reparation_gen, - 'irsom': list(gene_irsom_list) + ['NA']*tims_irsom_gen} - df_venn_genes = pd.DataFrame(venn_genes_dict) - df_venn_genes.to_csv(path_or_buf=save_dir+'df_venn_genes.csv', - index=False, sep='\t') - #print(save_dir) + venn_prepataions(gene_deepribo_list, gene_ribotish_list, gene_reparation_list, gene_irsom_list, gene_spectre_list, gene_price_list, gene_ribotricer_list, gene_smorfer_list, save_dir) + #venn_prepataions_bm03(gene_deepribo_list, gene_ribotish_list, gene_reparation_list, gene_irsom_list, gene_price_list, gene_ribotricer_list, save_dir) + #venn_prepataions_bm03(gene_deepribo_list, gene_ribotish_list, gene_reparation_list, gene_irsom_list, save_dir) if(flag_subopt == 1): stat_file = 'df_stat_suboptimals.csv' @@ -697,10 +760,23 @@ def main(): 'FDR', 'F1', 'accuracy', 'subopt_tp', 'subopt_fp', 'no_genes', 'tool'] df_stat = pd.DataFrame([stat_list_reparation, stat_list_ribotish, - stat_list_deepribo, stat_list_irsom], + stat_list_deepribo, stat_list_irsom, stat_list_spectre, + stat_list_price, stat_list_ribotricer, stat_list_smorfer], columns=stat_list_header) + #df_stat = pd.DataFrame([stat_list_reparation, stat_list_ribotish, + # stat_list_deepribo, stat_list_irsom, + # stat_list_price, stat_list_ribotricer], + # columns=stat_list_header) df_stat.to_csv(path_or_buf=save_dir + stat_file, index=False, sep='\t') + with open(save_dir + "/table_latex.txt", "a") as myfile: + ref_pos_list = ref_pos_file.split('/') + myfile.write('Results of species %s, subset %s and overlap %f\n'%(ref_pos_list[8],ref_pos_list[9], overlap_cutoff)) + df_stat_round = df_stat.round(2) + myfile.write(df_stat_round.to_latex(index=False)) + + + if __name__ == '__main__': main()