This repository has been archived by the owner on Jan 3, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgenerate_report.py
675 lines (589 loc) · 38.7 KB
/
generate_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
from fpdf import FPDF
import sys
import os
import re
import pandas
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import statistics as stats
import collections
import numpy as np
import seaborn as sns
import warnings
# this method will be called last so everything can be calculated
# then rearrange pages in PDF so this become page 2
def overall_main_page_stats(pdf, originalFile, cleanedFile, concordance, dupCon, sexCheck, unspecifiedSex):
    """Render the overall QC summary page (Sample, SNP and Data summaries).

    pdf: an FPDF object the page is appended to
    originalFile: PLINK file-set prefix of the raw, pre-QC data (.fam/.bim read)
    cleanedFile: PLINK file-set prefix of the post-QC data (.fam/.bim read)
    concordance: average HapMap concordance with 1000 Genomes, in percent
    dupCon: dict holding at least the key 'percent_concordance'
    sexCheck: count of samples with gender discrepancies
    unspecifiedSex: count of samples with no gender in the manifest
    """
    def _count_lines(path):
        # one .fam line per sample / one .bim line per SNP; use a context
        # manager so the handle is closed (the original leaked four handles
        # via bare open() inside generator expressions)
        with open(path) as fh:
            return sum(1 for _ in fh)

    num_samples_analyzed = _count_lines(originalFile + '.fam')
    num_samples_qc_pass = _count_lines(cleanedFile + '.fam')
    num_snps_analyzed = _count_lines(originalFile + '.bim')
    num_snps_qc_pass = _count_lines(cleanedFile + '.bim')
    pdf.add_page()
    pdf.set_margins(20, 10, 20)
    pdf.set_font('Arial', 'B', 30)
    pdf.set_x(20)
    pdf.multi_cell(0, 30, "QC Summary Statistics", 0, 1, 'L')
    pdf.line(20, 32, 190, 32)
    pdf.set_font('Arial', '', 12)
    pdf.multi_cell(0, 5, "Listed below are the basic overall summary statistics for this project \
broken down into three categories: Sample Summary, SNP Summary, and Data Summary. For more detailed \
information please refer to the subsequent PDFs and pages. They will provide you with a more granular view of the \
quality control pipeline that was performed as well as parameters and thresholds that were used. At the end \
of the PDF there is also a definitions page that lists what each metric means and how it was calculated. If you \
have any questions or concerns please contact the TICR department at the University of Colorado Anschutz Medical \
Campus at [email protected]. "+'\n\n\n\n\n', 0, 1, 'J')
    pdf.set_font('Arial', 'B', 16)
    pdf.set_fill_color(200)
    pdf.multi_cell(0, 8, "Sample Summary", 1, 'L', True)
    pdf.set_x(30)
    pdf.set_font('Arial', '', 16)
    # BUGFIX: this call previously passed (1, '1', 'L') -- a string '1' --
    # inconsistent with every sibling call that passes (1, 1, 'L')
    pdf.multi_cell(0, 8, "Total samples analyzed: "+str(num_samples_analyzed), 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Total samples passing QC: "+str(num_samples_qc_pass), 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Total samples with gender discrepancies: "+str(sexCheck), 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Total samples with missing genders in manifest: "+str(unspecifiedSex), 1, 1, 'L')
    pdf.multi_cell(0, 8, "\n\n", 0, 1, 'L')
    pdf.set_font('Arial', 'B', 16)
    pdf.multi_cell(0, 8, "SNP Summary", 1, 'L', True)
    pdf.set_font('Arial', '', 16)
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Total SNPs analyzed: "+str(num_snps_analyzed), 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Total SNPs passing QC: "+str(num_snps_qc_pass), 1, 1, 'L')
    pdf.multi_cell(0, 8, "\n\n", 0, 1, 'L')
    pdf.set_font('Arial', 'B', 16)
    pdf.multi_cell(0, 8, "Data Summary", 1, 'L', True)
    pdf.set_font('Arial', '', 16)
    pdf.set_x(30)
    # total genotypes released = passing samples x passing SNPs
    pdf.multi_cell(0, 8, "Total genotypes passing QC: "+str(int(num_samples_qc_pass) * int(num_snps_qc_pass)), 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Average HapMap concordance with 1000 Genomes: " +str(concordance)+'%', 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Percent duplicate concordance: "+str(dupCon['percent_concordance'])+'%', 1, 1, 'L')
def explanation_of_deliverables(pdf, params):
    """Render the deliverables explanation pages plus the data-storage and
    QC-analysis disclaimer page.

    pdf: an FPDF object the pages are appended to
    params: pipeline parameter dict; only 'projectName' is read here
    """
    pdf.add_page()
    pdf.set_margins(20, 10, 20)
    pdf.set_font('Arial', 'B', 24)
    pdf.set_x(20)
    pdf.multi_cell(0, 30, "Deliverables", 0, 1, 'L')
    pdf.line(20, 32, 190, 32)
    pdf.set_font('Arial', 'BI', 16)
    pdf.set_x(20)
    pdf.multi_cell(0, 10, 'Raw Plink Files', 0, 1, 'L')
    pdf.set_font('Arial', '', 12)
    pdf.set_x(25)
    pdf.multi_cell(0, 5, 'Three PLINK files are provided: .BED, .BIM and .FAM files. These files contain the RAW PRE-FILTERED \
samples and SNPs from your project. These files can be used directly with PLINK software. For more information regarding \
the PLINK format please refer to the following website https://www.cog-genomics.org/plink/1.9/'+'\n\n', 0, 1, 'J')
    pdf.set_font('Arial', 'BI', 16)
    pdf.set_x(20)
    pdf.multi_cell(0, 10, 'Cleaned Plink and VCF Files', 0, 1, 'L')
    pdf.set_font('Arial', '', 12)
    pdf.set_x(25)
    pdf.multi_cell(0, 5, 'Three PLINK files are provided: .BED, .BIM, and .FAM files. These files contain the \
samples and SNPs that have passed our QC pipeline from your project. The only samples that are removed from this data set are \
those that fail missingness thresholds. All samples failing gender mismatches and ambiguities remain in the cleaned data set despite failing our QC thresholds \
in case the investigator can resolve these issues from their manifest file. On the contrary, all SNPs that fail our QC thresholds are removed from the cleaned \
data set. All these files can be used directly with PLINK software. \
For more information regarding the PLINK format please refer to the following website \
https://www.cog-genomics.org/plink/1.9/ Additionally, we provide the same cleaned data set as a VCF file ' + '\n\n', 0, 1, 'J')
    pdf.set_font('Arial', 'BI', 16)
    pdf.set_x(20)
    pdf.multi_cell(0, 10, 'PDF Reports', 0, 1, 'L')
    pdf.set_font('Arial', '', 12)
    pdf.set_x(25)
    pdf.multi_cell(0, 5, 'Three PDF files are provided:', 0, 1, 'J')
    pdf.set_x(35)
    pdf.multi_cell(0, 5, '\n'+params['projectName']+'_final_summary_report.pdf' + '\n' + params['projectName']+'_final_detailed_report.pdf' +'\n' +
        params['projectName']+'_final_glossary_report.pdf'+'\n\n', 0, 1, 'L')
    pdf.set_x(25)
    # typo fixes in the rendered text below: computer, different, subsequent,
    # Although, information, analysis, thresholds, publicly
    pdf.multi_cell(0, 5, ' The '+params['projectName']+'_final_summary_report.pdf is this current PDF which contains information \
regarding the explanation of the deliverables and an overall final QC summary page. This is meant to provide the investigator \
with a quick overview of the final number of samples, snps, and genotypes that were analyzed and that passed our QC pipeline. '+'\n' +
        ' The ' + params['projectName']+'_final_detailed_report.pdf contains all of the fine details of the samples and snps that pass \
the recommended Illumina sample and SNP QC, as well as SNPs that pass call rate thresholds and samples sex concordance. The \
information and statistics reported are collected sequentially as the data is pushed through the QC pipeline.' +'\n' + ' Finally, \
the '+ params['projectName']+'_final_glossary_report.pdf provides the investigator with an explanation of the parameters and \
thresholds that were used in the pipeline. We have also provided two detailed images of our QC pipeline work flow. All the details \
of how parameters and thresholds are calculated are provided in this document. Additionally, the names of the pipeline parameter names \
are provided in the event that the investigator would like to download and run our QC pipeline on their own computer or server under \
different parameters and thresholds.'+'\n\n', 0, 1, 'J')
    pdf.set_font('Arial', 'BI', 16)
    pdf.set_x(20)
    pdf.multi_cell(0, 10, 'Text Files', 0, 1, 'L')
    pdf.set_font('Arial', '', 12)
    pdf.set_x(25)
    pdf.multi_cell(0, 5, 'Nine text files are provided:', 0, 1, 'J')
    pdf.set_x(35)
    # BUGFIX: a '\n' was missing between trio_reports.txt and
    # individual_concordance_reports.txt, fusing two of the nine listed files
    # onto one rendered line.
    # NOTE(review): the list says 'md5_check_sum.txt' but the paragraph below
    # says 'md5_check_sums.txt' -- confirm the real deliverable filename.
    pdf.multi_cell(0, 5, '\n'+'snps_failing_QC_details.txt' + '\n' + 'samples_failing_callrate_QC_details.txt' +'\n' + 'GenomeStudio_samples_table.txt' +
        '\n' + 'GenomeStudio_SNPs_table.txt' + '\n' + 'GenomeStudio_final_report.txt' + '\n' + 'trio_reports.txt' + '\n' + 'individual_concordance_reports.txt' +
        '\n' + 'final_report_statistics_per_sample.txt'+ '\n' + 'md5_check_sum.txt' + '\n\n', 0, 1, 'L')
    pdf.set_x(25)
    pdf.multi_cell(0, 5, ' The snps_failing_QC_details.txt is a tab-delimited file that contains all the SNPs that were removed due to failing at least \
one QC check. Each line in the first column represents a single SNP name. Any subsequent columns in the line are reasons why the SNP failed. \
There may be more than one reason and each reason is a new column. You will notice a number followed by the reasoning; this number represents \
the value that the particular SNP was calculated for that parameter.' +'\n' + ' The samples_failing_QC_details.txt is formatted similarly to the snps_failing_QC_details.txt \
files, except there is an additional column, which is the first column of the files that contains the family ID followed by the second column which \
contains the sample ID.' +'\n' + ' Although these samples fail QC, only those failing the missingness threshold are actually removed from the analysis. \
The GenomeStudio text files are files that contain some information that we use in the initial step of our QC pipeline \
regarding your samples and SNPs. For more information on what the columns mean please refer to the glossary report PDF.' +'\n' + ' We also provide a \
separate file called trio_reports.txt that gives information on how well the trios performed against 1000 genomes as well as the number of Mendel errors \
in each family and the concordance rate with 1000 genomes. For trios in which an individual has failed QC, we attempt to run a duo analysis assuming a parent \
and child still remain. If this is not the case, we list it in the trio_reports.txt as ERROR: NOT FULL DUO OR TRIO. The final_report_statistics_per_sample.txt \
provides general statistics of the log R ratio and B allele frequencies at the per sample level of samples that pass QC. Finally, we provide a md5_check_sums.txt file \
that lists the md5 check sums of all the generated files to ensure files are not altered or corrupted during the transfer process.'+'\n\n', 0, 1, 'J')
    # second page: storage retention policy and re-analysis disclaimer
    pdf.add_page()
    pdf.set_margins(20, 10, 20)
    pdf.set_font('Arial', 'B', 24)
    pdf.set_x(20)
    pdf.multi_cell(0, 30, "Data Storage and Analysis Disclaimer", 0, 1, 'L')
    pdf.line(20, 32, 190, 32)
    pdf.set_font('Arial', 'BI', 16)
    pdf.set_x(20)
    pdf.multi_cell(0, 10, 'Data Storage', 0, 1, 'L')
    pdf.set_font('Arial', '', 12)
    pdf.set_x(25)
    pdf.multi_cell(0, 5, 'Due to the nature of the size of these data sets, we can only keep them on our FTP servers for a limited time before we \
archive them to save on space. With that in mind, we would like you to please take a look at the deliverables and files immediately in \
the event you need access to other files or need something to be re-run. You will have 3 WEEKS from the date you received the deliverables \
to download the data and to email us with any questions or concerns or to download data that was not provided to you in the \
zipped files. After this 3 week window we will archive all the data and place it on a storage server for 9 months before deleting it from our system. \
Please be aware, if you request access to the archived data you will be charged for download time.'+'\n\n', 0, 1, 'L')
    pdf.set_font('Arial', 'BI', 16)
    pdf.set_x(20)
    pdf.multi_cell(0, 10, 'QC Analysis', 0, 1, 'L')
    pdf.set_font('Arial', '', 12)
    pdf.set_x(25)
    pdf.multi_cell(0, 5, 'The default parameters set on this pipeline have been tested and optimized for studies that have a minimum of 500 samples. \
If your data has fewer samples than this we cannot guarantee the parameters are the most optimal. It is possible that some of the threshold set \
may need to be less or more stringent. It is up to you to notify us within 3 WEEKS of receiving the reports if you would like to re-run the pipeline under \
different thresholds. The pipeline is also publicly available and is hosted on the following URL: https://github.com/tbrunetti/GWAS_QC_pipeline if you choose to run the pipeline yourself. Please \
be aware if you ask us to re-run the pipeline on our server after the 3 week mark, you will be charged for additional time.'+'\n\n\n\n\n', 0, 1, 'L')
    pdf.multi_cell(0, 5, 'Thank you for your understanding and please feel free to contact us with any questions or concerns.', 0, 1, 'L')
def thresholds_and_parameters(pdf, params):
    """Render a page listing every tunable pipeline parameter and its value.

    pdf: an FPDF object the page is appended to
    params: dict of pipeline arguments; bookkeeping keys (paths, project
        name, table handles) are skipped since they are not QC thresholds
    """
    # keys that are project bookkeeping rather than tunable thresholds
    internal_keys = frozenset([
        'sampleTable', 'snpTable', 'projectName', 'config',
        'inputPLINK', 'outDir', 'arrayType', 'finalReport',
    ])
    pdf.add_page()
    pdf.set_margins(20, 10, 20)
    pdf.set_font('Arial', 'B', 24)
    pdf.set_x(20)
    pdf.multi_cell(0, 30, "Parameters and Thresholds", 0, 1, 'L')
    pdf.line(20, 32, 190, 32)
    pdf.set_font('Arial', '', 16)
    for name in params:
        if name in internal_keys:
            continue
        pdf.multi_cell(0, 8, str(name) + ': ' + str(params[name]), 0, 1, 'L')
def illumina_sample_overview(inputFile, fam, pdf, callrate, outDir, cleanup):
    """Render the Illumina per-sample QC page and collect failing samples.

    inputFile: GenomeStudio sample table (tab-delimited; needs columns
        'Sample ID', 'Call Rate', 'p10 GC')
    fam: path to the PLINK .fam file (maps IID -> FID)
    pdf: an FPDF object the page is appended to
    callrate: minimum acceptable call rate (default 0.991 upstream)
    outDir: output directory for text files and plot images
    cleanup: list of file paths slated for deletion after the final PDF

    Returns (sample_qc_table, samples_to_remove_text, reason_samples_fail,
    chips_fail_missingness_check, cleanup). The two file objects are
    returned open (flushed) for further appending by callers.
    """
    warnings.simplefilter(action = "ignore", category = FutureWarning)
    # print() form is valid on Python 2 and 3; the file previously mixed
    # 'print "..."' and 'print("...")' styles
    print("Running Illumina Sample QC...")
    samples_to_remove_text = open(os.path.join(outDir, 'samples_to_remove.txt'), 'w')
    pdf.add_page()
    pdf.set_margins(20, 10, 20)
    pdf.set_font('Arial', 'B', 24)
    sample_qc_table = pandas.read_table(inputFile)
    total_samples = len(list(sample_qc_table['Sample ID']))
    # sample ids whose missing call rate is below the user threshold
    samples_to_remove = list(sample_qc_table[sample_qc_table['Call Rate'] < callrate]['Sample ID'])
    # nan-aware aggregates so missing call rates do not poison the summary
    basic_call_stats = [np.nanmedian(sample_qc_table['Call Rate']), np.nanmean(sample_qc_table['Call Rate']), np.nanstd(sample_qc_table['Call Rate']), np.nanmin(sample_qc_table['Call Rate']), np.nanmax(sample_qc_table['Call Rate'])]
    pdf.set_x(20)
    pdf.multi_cell(0, 30, "Illumina Sample Quality Assessment", 0, 1, 'L')
    pdf.line(20, 32, 190, 32)
    pdf.set_fill_color(200)
    pdf.set_font('Arial', 'B', 16)
    pdf.multi_cell(0, 8, "Total samples analyzed: "+str(total_samples), 1, 'L', True)
    pdf.set_fill_color(200)
    pdf.set_font('Arial', 'B', 16)
    pdf.multi_cell(0, 8, "Number of samples passing missing call rate threshold: "+str(total_samples-len(samples_to_remove)), 1, 'L', True)
    pdf.set_font('Arial', '', 16)
    pdf.set_x(30)
    pdf.multi_cell(0, 8, 'Median call rate: '+str("%.2f" % round(basic_call_stats[0]*100, 2)) + '%', 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Mean call rate: "+ str("%.2f" % round(basic_call_stats[1]*100, 2)) + '%', 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Standard deviation call rate: "+ str("%.2f" % round(basic_call_stats[2]*100, 2)) + '%', 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Minimum call rate: "+ str("%.2f" % round(basic_call_stats[3]*100, 2)) + '%', 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 8, "Maximum call rate: "+ str("%.2f" % round(basic_call_stats[4]*100, 2)) + '%', 1, 1, 'L')
    # batch and chip number of each sample failing missingness:
    # {batch ID: [chip numbers]}
    chips_fail_missingness_check = {}
    famfile = pandas.read_table(fam, delim_whitespace=True, names=['FID', 'IID', 'PAT', 'MAT', 'SEX', 'AFF'])
    reason_samples_fail = open(os.path.join(outDir, 'samples_failing_callrate_QC_details.txt'), 'w')
    # "samples_to_remove.txt": FID<TAB>IID per line, the format plink expects
    temp_remove = {x:list(famfile.loc[famfile['IID'] == x]['FID']) for x in samples_to_remove}
    # hoisted + compiled once: the same pattern was previously re-searched
    # twice per sample inside the loop.
    # assumes IIDs look like '<batch>-DNA_<well>...' -- TODO confirm format
    batch_pattern = re.compile('([A-Z]*[a-z]*[0-9]*)-DNA_([A-Z]{1}[0-9]{2}).*')
    for key in temp_remove:
        samples_to_remove_text.write(str(temp_remove[key][0]) + '\t' + str(key) + '\n')
        reason_samples_fail.write(str(temp_remove[key][0]) + '\t' + str(key) + '\t' + str('failed Illumina Sample Missingness: ') + str(list(sample_qc_table[sample_qc_table['Sample ID'] == key]['Call Rate'])[0]) + '\n')
        batch_id = batch_pattern.search(key)
        if batch_id:
            # well ID minus its leading row letter gives the chip column.
            # setdefault(...).append replaces the old try/except TypeError,
            # whose except branch was unreachable (setdefault().extend never
            # raises TypeError) and would have appended a list, not a string
            chips_fail_missingness_check.setdefault(batch_id.group(1), []).append(str(batch_id.group(2))[1:])
    samples_to_remove_text.flush()  # flushes out buffer
    reason_samples_fail.flush()

    def check_GC_callrate(sampleInfo, cleanup):
        # scatter + regression of p10 GC vs call rate, embedded into the pdf;
        # relies on enclosing-scope pdf and outDir
        warnings.simplefilter(action = "ignore", category = FutureWarning)
        sample_quality_graph = sns.jointplot('Call Rate','p10 GC', data=sampleInfo, kind="reg")
        plt.suptitle('Overall Sample Quality')
        plt.tight_layout(pad=2, w_pad=2, h_pad=2)
        plt.savefig(os.path.join(outDir,'sample_gccallrate.png'))
        plt.close()
        pdf.image(os.path.join(outDir, "sample_gccallrate.png"), x=20, y=120, w=170)
        # queue image for deletion after the final PDF has been generated
        cleanup.append(os.path.join(outDir, "sample_gccallrate.png"))

    check_GC_callrate(sampleInfo=sample_qc_table, cleanup=cleanup)
    return sample_qc_table, samples_to_remove_text, reason_samples_fail, chips_fail_missingness_check, cleanup
def graph_sexcheck(pdf, warning_samples, sexcheck, maxF, minF, outDir, cleanup):
    '''
    Render the sex-concordance page: three ranked-F scatter plots (pedigree
    sex, imputed sex, discrepancy status) plus discrepancy counts, and append
    per-sample discrepancy details to warning_samples.

    pdf: a FPDF object (separate instance of FPDF object in function batch_effects())
    warning_samples: a new open() file object to write samples with gender discrepancies
    sexcheck: pipeline_args['inputPLINK'][:-4]+'_passing_QC.sexcheck
    maxF: pipeline_args['maxFemale']; float
    minF: pipeline_args['minMale']; float
    outDir: output directory
    cleanup: a list that contains paths to files that can be removed

    Returns (sex_outliers file object, mismatch count as str,
    unknown-sex count as str, cleanup). NOTE(review): sex_outliers is
    returned after its `with` block exits, i.e. closed -- callers can use
    .name but not write to it; confirm that is intended.
    '''
    warnings.simplefilter(action = "ignore", category = FutureWarning)
    print("checking sex concordance")
    pdf.add_page()
    pdf.set_font('Arial', 'B', 24)
    pdf.set_margins(20, 10, 20)
    pdf.set_x(20)
    pdf.multi_cell(0, 30, "Overall Sex Concordance Check", 0, 1, 'L')
    pdf.line(20, 32, 190, 32)
    sex_check_dataframe = pandas.read_table(sexcheck, delim_whitespace=True)
    # rank samples by ascending F so the scatter plots read left-to-right
    sorted_sex_check_dataframe = sex_check_dataframe.sort_values(['F'], ascending=True)
    sorted_sex_check_dataframe['rank'] = list(range(1, len(list(sorted_sex_check_dataframe['FID']))+1))
    # plot 1: F colored by pedigree (manifest) sex; dashed lines mark the
    # female-max and male-min F thresholds
    sample_sex = sns.lmplot(x='rank', y='F', hue='PEDSEX', data=sorted_sex_check_dataframe, fit_reg=False, palette={0:'black', 1:'pink', 2:'blue'}, scatter_kws={"s": 20})
    plt.axhline(y=float(maxF), linestyle="--")
    plt.axhline(y=float(minF), linestyle="--")
    plt.suptitle('Sex and F coefficient based on pedigree sex data')
    sample_sex.set(xlabel='ranked samples', ylabel='F inbreeding coefficient')
    plt.tight_layout(pad=2, w_pad=2, h_pad=2)
    plt.savefig(os.path.join(outDir, 'sample_sex.png'), bbox_inches='tight')
    plt.close()
    pdf.image(os.path.join(outDir, "sample_sex.png"), x=20, y=105, w=79, h=85)
    cleanup.append(os.path.join(outDir, "sample_sex.png")) # puts image in line for deletion; happens after final PDF has been generated
    # plot 2: F colored by genotype-imputed sex
    imputed_sex = sns.lmplot(x='rank', y='F', hue='SNPSEX', data=sorted_sex_check_dataframe, fit_reg=False, palette={0:'black', 1:'pink', 2:'blue'}, scatter_kws={"s": 20})
    plt.axhline(y=float(maxF), linestyle="--")
    plt.axhline(y=float(minF), linestyle="--")
    plt.suptitle('Sex and F coefficient based on imputed sex data')
    # typo fix in rendered label: was 'F inbreeing coefficient'
    imputed_sex.set(xlabel='ranked samples', ylabel='F inbreeding coefficient')
    plt.tight_layout(pad=2, w_pad=2, h_pad=2)
    plt.savefig(os.path.join(outDir, 'imputed_sex.png'), bbox_inches='tight')
    plt.close()
    pdf.image(os.path.join(outDir, "imputed_sex.png"), x=110, y=105, w=79, h=85)
    cleanup.append(os.path.join(outDir, "imputed_sex.png")) # puts image in line for deletion; happens after final PDF has been generated
    # plot 3: F colored by plink's OK/PROBLEM concordance status
    discrepancies_bw_imputed_and_collected = sns.lmplot(x='rank', y='F', hue='STATUS', data=sorted_sex_check_dataframe, fit_reg=False, palette={'OK':'black', 'PROBLEM':'red'}, scatter_kws={"s": 20})
    plt.axhline(y=float(maxF), linestyle="--")
    plt.axhline(y=float(minF), linestyle="--")
    plt.suptitle('Discrepancies between imputed and pedigree data')
    plt.subplots_adjust(top=.9)
    discrepancies_bw_imputed_and_collected.set(xlabel='ranked samples', ylabel='F inbreeding coefficient')
    plt.tight_layout(pad=2, w_pad=2, h_pad=2)
    plt.savefig(os.path.join(outDir, 'discrepancies_sex.png'), bbox_inches='tight')
    plt.close()
    pdf.image(os.path.join(outDir, "discrepancies_sex.png"), x=20, y=210, w=79, h=85)
    cleanup.append(os.path.join(outDir, "discrepancies_sex.png")) # puts image in line for deletion; happens after final PDF has been generated
    # classify PROBLEM samples: human/manifest error vs sample-quality error
    problem_calls_only = sorted_sex_check_dataframe.loc[sorted_sex_check_dataframe['STATUS'].isin(['PROBLEM'])]
    gender_not_in_manifest = problem_calls_only.loc[problem_calls_only['PEDSEX'] == 0]
    concordant_calls = sorted_sex_check_dataframe.loc[sorted_sex_check_dataframe['STATUS'].isin(['OK'])]
    # NOTE(review): with '|' these two selections overlap heavily (for the
    # usual maxF < minF, indeterminate_sex matches essentially every PROBLEM
    # sample); the downstream len()/set() subtractions appear to rely on
    # that to derive the ambiguous count. Confirm '&' was not intended
    # before changing. Also note maxF/minF are compared here without the
    # float() conversion used for the plot thresholds above -- assumes they
    # arrive numeric; TODO confirm.
    fixed_sex = list(problem_calls_only[(problem_calls_only['F'] >= minF) | (problem_calls_only['F'] <= maxF)]['IID'])
    indeterminate_sex = list(problem_calls_only[(problem_calls_only['F'] < minF) | (problem_calls_only['F'] > maxF)]['IID'])
    unknown_sex = list(gender_not_in_manifest['IID'])
    pdf.set_font('Arial', 'B', 16)
    pdf.set_fill_color(200)
    pdf.multi_cell(0, 10, 'Total Number of Concordant Samples: ' + str(len(concordant_calls.index)), 1, 'L', True)
    pdf.multi_cell(0, 10, 'Total Number of Discrepancies: '+str((len(fixed_sex)-len(unknown_sex))+(len(indeterminate_sex)-len(fixed_sex))), 1, 'L', True)
    pdf.set_font('Arial', '', 16)
    pdf.set_x(30)
    pdf.multi_cell(0, 10, '# of outlier samples with clear gender mismatches: '+str(len(fixed_sex)-len(unknown_sex)), 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 10, '# of ambiguous samples: '+str(len(indeterminate_sex)-len(fixed_sex)), 1, 1, 'L')
    pdf.set_x(30)
    pdf.multi_cell(0, 10, '# of gender not specified samples: '+str(len(unknown_sex)), 1, 1, 'L')
    ambiguous_samples = list(set(indeterminate_sex) - set(fixed_sex)) # ambiguous sample IDs only
    # append one detail line per discrepant sample: FID, IID, reason + F value
    with open(warning_samples.name, 'a+') as sex_outliers:
        for sample in list(set(fixed_sex)-set(unknown_sex)):
            sex_outliers.write(str(list(sorted_sex_check_dataframe[sorted_sex_check_dataframe['IID'] == sample]['FID'])[0]) + '\t' +str(sample) +'\t' +
                'gender_mismatch: '+str(list(sorted_sex_check_dataframe[sorted_sex_check_dataframe['IID'] == sample]['F'])[0]) +'\n')
        for ambiguous in ambiguous_samples :
            sex_outliers.write(str(list(sorted_sex_check_dataframe[sorted_sex_check_dataframe['IID'] == ambiguous]['FID'])[0]) + '\t' +str(ambiguous) +'\t' +
                'ambiguous_gender: '+str(list(sorted_sex_check_dataframe[sorted_sex_check_dataframe['IID'] == ambiguous]['F'])[0]) +'\n')
        for unknowns in unknown_sex:
            sex_outliers.write(str(list(sorted_sex_check_dataframe[sorted_sex_check_dataframe['IID'] == unknowns]['FID'])[0]) + '\t' +str(unknowns) +'\t' +
                'missing_gender_in_manifest: '+str(list(sorted_sex_check_dataframe[sorted_sex_check_dataframe['IID'] == unknowns]['F'])[0]) +'\n')
        sex_outliers.flush()
    return sex_outliers, str(len(fixed_sex)-len(unknown_sex)), str(len(unknown_sex)), cleanup
def _tally_problem_wells(problem_wells, all_rows, all_columns):
	'''
	Count sex-discrepancy wells per plate row letter and per chip (column) number.

	problem_wells: list of well IDs, e.g. 'A01' (row letter followed by 2-digit chip number)
	all_rows: every plate row letter ('A'..'H')
	all_columns: every chip number ('01'..'12')

	Returns (problem_rows, problem_columns): collections.Counter objects, zero-filled
	for rows/chips with no discrepancies so the bar charts always show the full plate.
	'''
	problem_rows = collections.Counter(well[0] for well in problem_wells)
	for row in all_rows:
		if row not in problem_rows:
			problem_rows[row] = 0
	problem_columns = collections.Counter(well[1:] for well in problem_wells)
	for col in all_columns:
		if col not in problem_columns:
			problem_columns[col] = 0
	return problem_rows, problem_columns

def batch_effects(pdf, chipFail, sexcheck, missingness, chip_missingness_fails, maxF, minF, outDir, cleanup):
	'''
	Build per-batch sex-concordance/call-rate PDF pages and an overall batch summary.

	pdf: a FPDF object (separate instance of FPDF object in function graph_sexcheck());
		one page of sex statistics is appended per batch
	chipFail: pipeline_args['chipFailure']; integer of max number of failures to consider chip a fail
	sexcheck: pipeline_args['inputPLINK'][:-4]+'_passing_QC.sexcheck
	missingness: pipeline_args['inputPLINK'][:-4]+'_passing_QC.imiss
	chip_missingness_fails: dict mapping a key to a list of chip IDs of samples failing
		missingness (iterated as a dict below; NOTE(review): an earlier docstring called
		this a text file, but the code treats it as a dict)
	maxF: pipeline_args['maxFemale']; float (upper F threshold line drawn for females)
	minF: pipeline_args['minMale']; float (lower F threshold line drawn for males)
	outDir: output directory for intermediate PNG/text files
	cleanup: list accumulating paths of files to delete after the final PDF is generated

	Returns (cleanup, failing_chip_IDs, batch_summary) where failing_chip_IDs maps
	batch ID -> list of chip numbers with more than chipFail sex discrepancies and
	batch_summary is a new FPDF document of overall batch statistics.

	Fixes relative to the previous revision:
	  * removed a stray quote from the "Percent Sex Concordance in Batch: 0.0%" text
	  * batches with no sex discrepancies now plot zero-filled row/chip bar charts
		instead of raising NameError (first batch) or reusing the previous batch's counts
	'''
	warnings.simplefilter(action="ignore", category=FutureWarning)
	batch_summary = FPDF()
	# title page of the batch-level summary document
	batch_summary.add_page()
	batch_summary.set_font('Arial', 'B', 30)
	batch_summary.set_margins(20, 10, 20)
	batch_summary.set_x(20)
	batch_summary.multi_cell(0, 30, "Batch Statistics", 0, 1, 'L')
	batch_summary.line(20, 32, 190, 32)
	# sexcheck and imiss results from plink
	sex_check_dataframe = pandas.read_table(sexcheck, delim_whitespace=True)
	sample_missingness_dataframe = pandas.read_table(missingness, delim_whitespace=True, dtype=str)
	sample_missingness_dataframe.to_csv(os.path.join(outDir, "sampleMissingness.txt"), index=False, sep="\t")
	# group per-sample records by batch; IIDs look like '<batchID>-DNA_<rowLetter><chipNum>...'
	iid_format = re.compile('([A-Z]*[a-z]*[0-9]*)-DNA_([A-Z]{1}[0-9]{2}).*')
	batch_sex = {}
	batch_missing = {}
	for sample_iid in list(sex_check_dataframe['IID']):
		iid_match = iid_format.search(sample_iid)
		if iid_match:
			print(iid_match.group(0))  # full IID
			print(iid_match.group(1))  # batch ID (key, e.g. WG1, WG2, ...)
			print(iid_match.group(2))  # well ID (row letter + chip number)
			sex_row = sex_check_dataframe[sex_check_dataframe['IID'] == sample_iid]
			sex_record = (list(sex_row['STATUS']) + list(sex_row['PEDSEX']) +
				list(sex_row['SNPSEX']) + list(sex_row['F']) + [iid_match.group(2)])
			miss_row = sample_missingness_dataframe[sample_missingness_dataframe['IID'] == sample_iid]
			batch_sex.setdefault(iid_match.group(1), []).append(sex_record)
			batch_missing.setdefault(iid_match.group(1), []).append(list(miss_row['F_MISS']) + [iid_match.group(2)])
		else:
			print('BATCH ID [' + str(sample_iid) + '] not formatted properly!')
	batch_summary.set_font('Arial', 'B', 16)
	batch_summary.set_fill_color(200)
	batch_summary.multi_cell(0, 10, 'Total Number of Batches: ' + str(len(batch_sex)), 1, 'L', True)
	# flatten missingness records to one [batch, ratio, wellID] row per sample for seaborn
	all_batch_callrate = []
	for batch_name in batch_missing:
		for record in batch_missing[batch_name]:
			all_batch_callrate.append([str(batch_name), str(record[0]), str(record[1])])
	missing_call_dataframe = pandas.DataFrame(all_batch_callrate, columns=['batch', 'missing call rate (ratio)', 'wellID'], dtype=str)
	missing_call_dataframe['missing call rate (%)'] = missing_call_dataframe['missing call rate (ratio)'].astype(float) * 100
	missing_call_dataframe.dropna(axis=0, how="any", subset=['missing call rate (%)'], inplace=True)
	missing_call_dataframe['call rate (%)'] = (100 - missing_call_dataframe['missing call rate (%)'].astype(float))
	missing_call_dataframe.to_csv(os.path.join(outDir, "sampleMissingness_update.txt"), index=False, sep="\t")
	# boxplot with per-sample points overlaid: call rate per batch
	sns.boxplot(x='call rate (%)', y='batch', data=missing_call_dataframe, color=".8")
	sns.stripplot(x='call rate (%)', y='batch', data=missing_call_dataframe, jitter=True)
	plt.suptitle('Overall call rate per sample across batches')
	plt.tight_layout(pad=2, w_pad=2, h_pad=2)
	plt.savefig(os.path.join(outDir, 'call_rate_samples.png'))
	plt.close()
	batch_summary.image(os.path.join(outDir, 'call_rate_samples.png'), x=10, y=140, w=190, h=150)
	cleanup.append(os.path.join(outDir, 'call_rate_samples.png'))  # deleted after final PDF is generated
	# per-batch mean sample call rate statistics
	batch_call_averages = []
	batch_call_averages_paired = {}
	for batch_name in batch_missing:
		batch_subset = missing_call_dataframe.loc[missing_call_dataframe['batch'].isin([batch_name])]
		batch_call_averages.append(batch_subset['call rate (%)'].mean())
		batch_call_averages_paired[batch_name] = batch_subset['call rate (%)'].mean()
	# chip statistics accumulated across all batches
	total_chips = 0
	total_chips_fail = 0  # a chip fails when it has more than chipFail sex discrepancies
	failing_chip_IDs = {}
	all_rows = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
	all_columns = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
	width = 1  # bar width for the row/chip frequency charts
	# one PDF page of sex-based statistics and plots per batch
	for key in batch_sex:
		pdf.add_page()
		pdf.set_font('Arial', 'B', 18)
		pdf.set_margins(20, 10, 20)
		pdf.multi_cell(0, 30, "Sample Statistics for Batch ID: " + str(key), 0, 1, 'L')
		pdf.line(20, 30, 190, 30)
		batch_dataframe = pandas.DataFrame(batch_sex[key], columns=['Discrepancies', 'PEDSEX', 'SNPSEX', 'F', 'well'])
		sorted_sex_batch_dataframe = batch_dataframe.sort_values(['F'], ascending=True)
		sorted_sex_batch_dataframe['rank'] = list(range(1, len(list(sorted_sex_batch_dataframe['well'])) + 1))
		# PEDSEX == 0: sex missing from the manifest; excluded from discrepancy counts below
		total_unknown_sex_ped = len(batch_dataframe.loc[batch_dataframe['PEDSEX'] == 0])
		# F coefficient of ranked samples, colored by pedigree sex
		sample_sex = sns.lmplot(x='rank', y='F', hue='PEDSEX', data=sorted_sex_batch_dataframe, fit_reg=False, palette={0: 'black', 1: 'pink', 2: 'blue'}, scatter_kws={"s": 20})
		plt.axhline(y=float(maxF), linestyle="--")
		plt.axhline(y=float(minF), linestyle="--")
		plt.suptitle('Sex and F coefficient based on pedigree sex data')
		sample_sex.set(xlabel='ranked samples', ylabel='F inbreeding coefficient')
		plt.tight_layout(pad=2, w_pad=2, h_pad=2)
		plt.savefig(os.path.join(outDir, 'sample_sex' + str(key) + '.png'), bbox_inches='tight')
		plt.close()
		pdf.image(os.path.join(outDir, "sample_sex" + str(key) + '.png'), x=20, y=85, w=79, h=85)
		cleanup.append(os.path.join(outDir, "sample_sex" + str(key) + '.png'))  # deleted after final PDF is generated
		# F coefficient of ranked samples, colored by imputed (SNP-based) sex
		imputed_sex = sns.lmplot(x='rank', y='F', hue='SNPSEX', data=sorted_sex_batch_dataframe, fit_reg=False, palette={0: 'black', 1: 'pink', 2: 'blue'}, scatter_kws={"s": 20})
		plt.axhline(y=float(maxF), linestyle="--")
		plt.axhline(y=float(minF), linestyle="--")
		plt.suptitle('Sex and F coefficient based on imputed sex data')
		imputed_sex.set(xlabel='ranked samples', ylabel='F inbreeding coefficient')
		plt.tight_layout(pad=2, w_pad=2, h_pad=2)
		plt.savefig(os.path.join(outDir, 'imputed_sex' + str(key) + '.png'), bbox_inches='tight')
		plt.close()
		pdf.image(os.path.join(outDir, "imputed_sex" + str(key) + '.png'), x=110, y=85, w=79, h=85)
		cleanup.append(os.path.join(outDir, "imputed_sex" + str(key) + '.png'))  # deleted after final PDF is generated
		# F coefficient of ranked samples, colored by pedigree/imputed discrepancy status
		discrepancies_sex = sns.lmplot(x='rank', y='F', hue='Discrepancies', data=sorted_sex_batch_dataframe, fit_reg=False, palette={"OK": 'black', "PROBLEM": 'red'}, scatter_kws={"s": 20})
		plt.axhline(y=float(maxF), linestyle="--")
		plt.axhline(y=float(minF), linestyle="--")
		plt.suptitle('Discrepancies between imputed and pedigree data')
		discrepancies_sex.set(xlabel='ranked samples', ylabel='F inbreeding coefficient')
		plt.tight_layout(pad=2, w_pad=2, h_pad=2)
		plt.savefig(os.path.join(outDir, 'discrepancies_sex' + str(key) + '.png'), bbox_inches='tight')
		plt.close()
		pdf.image(os.path.join(outDir, "discrepancies_sex" + str(key) + '.png'), x=20, y=190, w=79, h=85)
		cleanup.append(os.path.join(outDir, "discrepancies_sex" + str(key) + '.png'))  # deleted after final PDF is generated
		# every distinct chip number seen in this batch counts toward the chip total
		total_chips = total_chips + len(set(well[1:] for well in list(sorted_sex_batch_dataframe['well'])))
		contradictions = sorted_sex_batch_dataframe['Discrepancies'].value_counts()
		contradictions_headers = contradictions.index.tolist()
		pdf.set_font('Arial', 'B', 14)
		pdf.multi_cell(0, 8, "Total Samples in Batch: " + str(len(batch_sex[key])), 0, 1, 'L')
		if 'PROBLEM' not in contradictions_headers:
			# fully concordant batch: no problem wells to tally
			pdf.multi_cell(0, 8, "Percent Sex Concordance in Batch: 100.0%", 0, 1, 'L')
			pdf.multi_cell(0, 8, "Total Samples with Sex Discrepancies: 0", 0, 1, 'L')
			problem_wells = []
		elif 'OK' not in contradictions_headers:
			# fully discordant batch (BUGFIX: removed stray quote before 0.0%)
			pdf.multi_cell(0, 8, "Percent Sex Concordance in Batch: 0.0%", 0, 1, 'L')
			pdf.multi_cell(0, 8, "Total Samples with Sex Discrepancies: " + str(contradictions['PROBLEM']), 0, 1, 'L')
			problem_wells = list(sorted_sex_batch_dataframe[sorted_sex_batch_dataframe['Discrepancies'] == 'PROBLEM']['well'])
			pdf.multi_cell(0, 8, "Wells with Sex Discrepancies: " + ', '.join(problem_wells))
		else:
			# mixed batch; discrepancy count excludes samples with unknown pedigree sex
			pdf.multi_cell(0, 8, "Percent Sex Concordance in Batch: " + str("%.2f" % round((float(contradictions['OK']) / float(len(batch_sex[key]))) * 100, 2)) + '%', 0, 1, 'L')
			pdf.multi_cell(0, 8, "Total Samples with Sex Discrepancies: " + str(contradictions['PROBLEM'] - total_unknown_sex_ped), 0, 1, 'L')
			problem_wells = list(sorted_sex_batch_dataframe.loc[(sorted_sex_batch_dataframe['Discrepancies'] == 'PROBLEM') & (sorted_sex_batch_dataframe['PEDSEX'] != 0)]['well'])
			pdf.multi_cell(0, 8, "Wells with Sex Discrepancies: " + ', '.join(problem_wells))
		problem_rows, problem_columns = _tally_problem_wells(problem_wells, all_rows, all_columns)
		# chips with more than chipFail discrepancies fail; most_common() sorts descending,
		# so filtering on count > chipFail matches the original pop-while-above-threshold loop
		new_chip_fails = [str(chip) for chip, count in problem_columns.most_common() if count > chipFail]
		total_chips_fail = total_chips_fail + len(new_chip_fails)
		if new_chip_fails:
			failing_chip_IDs[key] = failing_chip_IDs.get(key, []) + new_chip_fails
		# bar charts showing where on the plate the discrepancies cluster
		plt.bar(np.arange(len(problem_rows.keys())), problem_rows.values(), width, edgecolor='black', align='center')
		plt.xticks(np.arange(len(problem_rows.keys())), problem_rows.keys(), fontweight='bold')
		plt.title('Distribution of problematic rows', fontweight='bold')
		plt.xlabel("plate row ID", fontweight='bold')
		plt.ylabel("Frequency", fontweight='bold')
		plt.tight_layout(pad=2, w_pad=2, h_pad=2)
		plt.savefig(os.path.join(outDir, 'problem_rows' + str(key) + '.png'), bbox_inches='tight')
		plt.close()
		pdf.image(os.path.join(outDir, "problem_rows" + str(key) + '.png'), x=110, y=190, w=79, h=42)
		plt.bar(np.arange(len(problem_columns.keys())), problem_columns.values(), width, edgecolor='black', align='center')
		plt.xticks(np.arange(len(problem_columns.keys())), problem_columns.keys(), fontweight='bold')
		plt.title('Distribution of problematic chips', fontweight='bold')
		plt.xlabel("chip number", fontweight='bold')
		plt.ylabel("Frequency", fontweight='bold')
		plt.tight_layout(pad=2, w_pad=2, h_pad=2)
		plt.savefig(os.path.join(outDir, 'problem_chips' + str(key) + '.png'), bbox_inches='tight')
		plt.close()
		pdf.image(os.path.join(outDir, "problem_chips" + str(key) + '.png'), x=110, y=230, w=79, h=42)
		cleanup.append(os.path.join(outDir, 'problem_rows' + str(key) + '.png'))  # deleted after final PDF is generated
		cleanup.append(os.path.join(outDir, "problem_chips" + str(key) + '.png'))  # deleted after final PDF is generated
	# chips failing the sample-missingness threshold, counted the same way as sex fails
	chip_fails_from_missingness = 0
	for fail_list in chip_missingness_fails.values():
		for chip, num_occ in collections.Counter(fail_list).most_common():
			if num_occ > chipFail:
				chip_fails_from_missingness = chip_fails_from_missingness + 1
			else:
				break  # most_common() is descending; no further chip can exceed the threshold
	batch_summary.set_font('Arial', 'B', 16)
	batch_summary.set_fill_color(200)
	batch_summary.multi_cell(0, 10, 'Total Number of Chips: ' + str(total_chips), 1, 'L', True)
	batch_summary.set_font('Arial', '', 14)
	batch_summary.set_x(40)
	batch_summary.multi_cell(0, 10, 'Total Number Failing due to Sex: ' + str(total_chips_fail), 1, 1, 'L')
	batch_summary.set_font('Arial', '', 14)
	batch_summary.set_x(40)
	batch_summary.multi_cell(0, 10, 'Total Number Failing due to Low Call Rate: ' + str(chip_fails_from_missingness), 1, 1, 'L')
	batch_summary.set_font('Arial', 'B', 16)
	batch_summary.set_fill_color(200)
	batch_summary.multi_cell(0, 10, 'Batch Sample Call Rate Statistics: ', 1, 'L', True)
	batch_summary.set_font('Arial', '', 14)
	batch_summary.set_x(40)
	batch_summary.multi_cell(0, 10, "Mean sample call rate across all batches: " + str("%.2f" % round(stats.mean(batch_call_averages), 2)) + '%', 1, 1, 'L')
	if len(batch_call_averages) > 1:  # std deviation needs at least two batches
		batch_summary.set_x(40)
		batch_summary.multi_cell(0, 10, "Standard Deviation of sample call rate across all batches: " + str("%.2f" % round(stats.stdev(batch_call_averages), 2)), 1, 1, 'L')
	else:
		batch_summary.set_x(40)
		batch_summary.multi_cell(0, 10, "Standard Deviation of sample call rate across all batches: 0.00", 1, 1, 'L')
	batch_summary.set_x(40)
	batch_summary.multi_cell(0, 10, "Batch with lowest average call rate: " + str(min(batch_call_averages_paired, key=batch_call_averages_paired.get)) + ' (' + str("%.4f" % round(min(batch_call_averages), 4)) + '%)', 1, 1, 'L')
	batch_summary.set_x(40)
	batch_summary.multi_cell(0, 10, "Batch with highest average call rate: " + str(max(batch_call_averages_paired, key=batch_call_averages_paired.get)) + ' (' + str("%.4f" % round(max(batch_call_averages), 4)) + '%)', 1, 1, 'L')
	return cleanup, failing_chip_IDs, batch_summary