-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
167 lines (135 loc) · 6.29 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# All Rights Reserved
# See file LICENSE for details.
#############################################################################
import os
import subprocess
import sys
import config
from os.path import join
def name_from_fpath(fpath):
return os.path.splitext(os.path.basename(fpath))[0]
def call_subprocess(args, stdin=None, stdout=None, stderr=None, env=None, debug=False):
printed_args = args[:]
if stdin:
printed_args += ['<', stdin.name]
if stdout:
printed_args += ['>>' if stdout.mode in ['a', 'a+'] else '>', stdout.name]
if stderr:
printed_args += ['2>>' if stderr.mode in ['a', 'a+'] else '2>', stderr.name]
if debug:
print ' Starting:', ' '.join(printed_args)
return_code = subprocess.call(args, stdin=stdin, stdout=stdout, stderr=stderr, env=env)
if return_code != 0 and 'ValidateSamFile' not in args and 'VariantRecalibrator' not in args:
print 'ERROR!'
sys.exit(1)
return return_code
def assert_file_exists(fpath, message=''):
if not os.path.isfile(fpath):
print ("File not found (%s): %s" % (message, fpath))
sys.exit(2)
return fpath
def get_path_to_program(program):
"""
returns the path to an executable or None if it can't be found
"""
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
for path in os.environ["PATH"].split(os.pathsep):
exe_file = os.path.join(path, program)
if is_exe(exe_file):
return exe_file
return None
def check_gatk():
if not os.path.exists(config.gatk_fpath):
print 'ERROR! GATK was not found. Please move GenomeAnalysisTK.jar to ' + config.gatk_dirpath + ' and restart the application.'
sys.exit(1)
def check_external_programs(names):
for name in names:
if not get_path_to_program(name):
print 'ERROR! %s was not found in PATH, please install it manually and/or add to PATH variable' % name
sys.exit(1)
def check_dbsnp():
if not os.path.exists(config.dbsnp_fpath):
print 'Warning: dbSNP was not found. Full pipeline requires dbSNP, thus reduced workflow will be used for now. ' \
'If you want to use full pipeline, please rename the archive with dbSNP as dbsnp.vcf.gz, move it to ' + \
config.db_dirpath + ' and restart the application.'
config.reduced_workflow = True
return False
return True
def set_threads_and_mem():
try:
import multiprocessing
config.max_threads = max(1, multiprocessing.cpu_count() / 2)
config.max_memory = max(2, get_free_memory() / 2)
except (ImportError, NotImplementedError):
pass
def get_free_memory():
from psutil import virtual_memory
mem = virtual_memory()
return mem.total / 10 ** 9
def set_reduced_pipeline():
print 'Warning: full pipeline requires hg19 reference genome. Reduced workflow will be used for now.'
config.reduced_workflow = True
def prepare_reference(ref_fpath, output_dirpath, silent=False):
if not silent:
print 'Preparing reference file...'
log_fpath = os.path.join(output_dirpath, 'processing_reference.log')
if not os.path.exists(ref_fpath + '.fai'):
call_subprocess(['samtools', 'faidx', ref_fpath], stderr=open(log_fpath, 'a'))
ref_fname, _ = os.path.splitext(ref_fpath)
if not os.path.exists(ref_fname + '.dict'):
call_subprocess(['java', '-jar', config.picard_fpath, 'CreateSequenceDictionary', 'R=%s' % ref_fpath,
'O=%s' % ref_fname + '.dict'], stderr=open(log_fpath, 'a'))
get_chr_lengths(ref_fpath)
if not config.is_run_on_cloud:
if not config.reduced_workflow or silent: # check for hg19 reference
if config.chr_names != config.hg19_chr_names:
if not silent:
set_reduced_pipeline()
return False
for chr in config.chr_lengths:
if chr not in config.hg19_chr_lengths or config.chr_lengths[chr] != config.hg19_chr_lengths[chr]:
if not silent:
set_reduced_pipeline()
return False
return True
def get_chr_lengths(ref_fpath):
ref_index_file = open(ref_fpath + '.fai')
config.chr_lengths = {}
config.chr_names = []
for line in ref_index_file.read().split('\n'):
if line:
line = line.split()
config.chr_names.append(line[0])
config.chr_lengths[line[0]] = int(line[1])
ref_index_file.close()
def search_hg19_reference():
fasta_gz_fpaths = [join(config.db_dirpath, f) for f in os.listdir(config.db_dirpath) if f.endswith('.fa.gz')
or f.endswith('.fna.gz') or f.endswith('.fasta.gz')]
for gz_fpath in fasta_gz_fpaths:
call_subprocess(['gunzip', '-f', gz_fpath])
fasta_fpaths = [join(config.db_dirpath, f) for f in os.listdir(config.db_dirpath) if f.endswith('.fa')
or f.endswith('.fna') or f.endswith('.fasta')]
for fasta_filepath in fasta_fpaths:
if prepare_reference(fasta_filepath, config.temp_output_dir, silent=True):
return fasta_filepath
hg19_fpath = join(config.db_dirpath, 'hg19.fa')
tar_gz_fpaths = [join(config.db_dirpath, f) for f in os.listdir(config.db_dirpath) if f.endswith('.tar.gz')]
for tar_gz_fpath in tar_gz_fpaths:
tar_gz_name = os.path.basename(tar_gz_fpath)
if tar_gz_name == 'hg19.tar.gz' or tar_gz_name == 'chromFa.tar.gz':
ref_temp_dirpath = join(config.db_dirpath, 'hg19')
if not os.path.exists(ref_temp_dirpath):
os.mkdir(ref_temp_dirpath)
call_subprocess(['tar', '-xzf', tar_gz_fpath, '-C', ref_temp_dirpath])
hg19_chr_filenames = [join(config.db_dirpath, 'hg19', name + '.fa') for name in config.hg19_chr_names]
for filepath in hg19_chr_filenames:
if not os.path.exists(filepath):
return None
call_subprocess(['cat'] + hg19_chr_filenames, stdout=open(hg19_fpath, 'w'))
if prepare_reference(hg19_fpath, config.temp_output_dir, silent=True):
return hg19_fpath
return None