-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfastinterval.py
410 lines (324 loc) · 13.3 KB
/
fastinterval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
"""
An simple interval class for DNA sequences from FASTA files that provides
fast access to sequences and methods for interval logic on those sequences.
Usually you will create a `Genome` and then use that object to create
intervals. The intervals have a sequence property that will look up the
actual sequence::
>>> from fastinterval import Genome, Interval
>>> test_genome = Genome('test/example.fa')
>>> int1 = test_genome.interval(100, 150, chrom='1')
>>> print int1
1:100-150:
>>> print int1.sequence
GATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGA
fastinterval uses pyfasta to retrieve the sequence, so the access is mmapped
(i.e fast). It supports strandedness, which will be respected when accessing
the sequence::
>>> int2 = test_genome.interval(100, 150, chrom='1', strand=-1)
>>> print int2.sequence
TCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATC
The Interval class supports many interval operations::
>>> int1 = test_genome.interval(100, 150, chrom='1')
>>> int2 = test_genome.interval(125, 175, chrom='1')
>>> int1.distance(int2)
0
>>> int1.span(int2)
Interval(100, 175)
>>> int1.overlaps(int2)
True
>>> int1.is_contiguous(int2)
True
>>> int1 in int2
False
>>> int1.intersection(int2)
Interval(125, 150)
>>> int1.union(int2)
Interval(100, 175)
>>> Interval.merge([int1, int2, test_genome.interval(200,250, chrom='1')])
[Interval(100, 175), Interval(200, 250)]
The Interval class is also based on bx python intervals. So you can pass in
a value attritbue to point to an external object, and create interval trees and
so on.
>>> from bx.intervals.intersection import IntervalTree
>>> int3 = test_genome.interval(150, 200, chrom='1', value='foo')
>>> tree = IntervalTree()
>>> _ = map(tree.insert_interval, (int1, int2, int3))
>>> tree.find(190, 195)
[Interval(150, 200, value=foo)]
"""
VERSION = '0.1.1'
from pyfasta import Fasta
from bx.intervals import Interval as BaseInterval
def _convert_strand(strand):
""" convert UCSC +/- to +1/-1"""
if strand == '-': return -1
if strand == '+': return 1
return strand
class Interval(BaseInterval):
""" A genomic interval """
def __init__(self, start, stop, genome=None, **kws):
self.genome = genome
if 'strand' in kws:
kws['strand'] = _convert_strand(kws['strand'])
BaseInterval.__init__(self, start, stop, **kws)
@property
def sequence(self):
""" Return the DNA from this intecal as a string """
if not self.genome:
raise Exception('Cannot retrieve sequence without a genome')
return self.genome.sequence(dict(
start = self.start,
stop = self.end,
chr = self.chrom,
strand = self.strand
), one_based=False).upper()
def __str__(self):
""" Return a chr:start-stop:strand representation of the interval """
return "%s:%s-%s:%s" % (self.chrom, self.start, self.end, self.strand if self.strand else '')
def __len__(self):
""" Return interval length """
return self.end - self.start
@classmethod
def from_string(cls, loc, **kws):
""" Create an interval from a chrx:start-end style string """
# chr1:10858-10967:1
toks = loc.split(':')
chrom = toks[0]
start, end = map(int, toks[1].split('-'))
if len(toks) == 3:
try:
strand = int(toks[2])
except ValueError:
strand = toks[2]
if strand == '+':
strand = 1
elif strand == '-':
strand = -1
elif strand == '':
strand = None
else:
raise Exception('Unknown strand %s' % strand)
else:
strand = None
return cls(start, end, chrom=chrom, strand=strand, **kws)
def distance(self, other):
""" return the distance between two intervals """
if self.chrom != other.chrom: return float('Inf')
if self.start > other.end: return self.start - other.end
if self.end < other.start: return other.start - self.end
return 0
def span(self, other, **kws):
""" Return an interval spanning two intervals """
if self.chrom != other.chrom:
raise Exception('cannot get span over two chromosomes')
return self.copy(
start = min([self.start, other.start]),
end = max([self.end, other.end]),
**kws
)
def span_between(self, other, **kws):
"""Return an Inteval spanning the gap between two intervals """
if self.chrom != other.chrom:
raise Exception('cannot get span over two chromosomes')
if self.overlaps(other):
return None
return self.copy(
start = min([self.end, other.end]),
end = max([self.start, other.start]),
**kws
)
def overlaps(self, other):
""" Return True if the intervals share at least one base """
return self.distance(other) == 0 and not (self.start==other.end or self.end==other.start)
def is_contiguous(self, other):
""" Return True if the intervals are overlapping or contiguous """
return self.distance(other) == 0
def __contains__(self, other):
""" Return true if one interval contains the other """
return self.start <= other.start and self.end >= other.end
def intersection(self, other):
""" Return the interval containing the intersection of two intervals """
if not self.overlaps(other):
return None
return Interval(
max(self.start, other.start),
min(self.end, other.end),
chrom = self.chrom
)
def union(self, other, merge_contiguous=False):
""" Return an interval containing the interval of two overlapping interval"""
test = Interval.is_contiguous if merge_contiguous else Interval.overlaps
if not test(self, other):
raise Exception('cannot get union of non overlapping intervals')
return self.span(other)
def copy(self, **kws):
""" Copy this interval, and optionally provide a dict of new attrs """
if 'start' in kws:
start = kws.pop('start')
else:
start = self.start
if 'end' in kws:
end = kws.pop('end')
else :
end = self.end
template = dict(chrom=self.chrom, strand=self.strand,
value=self.value, genome=self.genome)
template.update(kws)
return Interval(start, end, **template)
def __sub__(self, other):
""" Subtract an interval and return a list of intervals
i.e. (10,50) - (20,30) returns [(10,20), (30,50)]
"""
if not self.overlaps(other):
return [self]
if self in other:
return []
elif other in self and not (self.start == other.start or self.end == other.end):
left = self.copy()
left.end = other.start
right = self.copy()
right.start = other.end
return [left, right]
elif self.start >= other.start:
right = self.copy()
right.start = other.end
return [right]
elif self.end <= other.end:
left = self.copy()
left.end = other.start
return [left]
raise Exception('unhandled subtraction')
@classmethod
def merge(cls, intervals, merge_contiguous=False, **kwargs):
""" merge a list of intervals and return a list of intervals
By default, the intervals must be overlapping to be merged. If you
want to merge contiguous intervals, set merge_contiguous to True.
"""
is_overlapping = cls.is_contiguous if merge_contiguous else cls.overlaps
intervals = sorted(intervals, key=lambda x: (x.chrom, x.end))
if len(intervals) > 1:
done, todo = [intervals[0].copy(**kwargs)], intervals[1:]
while todo:
item = todo.pop(0).copy(**kwargs)
while done and is_overlapping(item, done[-1]):
item = done.pop().union(item, merge_contiguous=merge_contiguous)
done.append(item)
else:
done = intervals
return done
@classmethod
def coverage(cls, intervals):
if not intervals: return []
chrom = intervals[0].chrom
starts = [x.start for x in intervals]
ends = [x.end for x in intervals]
breaks = sorted(set(starts + ends))
return [
cls(start, end, chrom=chrom,
value=1 + len([x for x in starts if x<= start]) - len([x for x in ends if x <= end])
)
for start, end in zip(breaks, breaks[1:])
]
def add_border(self, size=0, upstream=0, downstream=0):
""" return interval with some bases added to each end """
if not (size or upstream or downstream):
return self.copy()
if size and (upstream or downstream):
raise Exception('please either size or upstream/downstream')
if size:
return self.copy(start=self.start-size, end = self.end+size)
if not self.strand:
raise Exception('Cannot add upstrea/downstream to strandless interval')
if self.strand == 1:
return self.copy(start=self.start - upstream, end = self.end + downstream)
else:
return self.copy(start=self.start - downstream, end = self.end + upstream)
def truncate(self, size):
""" truncate this interval to size, respecting the orientation """
if self.strand is None:
raise Exception('cannot truncate unstranded interval')
elif self.strand > 0:
return self.copy(end=min(self.start+size, self.end))
elif self.strand < 0:
return self.copy(start=max(self.end-size, self.start))
class Genome(object):
""" A convienience for creating intervals on the same genome """
def __init__(self, fname, *args, **kws):
""" Create a genome using a Fasta file. Other args passed to pyfasta """
self.fasta = Fasta(fname, *args, **kws)
def interval(self, start, end, **kws):
""" return an interval on this genome """
return Interval(start, end, genome=self.fasta, **kws)
def from_string(self, data):
"""docstring for from_string"""
return Interval.from_string(data, genome=self.fasta)
class MinimalSpanningSet(object):
""" Create a minimal spanning set for target intervals from a set of candidates """
def score_candidate(self, candidate):
return sum([
len(candidate.intersection(x))
for x in self.remaining_targets
if candidate.overlaps(x)
])
def coverage(self, choices):
""" work out the covered based for a set of choices"""
covered = Interval.merge(
[
choice.intersection(x)
for x in self.targets
for choice in choices
if choice.overlaps(x)
]
)
return sum(map(len, covered))
def __init__(self, targets, candidates, score_function=None, sort_key=None):
self.targets = targets
self.remaining_targets = list(targets)
self.candidates = candidates
self.chosen = []
self.sort_key = sort_key
if score_function is None:
self.score_function = MinimalSpanningSet.score_candidate
self._find_set()
def _find_set(self):
""" main loop """
while True:
scores = {}
# work out the score by summing the length of overlaps with the target
for candidate in self.candidates:
scores[candidate] = self.score_candidate(candidate)
# choose the best candidates
rankings = sorted(self.candidates, key=scores.get, reverse=True)
if rankings == []:
break
best = rankings[0]
# break if no improvement is possible
if scores[best] == 0:
break
if self.sort_key:
equiv = [x for x in rankings if scores.get(x)==scores.get(best)]
equiv = sorted(equiv, key=self.sort_key)
best = equiv[-1]
# update the data
self.chosen.append(best)
self.candidates.remove(best)
self._update_targets(best)
# break if no targets or candidates left
if len(self.targets) == 0 or len(self.candidates) == 0:
break
self._remove_redundant()
def _update_targets(self, choice):
""" remove chosen interval from targets """
new_targets = []
for target in self.remaining_targets:
new_targets.extend(target - choice)
self.remaining_targets = new_targets
def _remove_redundant(self):
""" drop any candidates that are completely covered by the rest"""
total_coverage = self.coverage(self.chosen)
for c in list(self.chosen):
# if the coverage without a choice is the same, remove it
test = list(self.chosen)
test.remove(c)
if self.coverage(test) == total_coverage:
self.chosen.remove(c)