Source code for genome_kit.interval

# Copyright (C) 2016-2023 Deep Genomics Inc. All Rights Reserved.
from __future__ import annotations

from operator import attrgetter
from warnings import warn

from . import _cxx
from ._cxx_util import mock, mock_result


[docs] @_cxx.register class Interval(_cxx.Interval): """A genomic interval. An Interval represents a contiguous stranded genomic interval, identified by a chromosome, strand, start position, end position, and reference genome. """ __slots__ = () # <--- DO NOT EXTEND BASE CLASS SLOTS # __new__(self): pass # <--- DO NOT OVERRIDE BASE CLASS IMPLEMENTATION # __del__(self): pass # <--- DO NOT IMPLEMENT # __getattribute__ # <--- DO NOT OVERRIDE BASE CLASS IMPLEMENTATION # __setattribute__ # <--- DO NOT OVERRIDE BASE CLASS IMPLEMENTATION
[docs] def __init__(self, chromosome, strand, start, end, reference_genome, anchor=None, anchor_offset=0): """Initialize an interval from a DNA0 position. Parameters ---------- chromosome : :py:class:`str` The chromosome name ('chr1', ...). strand : :py:class:`chr` The strand ("+" or "-"). start : :py:class:`int` The start position of the interval (inclusive, 0-based). end : :py:class:`int` The end position of the interval (exclusive, 0-based). reference_genome : :py:class:`str` | :py:class:`~genome_kit.Genome` The reference genome in which the interval is defined, for example "hg19". If a genome object is given, its reference genome will be used. anchor : :py:class:`int` | :py:class:`str` | :py:class:`~genome_kit.Interval` | None, optional The anchor is only relevant when extracting DNA from a `VariantGenome` and specifies how length-changing variants are applied. Conceptually, the anchor defines a fixed point when the sequence changes length. See (TODO: link) for the details. The anchor can be any position in the interval. If anchor is `None`, then length-changing variants will change the length of the interval. Anchor can be a string: `5p` or `3p` sets the anchor to the respective end of the interval, respecting the sense strand. `start` or `end` set the anchor to the start or end of the interval on the forward strand. When anchor is `center` then the anchor is set to the middle position of the interval (if the interval has odd length, the position is rounded towards the 5p end of the interval, respecting the sense strand). If anchor is an integer or an empty interval, the anchor is set to that position. anchor_offset : :py:class:`int`, optional The anchor offset is a shift that's applied to the anchor only on the variant genome. The purpose is to align a position on the reference genome with a position *inside* an insertion. The anchor itself would only allow to align with the beginning or end of the insertion. """ if isinstance(anchor, str): if anchor == 'start': anchor = start elif anchor == 'end': anchor = end elif anchor == '5p': anchor = start if strand == '+' else end elif anchor == '3p': anchor = end if strand == '+' else start elif anchor == 'center': anchor = start + (end - start) // 2 if strand == '+' else \ end - (end - start) // 2 # noqa else: raise ValueError("Unrecognized string '{}' for anchor".format(anchor)) # Deliberately skip overhead of super(Interval, self) wrapper _cxx.Interval.__init__(self, chromosome, strand, start, end, reference_genome, anchor, anchor_offset)
@mock @property def chromosome(self): # pragma: no cover """The chromosome name. A shorthand property `chrom` is also available. Returns ------- :py:class:`str` The name of the chromosome this interval is defined on (e.g. "chr1"). """ return mock_result(str) @mock @property def strand(self): # pragma: no cover """The strand ("+" or "-"). Returns ------- :py:class:`chr` The strand this interval is defined on. """ return mock_result(chr) @mock @property def start(self): # pragma: no cover """The integer position of the start (inclusive, 0-based). Returns ------- :py:class:`int` The start position of this interval within the reference genome. """ return mock_result(int) @mock @property def end(self): # pragma: no cover """The integer position of the end (exclusive, 0-based). Returns ------- :py:class:`int` The end position of the this interval within the reference genome. """ return mock_result(int) @mock @property def end5(self): # pragma: no cover """The 5p end of this interval, which is a length-0 interval. Not to be confused with the `end` attribute, which Returns ------- :py:class:`~genome_kit.Interval` The length-0 interval at the 5p end of the interval, i.e. the gap preceding the most upstream base in the interval. """ return mock_result(Interval) @mock @property def end3(self): # pragma: no cover """The 3p end of this interval, which is a length-0 interval. Returns ------- :py:class:`~genome_kit.Interval` The length-0 interval at the 3p end of the interval, i.e. the gap succeeding the most downstream base in the interval. """ return mock_result(Interval) @mock @property def reference_genome(self): # pragma: no cover """The reference genome (assembly name in UCSC format). A shorthand property `refg` is also available. Returns ------- :py:class:`str` The name of the reference genome this interval is defined in. """ return mock_result(str) @mock @property def anchor(self): # pragma: no cover """The anchor position. Returns ------- :py:class:`int` | :py:data:`None` The anchor position associated with this interval, if any. """ return mock_result(int) @mock @property def anchor_offset(self): # pragma: no cover """The anchor offset. Returns ------- :py:class:`int` The anchor offset associated with this interval, if any. """ return mock_result(int)
[docs] @mock def shift(self, amount): # pragma: no cover """The interval shifted upstream/downstream. Parameters ---------- amount : :py:class:`int` The amount by which to shift the start and end positions. A positive amount shifts downstream according to the current strand, and a negative amount shifts the position upstream. Returns ------- :py:class:`~genome_kit.Interval` The shifted interval. """ return mock_result(Interval)
[docs] @mock def expand(self, upstream, dnstream=None): # pragma: no cover """The interval expanded upstream/downstream. Can also be called with a single argument, in which case upstream and dnstream take on the same value. Parameters ---------- upstream : :py:class:`int` Number of upstream positions to add to the interval. dnstream : :py:class:`int` Number of downstream positions to add to the interval. If not specified, defaults to `upstream` (i.e. symmetric). Returns ------- :py:class:`~genome_kit.Interval` An interval containing `upstream` + `downstream` + 1 positions. """ return mock_result(Interval)
[docs] @mock def intersect(self, other): # pragma: no cover """The interval intersecting both intervals. Parameters ---------- other : :class:`~genome_kit.Interval` interval to intersect with this interval. Returns ------- :class:`~genome_kit.Interval` | :data:`None` The intersecting interval (possibly empty) or `None` if the intervals are disjoint """ return mock_result(Interval)
[docs] def subtract(self, other: Interval) -> Sequence[Interval]: """A list of intervals, representing this interval with the its intersection of another interval removed. Parameters ---------- other interval to subtract its intersection from this interval. Returns ------- Sequence[Interval] The intervals after subtraction. If the intervals were disjoint, returns [self]. """ intersection = self.intersect(other) if not intersection: return [self] result = [] if self.start < intersection.start: result.append(Interval(self.chrom, self.strand, self.start, intersection.start, self.refg)) if self.end > intersection.end: result.append(Interval(self.chrom, self.strand, intersection.end, self.end, self.refg)) return result
[docs] @mock def is_positive_strand(self): # pragma: no cover """Test if this interval is on the positive strand. Returns ------- :py:class:`bool` True if `strand` is equal to '+' """ return mock_result(bool)
[docs] @mock def as_positive_strand(self): # pragma: no cover """The same interval on the positive strand. Returns ------- :py:class:`~genome_kit.Interval` The same interval, but on the positive ('+') strand. """ return mock_result(Interval)
[docs] @mock def as_negative_strand(self): # pragma: no cover """The same interval on the negative strand. Returns ------- :py:class:`~genome_kit.Interval` The same interval, but on the negative ('-') strand. """ return mock_result(Interval)
[docs] @mock def as_opposite_strand(self): # pragma: no cover """The same interval on the opposite strand. Returns ------- :py:class:`~genome_kit.Interval` The same interval, but on the opposite strand. """ return mock_result(Interval)
[docs] @mock def upstream_of(self, other): # pragma: no cover """Test if this interval is strictly upstream of `other`. Parameters ---------- other : :py:class:`~genome_kit.Interval` Any coordinate or interval subclass defined on the same reference genome and strand as this interval. Returns ------- :py:class:`bool` True if this interval is strictly upstream of `other`, with no overlap. """ return mock_result(bool)
[docs] @mock def dnstream_of(self, other): # pragma: no cover """Test if this interval is strictly downstream of `other`. Parameters ---------- other : :py:class:`~genome_kit.Interval` Any coordinate or interval subclass defined on the same reference genome and strand as this interval. Returns ------- :py:class:`bool` True if this interval is strictly downstream of `other`, with no overlap. """ return mock_result(bool)
[docs] @mock def contains(self, other): # pragma: no cover """Test if `other` lies within the extents of this interval. Parameters ---------- other : :py:class:`~genome_kit.Interval` Any coordinate or interval subclass defined on the same reference genome and strand as this interval. Returns ------- :py:class:`bool` True if `other` lies entirely within the extents of this interval. """ return mock_result(bool)
[docs] @mock def within(self, other): # pragma: no cover """Test if this interval lies within the extents of `other`. Use of ``interval in other`` expressions is also available. Parameters ---------- other : :py:class:`~genome_kit.Interval` Any coordinate or interval subclass defined on the same reference genome and strand as this interval. Returns ------- :py:class:`bool` True if this interval lies within the extents of `other`. """ return mock_result(bool)
[docs] @mock def overlaps(self, other): # pragma: no cover """Test if this interval overlaps the extents of `other`. Parameters ---------- other : :py:class:`~genome_kit.Interval` Any coordinate or interval subclass defined on the same reference genome and strand as this interval. Returns ------- :py:class:`bool` True if this interval overlaps the extents of `other`. """ return mock_result(bool)
[docs] @staticmethod def from_dna0_coord(chromosome, strand, position, reference_genome): """Create an interval spanning a single DNA0 position. Equivalent to ``Interval(chromosome, strand, position, position+1, reference_genome)``. Parameters ---------- chromosome : :py:class:`str` The chromosome name (e.g. "chr1"). strand : :py:class:`int` The strand ("+" or "-"). position : :py:class:`int` The position the interval should span (0-based). reference_genome : :py:class:`str` | :py:class:`~genome_kit.Genome` The reference genome in which the interval is defined, for example "hg19". If a genome object is given, its reference genome will be used. Returns ------- :py:class:`~genome_kit.Interval` The resulting length-1 interval spanning `position`. """ return Interval(chromosome, strand, position, position + 1, reference_genome)
[docs] @staticmethod def from_dna0(chromosome, strand, start, end, reference_genome): """Create an interval from DNA0 positions. Equivalent to calling :py:class:`~genome_kit.Interval`. Parameters ---------- chromosome : :py:class:`str` The chromosome name (e.g. "chr1"). strand : :py:class:`int` The strand ("+" or "-"). start : :py:class:`int` The start position of the interval (inclusive, 0-based). end : :py:class:`int` The end position of the interval (exclusive, 0-based). reference_genome : :py:class:`str` | :py:class:`~genome_kit.Genome` The reference genome in which the interval is defined, for example "hg19". If a genome object is given, its reference genome will be used. Returns ------- :py:class:`~genome_kit.Interval` The resulting interval. """ return Interval(chromosome, strand, start, end, reference_genome)
[docs] @staticmethod def from_dna1(chromosome, strand, start, end, reference_genome): """Create an interval from DNA1 positions. Positions are automatically converted to DNA0 as (start-1, end). Parameters ---------- chromosome : :py:class:`str` The chromosome name (e.g. "chr1"). strand : :py:class:`int` The strand ("+" or "-"). start : :py:class:`int` The start position of the interval (inclusive, 1-based). end : :py:class:`int` The end position of the interval (inclusive, 1-based). reference_genome : :py:class:`str` | :py:class:`~genome_kit.Genome` The reference genome in which the interval is defined, for example "hg19". If a genome object is given, its reference genome will be used. Returns ------- :py:class:`~genome_kit.Interval` The resulting interval. """ return Interval(chromosome, strand, start - 1, end, reference_genome)
[docs] @staticmethod def from_rna1(chromosome, start, end, reference_genome): """Create an interval from RNA1 positions. The start/end positions are automatically converted to DNA0 as `min(abs(start),abs(end))-1` and `max(abs(start), abs(end)))` respectively. Parameters ---------- chromosome : :py:class:`str` The chromosome name (e.g. "chr1"). strand : :py:class:`int` The strand ("+" or "-"). start : :py:class:`int` The start position of the interval (signed, inclusive, 1-based). end : :py:class:`int` The end position of the interval (signed, inclusive, 1-based). reference_genome : :py:class:`str` | :py:class:`~genome_kit.Genome` The reference genome in which the interval is defined, for example "hg19". If a genome object is given, its reference genome will be used. Returns ------- :py:class:`~genome_kit.Interval` The resulting interval. """ strand = '+' if start > 0 else '-' # verify input if end: if not start <= end: raise ValueError("Start position needs to be <= end (%d, %d)" % (start, end)) if (start >= 0) != (end >= 0): raise ValueError("Start and end must have the same sign: " "(%d, %d)" % (start, end)) # check if it's single position or interval start, end = min(abs(start), abs(end)) - 1, max(abs(start), abs(end)) return Interval(chromosome, strand, start, end, reference_genome)
[docs] @staticmethod def spanning(interval1, interval2): """Create the smallest interval spanning the given pair of existing intervals, or raise an exception if given intervals not on the same chromosome, strand, and reference genome. The given intervals are not required to be overlapping, and may both be of zero length. Example:: >>> a = Interval("chr1", "+", 100, 150, "hg19") >>> b = Interval("chr1", "+", 200, 250, "hg19") >>> Interval.spanning(a, b) Interval("chr1", "+", 100, 250, "hg19") Notes ----- This method is not anchor aware, since that requires a list of variants for context. See :func:`~genome_kit.Variant.spanning` for that use case. Parameters ---------- interval1 : :py:class:`~genome_kit.Interval` an interval interval2 : :py:class:`~genome_kit.Interval` another interval Returns ------- :py:class:`~genome_kit.Interval` The resulting spanning interval. """ interval1.check_same_chromosome(interval2) if interval1.strand != interval2.strand: warn(f"Spanning intervals on opposite strands, got: {interval1}, {interval2}") if any(x.anchor is not None for x in [interval1, interval2]): raise ValueError( "Spanning anchored intervals (should use Variant.spanning), got:" f" {interval1}, {interval2}" ) start = min(interval1.start, interval2.start) end = max(interval1.end, interval2.end) return Interval(interval1.chrom, interval1.strand, start, end, interval1.reference_genome)
@property def midpoint(self): """Identify the midpoint of an interval. Example:: >>> Interval("chr1", "+", 100, 150, "hg19").midpoint Interval("chr1", "+", 125, 125, "hg19") >>> Interval("chr1", "+", 100, 151, "hg19").midpoint Interval("chr1", "+", 125, 126, "hg19") Returns ------- :py:class:`~genome_kit.Interval` The interval spanning the midpoint of the interval. """ return self.expand(-(len(self) // 2))
[docs] def distance(self, other, method='midpoint'): """The distance this interval and another. Example:: >>> interval_a = Interval("chr1", "+", 100, 150, "hg19") >>> interval_b = Interval("chr1", "+", 200, 300, "hg19") >>> interval_c = Interval("chr1", "+", 400, 500, "hg19") >>> interval_a.distance(interval_b) 125.0 >>> interval_a.distance([interval_b, interval_c]) [125.0, 325.0] Parameters ---------- other : :py:class:`~genome_kit.Interval` or :py:class:`~list` an interval or list of intervals method : :py:class:`str` determines which (point) attribute of this and the other interval to use when computing the distance. Must be one of `midpoint`, `end5`, or `end3`. Returns ------- :py:class:`float` or :py:class:`~list` Distance to the interval. If other is of type Interval, return distance as a float; otherwise, a list of distances. """ def midpoint_as_float(interval): x = interval.midpoint return x.start + len(x) / 2 if method == 'midpoint': getter = midpoint_as_float elif method in ['end5', 'end3']: getter = attrgetter(method + '.start') else: raise ValueError("`method` must be in ['midpoint', 'end5', 'end3']") def validated(interval): self.check_same_chromosome(interval) if self.strand != interval.strand: warn( f"Distancing intervals on opposite strands, got: {self}, {interval}" ) if any(x.anchor is not None for x in [self, interval]): raise ValueError( f"Distancing anchored intervals is not supported, got: {self}," f" {interval}" ) return interval if isinstance(other, Interval): return float(abs(getter(self) - getter(validated(other)))) start = getter(self) return [float(abs(start - getter(validated(x)))) for x in other]
[docs] def as_dna0(self): """Returns (start, end) positions using DNA0 convention. Returns ------- :py:class:`tuple` The positions ``(start, end)`` using DNA0 convention. """ return self.start, self.end
[docs] def as_dna1(self): """Returns (start, end) positions using DNA1 convention. Returns ------- :py:class:`tuple` The positions ``(start, end)`` using DNA1 convention. """ if self.start == self.end: raise ValueError("Empty intervals cannot be represented using DNA1 convention.") return self.start + 1, self.end
[docs] def as_rna1(self): """Returns (start, end) positions using RNA1 convention. Returns ------- :py:class:`tuple` The positions ``(start, end)`` using RNA1 convention. """ if self.start == self.end: raise ValueError("Empty intervals cannot be represented using RNA1 convention.") if self.is_positive_strand(): # Forward strand return self.start + 1, self.end else: # Reverse strand return -self.end, -(self.start + 1)
[docs] def as_ucsc(self): """Returns this interval as a UCSC browser coordinate. Returns ------- :py:class:`str` A string in UCSC browser convention, e.g. ``"chr5:500201-5002010"``. """ return "{}:{}-{}".format(self.chromosome, self.start + 1, self.end)
[docs] def with_anchor(self, anchor, anchor_offset=0): """Returns an anchored version of this interval. Returns ------- :py:class:`~genome_kit.Interval` An anchored version of this interval. The previous anchor, if any, is ignored. """ return Interval(self.chromosome, self.strand, self.start, self.end, self.reference_genome, anchor, anchor_offset)
def check_same_chromosome(self, other: Interval) -> None: if self.chromosome != other.chromosome: raise ValueError( f"Both intervals must be on the same chromosome, got: {self}, {other}" ) if self.reference_genome != other.reference_genome: raise ValueError( f"Both intervals must be on the same reference genome, got: {self}," f" {other}" )
[docs] def __repr__(self): anchor_args = "" if self.anchor is not None: anchor_args += ", " + str(self.anchor) if self.anchor_offset: anchor_args += ", " + str(self.anchor_offset) return 'Interval("{}", "{}", {}, {}, "{}"{})'.format(self.chrom, self.strand, self.start, self.end, self.refg, anchor_args)
[docs] def __str__(self): return self.__repr__()