Source code for genome_kit.genome

# Copyright (C) 2016-2023 Deep Genomics Inc. All Rights Reserved.
from __future__ import annotations

import os
from functools import partial
from weakref import WeakValueDictionary

from . import _cxx
from . import genome_annotation as _ga
from . import interval as _interval
from ._apply_variants import apply_variants, check_variants_list
from ._cxx_util import mock, mock_result
from ._util import reverse_complement
from .variant import Variant


[docs] @_cxx.register class Genome(_cxx.Genome): """Initialize a reference genome and its corresponding resources (DNA, annotations, etc). A genome object represents a reference genome and resources available for it, such as DNA and annotations:: >>> genome = Genome("gencode.v19") >>> exon = genome.exons[0] # 1st annotated exon. >>> dna = genome.dna(exon) # Extract DNA. The initializer argument specifies a versioned resource:: # Access resources that are hg19 Genome("hg19") # GENCODE v29 implies hg38 Genome("gencode.v29") The currently recognized strings depend on the data made available by data_manager. Some common ones are:: reference_genomes = [ "hg19", "hg19.p13.plusMT", "hg38", "hg38.p12", "hg38.p13", "hg38.p14", "mm10.p6", "mm39", "rn6", "macFas5", "susScr11", ] annotations = [ "gencode.v19", # implies hg19.p13.plusMT "gencode.v29", # implies hg38.p12 "gencode.v29.basic", # implies hg38.p12 "gencode.v29lift37", # implies hg19 "gencode.v29lift37.basic", # implies hg19 "gencode.v41", # implies hg38.p13 "gencode.v41.basic", # implies hg38.p13 "gencode.v41lift37", # implies hg19 "gencode.v41lift37.basic", # implies hg19 "ucsc_refseq.2017-06-25", # implies hg19 "ncbi_refseq.v105.20190906", # implies hg19.p13.plusMT "ncbi_refseq.v109", # implies hg38.p12 "ncbi_refseq.v110", # implies hg38.p14 "gencode.vM19", # implies mm10.p6 "gencode.vM19.basic", # implies mm10.p6 "gencode.vM30", # implies mm39 "gencode.vM30.basic", # implies mm39 "gencode.vM31", # implies mm39 "gencode.vM31.basic", # implies mm39 "ncbi_refseq.m39.v109", # implies mm39 "ensembl.Rnor_6.0.88", # implies rn6 "ncbi_refseq.Macfas_5.0.v101", # implies macFas5 "ensembl.Macfas_5.0.95", # implies macFas5 "ensembl.Sscrofa11.1.98", # implies susScr11 ] If a resource is requested after object creation, but the version needed cannot be disambiguated from `config`, an exception is raised. Parameters ---------- config : :py:class:`str` Identifies a versioned resource that this genome object is expected to serve. """
[docs] def __init__(self, config): # pragma: no cover self._chromosome_sizes = None self._appris_indices = None self._appris_transcripts = None self._appris_transcripts_by_gene = None self._appris_principality_strings = ('PRINCIPAL:1', 'PRINCIPAL:2', 'PRINCIPAL:3', 'PRINCIPAL:4', 'PRINCIPAL:5', 'ALTERNATIVE:1', 'ALTERNATIVE:2')
genome_cache = WeakValueDictionary()
[docs] def __reduce__(self): return (type(self), (self.config, ))
[docs] def __new__(cls, config): # pragma: no cover if cls == Genome: genome = Genome.genome_cache.get(config, None) if genome: return genome genome = _cxx.Genome.__new__(cls, config) if cls == Genome: # do not cache subclasses (like mocks) Genome.genome_cache[config] = genome return genome
[docs] def interval(self, chromosome, strand, start, end): """Shorthand method for creating an Interval with the same refg as the genome object called on. """ return _interval.Interval(chromosome, strand, start, end, self.refg)
[docs] def variant(self, variant_string): """Shorthand method for creating a Variant with the same refg as the genome object called on. """ return Variant.from_string(variant_string, self)
[docs] @mock def dna(self, interval, allow_outside_chromosome=True): # pragma: no cover """Extract DNA from a reference genome. If `interval` is on the negative strand, the reverse complement sequence is returned. If `interval` is outside the range of the chromosome, an :py:exc:`IndexError` is raised. Parameters ---------- interval : :py:class:`~genome_kit.Interval` The query interval. allow_outside_chromosome : :py:class:`bool` If False, does not allow the interval to be outside the range of the chromosome. Attempting to pass an interval outside the chromosome range will raise an :py:exc:`IndexError`. If not specified, defaults to True. Any base outside the chromosome range will be returned as an 'N'. Returns ------- :py:class:`str` The DNA sequence. """ return mock_result(str)
[docs] def variant_dna(self, interval, variants, allow_outside_chromosome=True): """Extract DNA from a genome with specific variants applied to it. Follows the same rules as :py:meth:`~genome_kit.VariantGenome.dna`. Intended for applying sets of variants that may overlap `interval`. Parameters ---------- interval : :py:class:`~genome_kit.Interval` The query interval. variants : :py:class:`~genome_kit.Variant` | :py:class:`list` of (:py:class:`~genome_kit.Variant`) Either a single variant or a list of variants. See :py:class:`~genome_kit.VariantGenome`. Returns ------- :py:class:`str` The variant DNA sequence. """ return apply_variants( partial(self.dna, allow_outside_chromosome=allow_outside_chromosome), check_variants_list(self, variants), interval, )
@mock @property def annotation(self): # pragma: no cover """Access to annotations. A shorthand property `anno` is also available. Returns ------- :py:class:`~genome_kit.GenomeAnnotation` An object that provides access to annotated genes, transcripts, and exons. """ return mock_result(_ga.GenomeAnnotation) @mock @property def genes(self): # pragma: no cover """Access to annotated genes. This is shorthand for `annotation.genes`. Returns ------- :py:class:`~genome_kit.GeneTable` An object that supports looping or queries over annotated genes. """ return mock_result(_ga.GeneTable) @mock @property def transcripts(self): # pragma: no cover """Access to annotated transcripts. This is shorthand for `annotation.transcripts`. A shorthand property `trans` is also available. Returns ------- :py:class:`~genome_kit.TranscriptTable` An object that supports looping or queries over annotated transcripts. """ return mock_result(_ga.TranscriptTable) @mock @property def exons(self): # pragma: no cover """Access to annotated exons. This is shorthand for `annotation.exons`. Returns ------- :py:class:`~genome_kit.ExonTable` An object that supports looping or queries over annotated exons. """ return mock_result(_ga.ExonTable) @mock @property def introns(self): # pragma: no cover """Access to annotated introns. This is shorthand for `annotation.introns`. Returns ------- :py:class:`~genome_kit.IntronTable` An object that supports looping or queries over annotated introns. """ return mock_result(_ga.IntronTable) @mock @property def cdss(self): # pragma: no cover """Access to annotated coding sequences (CDSs). This is shorthand for `annotation.cdss`. Returns ------- :py:class:`~genome_kit.CdsTable` An object that supports looping or queries over annotated coding sequences. """ return mock_result(_ga.CdsTable) @mock @property def utr5s(self): # pragma: no cover """Access to annotated UTR5 sequences. This is shorthand for `annotation.utr5s`. Returns ------- :py:class:`~genome_kit.Utr5Table` An object that supports looping or queries over annotated coding sequences. """ return mock_result(_ga.Utr5Table) @mock @property def utr3s(self): # pragma: no cover """Access to annotated UTR3 sequences. This is shorthand for `annotation.utr3s`. Returns ------- :py:class:`~genome_kit.Utr3Table` An object that supports looping or queries over annotated coding sequences. """ return mock_result(_ga.Utr3Table)
[docs] def find_motif(self, interval, motif, match_position=0, find_overlapping_motifs=False): """Find a genomic motif in an interval on a :py:class:`~genome_kit.Genome`. Parameters ---------- interval : :py:class:`~genome_kit.Interval` A genomic interval to search for the motif. motif : :py:class:`str` A genomic motif to search for. match_position : :py:class:`int` | :py:class:`str` Very often you want to find a position after a motif or in the middle of a motif. For example, when you are searching for a ``AG`` core acceptor site motif you want to match the 3p end of the ``AG`` motif to return the position of the putative splice site. To do this you can specify a match position as string with the values ``"3p"`` or ``"5p"``, or you can pass an integer that indicates a position in the motif with ``0 <= match_position <= len(motif)``. find_overlapping_motifs : :py:class:`bool` Whether to return matches for overlapping motifs. The default is to only find non-overlapping motifs. Returns ------- :py:class:`list` of :py:class:`~genome_kit.Interval` A list of all matches (length-0 intervals) found in the query interval. Example -------:: >>> genome37 = Genome('hg19') >>> interval = genome37.interval('chr1', '+', 40000, 40080) >>> motif = 'AA' >>> genome37.find_motif(interval, motif) # doctest:+NORMALIZE_WHITESPACE [Interval("chr1", "+", 40031, 40031, "hg19"), Interval("chr1", "+", 40061, 40061, "hg19")] """ # TODO: Longterm we'll want to move towards a generator function to be more efficient # TODO: on long intervals. if match_position == '5p': match_position = 0 elif match_position == '3p': match_position = len(motif) if not (0 <= match_position <= len(motif)): raise ValueError("Require 0 <= match_position <= len(motif) [match_position={}]".format(match_position)) strand = interval.strand == '+' # All motif finding is done on the reference strand. Therefore the motif # is reverse complemented when the interval is on the reverse strand. As # well, the match_position is recomputed to be applied from the end of the # motif (such at it is correct on the reverse strand), e.g. '5p' means # match_position = 0 on the forward strand and match_position = len(motif) on # the reverse strand. if not strand: match_position = len(motif) - match_position motif = motif.upper() dna_start = interval.start if not strand: motif = reverse_complement(motif) dna = self.dna(interval.as_positive_strand()) motif_hits = [] offset = 0 if find_overlapping_motifs: offset_step = 1 else: offset_step = len(motif) while True: pos = dna.find(motif, offset) if pos == -1: break pos_genome = dna_start + pos + match_position hit = _interval.Interval(interval.chromosome, interval.strand, pos_genome, pos_genome, interval.refg) motif_hits.append(hit) offset = pos + offset_step return motif_hits
@mock @property def reference_genome(self): # pragma: no cover """The name of the reference genome. Returns ------- :py:class:`str` The name of the reference genome that any resources map to, e.g. "hg19" or "hg38". """ return mock_result(str) @property def chromosomes(self): """A list of valid chromosome names for this genome.""" return list(self._chrom_sizes().keys())
[docs] def chromosome_size(self, chromosome): """Returns the size of the given chromosome, in bp. Returns ------- :py:class:`int` The size of the given chromosome, as number of basepairs. """ try: return self._chrom_sizes()[chromosome] except KeyError: error_message = "No such chromosome: %s. \nSupported values: %s" % (chromosome, self._chrom_sizes().keys()) raise ValueError(error_message)
@mock @property def config(self): """The configuration strings of this genome. Returns ------- :py:class:`str` The config used to initialize this object. """ return mock_result(str) @mock @property def data_dir(self): # pragma: no cover """The data_dir from which resources are opened. Returns ------- :py:class:`str` The path to where resource files are being opened. """ return mock_result(str)
[docs] def __repr__(self): return f'Genome("{self.config}")'
def _chrom_sizes(self): if self._chromosome_sizes: return self._chromosome_sizes self._chromosome_sizes = {} with open(self._chrom_sizes_filename()) as file: for line in file: (key, val) = line.split() self._chromosome_sizes[str(key)] = int(val) return self._chromosome_sizes def _chrom_sizes_filename(self): from . import gk_data # If this Genome object has a custom data_dir resource, we must respect it. datafile_path = os.path.join(self.data_dir, f"{self.refg}.chrom.sizes") # Before returning, ensure that the path resolves to an actual file # on the local file system, pulling from store if necessary. return gk_data.resolve_datafile_path(datafile_path) def _load_appris(self): """Parse the APPRIS text file corresponding to this genome's annotations.""" import pickle from . import gk_data from ._gk_data_config import get_appris_filename # Figure out which annotation version we should match with APPRIS datafile_name = get_appris_filename(self.config) datafile_path = os.path.join(self.data_dir, datafile_name) try: datafile_path = gk_data.resolve_datafile_path(datafile_path) except Exception as e: raise ApprisNotAvailableError(f"APPRIS not available for {self.annotation.filename}") from e with open(datafile_path, 'rb') as fp: data = pickle.load(fp) # List of int, where item [i] is the principality index (0..6) for self.transcripts[i] self._appris_indices = data[0] # List of Transcript indices, ordered by APPRIS index self._appris_transcripts = data[1] # List of lists, where item [i] is a list of Transcript indices belonging to self.genes[i], # ordered by APPRIS index. self._appris_transcripts_by_gene = data[2]
[docs] def appris_transcripts(self, gene=None): """Returns a list of transcript objects ordered by our custom APPRIS index. If a gene is provided, returns a list of transcripts for that gene ordered by decreasing priority. Parameters ---------- gene : :py:class:`~genome_kit.Gene` | :py:class:`str`, optional Restrict to a specific gene. If using `str`, use the geneID. Returns ------- :py:class:`list` of :py:class:`~genome_kit.Transcript` A list of transcripts that have APPRIS scores available. If a gene is provided as input, a list of transcripts for that gene in the order from most to least principle; so the first transcript should be used as the canonical. Raises ------ ApprisNotAvailableError: APPRIS data is not available for this genome's annotations. """ if self._appris_transcripts is None: self._load_appris() if gene is None: # Use all transcript indices ordered by principality. transcript_indices = self._appris_transcripts else: if isinstance(gene, str): # Get transcript indices ordered by APPRIS principality. gene = self.genes[gene] gene_idx = self.genes.index_of(gene) transcript_indices = self._appris_transcripts_by_gene[gene_idx] transcripts = [self.transcripts[i] for i in transcript_indices] if gene is not None: for transcript in transcripts: assert transcript.gene == gene, f"Unexpected gene {transcript.gene} for transcript {transcript}" return transcripts
[docs] def appris_principality(self, transcript, as_string=False): """Returns the APPRIS principality index (isoform specific number) for this transcript. An APPRIS principality index is an integer from 0..6 where 0 is considered most prinicpal and 6 considered least principal. In the original APPRIS database, the 7 principality levels are reported using strings, in the following order:: appris_index_names = [ "PRINCIPAL:1", # index 0 "PRINCIPAL:2", # index 1 "PRINCIPAL:3", # index 2 "PRINCIPAL:4", # index 3 "PRINCIPAL:5", # index 4 "ALTERNATIVE:1", # index 5 "ALTERNATIVE:2", # index 6 ] See the `APPRIS database documentation <http://appris.bioinfo.cnio.es/#/help/database/report>`_ for details. The APPRIS version defaults to `2017_05.v22` most annotations; see _build_appris.REMOTE_FILES for the rest. Parameters ---------- transcript : :py:class:`~genome_kit.Transcript` | :py:class:`str` A transcript to query. as_string : :py:class:`bool`, optional Toggle True if the APPRIS principality string (e.g. `"PRINCIPAL:1"`) is desired. Returns ------- :py:class:`int` | :py:class:`str` The APPRIS principality index [0..6] of this transcript, or the string representation if `as_string=True` Raises ------ ApprisNotAvailableError: APPRIS data is not available for this genome's annotations. """ if self._appris_indices is None: self._load_appris() if isinstance(transcript, str): transcript = self.transcripts[transcript] # Index of transcript object tidx = self.transcripts.index_of(transcript) if as_string: return self._appris_principality_strings[self._appris_indices[tidx]] return self._appris_indices[tidx]
class ApprisNotAvailableError(RuntimeError): pass