# Source code for rnalib.constants

from enum import IntEnum

from IPython.core.display import Markdown

#: Convenience lists of canonical chromosome names, keyed by genome/organism label.
#: Numbered chromosomes come first, followed by the sex and mitochondrial contigs.
CANONICAL_CHROMOSOMES = {
    # chr1..chr22 plus X, Y and the mitochondrial contig
    "GRCh38": ["chr" + str(c) for c in [*range(1, 23), "X", "Y", "M"]],
    # chr1..chr19 plus X, Y and the mitochondrial contig
    # NOTE(review): the mouse entries use 'chrMT' whereas GRCh38 above uses 'chrM' - confirm this
    # matches the contig names of the reference files these lists are compared against.
    "GRCm38": ["chr" + str(c) for c in [*range(1, 20), "X", "Y", "MT"]],
    "mm39": ["chr" + str(c) for c in [*range(1, 20), "X", "Y", "MT"]],
    # names are listed without a 'chr' prefix for this entry
    "dmel": "2L 2R 3L 3R 4 X Y".split(),
}

#: maximum integer value, assuming (signed) 32-bit ints, i.e., 2147483647
MAX_INT = (1 << 31) - 1

#: Maps valid (sub-)feature types (e.g., 'exon', 'CDS') to SO terms (e.g., '3UTR' -> 'three_prime_UTR').
#: Entries are grouped by their target SO term; the insertion order is gene, transcript, exon, intron, CDS,
#: three_prime_UTR and five_prime_UTR aliases.
FTYPE_TO_SO = {
    # gene-level feature types
    **dict.fromkeys(("gene", "ncRNA_gene"), "gene"),
    # transcript-level feature types
    **dict.fromkeys(
        (
            "transcript",
            "mRNA",
            "ncRNA",
            "lnc_RNA",
            "pseudogenic_transcript",
            "pre_miRNA",
            "rRNA",
            "snRNA",
            "snoRNA",
            "tRNA",
            "miRNA",
        ),
        "transcript",
    ),
    # sub-features whose type name already is the SO term
    **{so_term: so_term for so_term in ("exon", "intron", "CDS")},
    # UTR aliases
    **dict.fromkeys(("three_prime_UTR", "3UTR", "UTR3"), "three_prime_UTR"),
    **dict.fromkeys(("five_prime_UTR", "5UTR", "UTR5"), "five_prime_UTR"),
}

# Id/parent attribute names shared by several flavours. They are unpacked first in each entry below so
# that the key order (gid, tid, tx_gid, feat_tid, ...) stays the same; individual entries may override
# single keys (e.g., 'gid': None).
_GFF_ID_ATTRS = {"gid": "ID", "tid": "ID", "tx_gid": "Parent", "feat_tid": "Parent"}
_GTF_ID_ATTRS = {
    "gid": "gene_id",
    "tid": "transcript_id",
    "tx_gid": "gene_id",
    "feat_tid": "transcript_id",
}
_MIRNA_ID_ATTRS = {"gid": None, "tid": "ID", "tx_gid": None, "feat_tid": None}

#: Maps info field names for various GFF flavours.
#: Keys are (flavour, file format) tuples; each value configures the attribute names used for
#: gene ids ('gid'), transcript ids ('tid'), the gene id of a transcript ('tx_gid'), the transcript id of a
#: sub-feature ('feat_tid') and the gene name ('gene_name'), plus the feature-type to SO term mapping
#: ('ftype_to_SO') and a list of 'copied_fields'.
GFF_FLAVOURS = {
    ("gencode", "gff"): {
        **_GFF_ID_ATTRS,
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("gencode", "gtf"): {
        **_GTF_ID_ATTRS,
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("ensembl", "gff"): {
        **_GFF_ID_ATTRS,
        "gene_name": "Name",
        # 'pseudogene' maps to 'gene' in ensembl but to 'transcript' in flybase
        "ftype_to_SO": {**FTYPE_TO_SO, "pseudogene": "gene"},
        "copied_fields": ["source", "gene_type"],
    },
    ("flybase", "gtf"): {
        **_GTF_ID_ATTRS,
        "gene_name": "gene_symbol",
        # 'pseudogene' maps to 'gene' in ensembl but to 'transcript' in flybase
        "ftype_to_SO": {**FTYPE_TO_SO, "pseudogene": "transcript"},
        "copied_fields": ["source", "gene_type"],
    },
    ("wormbase", "gff"): {
        **_GFF_ID_ATTRS,
        "gene_name": "Name",
        # NOTE that some wormbase files contain 'pseudogenic_transcript' features that are not enveloped
        # by the referenced genes, so we map them to None (which skips them).
        # Also, snRNA features have no associated 'Parent' id and are therefore skipped.
        "ftype_to_SO": {**FTYPE_TO_SO, "pseudogenic_transcript": None, "snRNA": None},
        "copied_fields": ["source", "biotype", "so_term_name"],
    },
    ("ucsc", "gtf"): {
        **_GTF_ID_ATTRS,
        "gid": None,  # no gene id attribute configured for this flavour
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("chess", "gff"): {
        **_GFF_ID_ATTRS,
        "gid": None,  # no gene id attribute configured for this flavour
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("chess", "gtf"): {
        **_GTF_ID_ATTRS,
        "gid": None,  # no gene id attribute configured for this flavour
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("mirgenedb", "gff"): {
        **_MIRNA_ID_ATTRS,
        "gene_name": "Alias",
        # only these two feature types are mapped for this flavour
        "ftype_to_SO": {"pre_miRNA": "transcript", "miRNA": "transcript"},
        "copied_fields": ["source", "Alias"],
    },
    ("mirbase", "gff"): {
        **_MIRNA_ID_ATTRS,
        "gene_name": "Alias",
        # only these two feature types are mapped for this flavour
        "ftype_to_SO": {"miRNA_primary_transcript": "transcript", "miRNA": "transcript"},
        "copied_fields": ["source", "Alias"],
    },
    ("generic", "gff"): {
        **_GFF_ID_ATTRS,
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": [],  # should be passed via constructor
    },
    ("generic", "gtf"): {
        **_GTF_ID_ATTRS,
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": [],  # should be passed via constructor
    },
}


class BamFlag(IntEnum):
    """BAM flags, @see https://broadinstitute.github.io/picard/explain-flags.html

    Each member is a single bit of the SAM/BAM FLAG field. As this is an ``IntEnum``,
    members can be combined with ``|`` (yielding a plain ``int`` bit mask) and tested
    against a read's flag value with ``&``.
    """

    BAM_FPAIRED = 0x1  # the read is paired in sequencing, no matter whether it is mapped in a pair
    BAM_FPROPER_PAIR = 0x2  # the read is mapped in a proper pair
    BAM_FUNMAP = 0x4  # the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
    BAM_FMUNMAP = 0x8  # the mate is unmapped
    BAM_FREVERSE = 0x10  # the read is mapped to the reverse strand
    BAM_FMREVERSE = 0x20  # the mate is mapped to the reverse strand
    BAM_FREAD1 = 0x40  # this is read1
    BAM_FREAD2 = 0x80  # this is read2
    BAM_FSECONDARY = 0x100  # not primary alignment
    BAM_FQCFAIL = 0x200  # QC failure
    BAM_FDUP = 0x400  # optical or PCR duplicate
    BAM_SUPPLEMENTARY = 0x800  # supplementary alignment
#: default BAM flag filter (int 3844); comparable to samtools view -F 3844; also used as default filter in IGV.
#: Combines the unmapped, secondary, QC-fail, duplicate and supplementary flag bits.
DEFAULT_FLAG_FILTER = (
    BamFlag.BAM_FUNMAP
    | BamFlag.BAM_FSECONDARY
    | BamFlag.BAM_FQCFAIL
    | BamFlag.BAM_FDUP
    | BamFlag.BAM_SUPPLEMENTARY
)

#: Markdown separator for jupyter notebooks; `display(SEP)`
SEP = Markdown("---")

#: mygeneinfo fields: maps each (dotted) field name to the name of its data type
#: (one of 'object', 'text', 'keyword', 'date', 'integer', 'long', 'byte').
# NOTE(review): presumably mirrors the field metadata published by the MyGene.info service - confirm
# against the current service metadata before relying on completeness.
mygeneinfo_fields = {
    "accession": "object",
    "accession.genomic": "text",
    "accession.protein": "text",
    "accession.rna": "text",
    "accession.translation": "object",
    "alias": "keyword",
    "AnimalQTLdb": "text",
    "biocarta": "text",
    "clingen": "object",
    "clingen.clinical_validity": "object",
    "clingen.clinical_validity.classification": "keyword",
    "clingen.clinical_validity.classification_date": "date",
    "clingen.clinical_validity.disease_label": "text",
    "clingen.clinical_validity.gcep": "text",
    "clingen.clinical_validity.moi": "keyword",
    "clingen.clinical_validity.mondo": "keyword",
    "clingen.clinical_validity.online_report": "text",
    "clingen.clinical_validity.sop": "keyword",
    "ec": "keyword",
    "ensembl": "object",
    "ensembl.gene": "keyword",
    "ensembl.protein": "keyword",
    "ensembl.transcript": "keyword",
    "ensembl.translation": "object",
    "ensembl.type_of_gene": "keyword",
    "entrezgene": "keyword",
    "exac": "object",
    "exac.all": "object",
    "exac.bp": "integer",
    "exac.cds_end": "integer",
    "exac.cds_start": "integer",
    "exac.n_exons": "integer",
    "exac.nonpsych": "object",
    "exac.nontcga": "object",
    "exac.transcript": "text",
    "exons": "object",
    "exons_hg19": "object",
    "exons_mm10": "object",
    "exons_mm9": "object",
    "FLYBASE": "keyword",
    "generif": "object",
    "generif.pubmed": "long",
    "generif.text": "text",
    "genomic_pos": "object",
    "genomic_pos.chr": "keyword",
    "genomic_pos.end": "long",
    "genomic_pos.start": "long",
    "genomic_pos.strand": "byte",
    "genomic_pos_hg19": "object",
    "genomic_pos_hg19.chr": "keyword",
    "genomic_pos_hg19.end": "long",
    "genomic_pos_hg19.start": "long",
    "genomic_pos_hg19.strand": "byte",
    "genomic_pos_mm9": "object",
    "genomic_pos_mm9.chr": "keyword",
    "genomic_pos_mm9.end": "long",
    "genomic_pos_mm9.start": "long",
    "genomic_pos_mm9.strand": "byte",
    "go": "object",
    "go.BP": "object",
    "go.BP.category": "text",
    "go.BP.evidence": "text",
    "go.BP.id": "keyword",
    "go.BP.pubmed": "long",
    "go.BP.term": "text",
    "go.CC": "object",
    "go.CC.category": "text",
    "go.CC.evidence": "text",
    "go.CC.id": "keyword",
    "go.CC.pubmed": "long",
    "go.CC.term": "text",
    "go.MF": "object",
    "go.MF.category": "text",
    "go.MF.evidence": "text",
    "go.MF.id": "keyword",
    "go.MF.pubmed": "long",
    "go.MF.term": "text",
    "HGNC": "keyword",
    "homologene": "object",
    "homologene.genes": "long",
    "homologene.id": "long",
    "HPRD": "keyword",
    "humancyc": "text",
    "interpro": "object",
    "interpro.desc": "text",
    "interpro.id": "keyword",
    "interpro.short_desc": "text",
    "ipi": "keyword",
    "kegg": "text",
    "locus_tag": "keyword",
    "map_location": "text",
    "MGI": "keyword",
    "MIM": "keyword",
    "miRBase": "keyword",
    "mousecyc": "text",
    "name": "text",
    "netpath": "text",
    "other_names": "text",
    "pantherdb": "object",
    "pantherdb.Araport": "keyword",
    "pantherdb.dictyBase": "keyword",
    "pantherdb.EcoGene": "keyword",
    "pantherdb.Ensembl": "keyword",
    "pantherdb.EnsemblGenome": "keyword",
    "pantherdb.FlyBase": "keyword",
    "pantherdb.Gene": "keyword",
    "pantherdb.Gene_Name": "keyword",
    "pantherdb.Gene_OrderedLocusName": "keyword",
    "pantherdb.Gene_ORFName": "keyword",
    "pantherdb.GeneID": "keyword",
    "pantherdb.HGNC": "keyword",
    "pantherdb.MGI": "keyword",
    "pantherdb.ortholog": "object",
    "pantherdb.ortholog.Araport": "keyword",
    "pantherdb.ortholog.dictyBase": "keyword",
    "pantherdb.ortholog.EcoGene": "keyword",
    "pantherdb.ortholog.Ensembl": "keyword",
    "pantherdb.ortholog.EnsemblGenome": "keyword",
    "pantherdb.ortholog.FlyBase": "keyword",
    "pantherdb.ortholog.Gene": "keyword",
    "pantherdb.ortholog.Gene_Name": "keyword",
    "pantherdb.ortholog.Gene_OrderedLocusName": "keyword",
    "pantherdb.ortholog.Gene_ORFName": "keyword",
    "pantherdb.ortholog.GeneCards": "keyword",
    "pantherdb.ortholog.GeneID": "keyword",
    "pantherdb.ortholog.HGNC": "keyword",
    "pantherdb.ortholog.MGI": "keyword",
    "pantherdb.ortholog.ortholog_type": "keyword",
    "pantherdb.ortholog.panther_family": "keyword",
    "pantherdb.ortholog.PomBase": "keyword",
    "pantherdb.ortholog.RGD": "keyword",
    "pantherdb.ortholog.SGD": "keyword",
    "pantherdb.ortholog.TAIR": "keyword",
    "pantherdb.ortholog.taxid": "integer",
    "pantherdb.ortholog.uniprot_kb": "keyword",
    "pantherdb.ortholog.WormBase": "keyword",
    "pantherdb.ortholog.ZFIN": "keyword",
    "pantherdb.PomBase": "keyword",
    "pantherdb.RGD": "keyword",
    "pantherdb.SGD": "keyword",
    "pantherdb.TAIR": "keyword",
    "pantherdb.uniprot_kb": "keyword",
    "pantherdb.WormBase": "keyword",
    "pantherdb.ZFIN": "keyword",
    "pathway": "object",
    "pathway.biocarta": "object",
    "pathway.biocarta.id": "text",
    "pathway.biocarta.name": "text",
    "pathway.humancyc": "object",
    "pathway.humancyc.id": "text",
    "pathway.humancyc.name": "text",
    "pathway.kegg": "object",
    "pathway.kegg.id": "text",
    "pathway.kegg.name": "text",
    "pathway.mousecyc": "object",
    "pathway.mousecyc.id": "text",
    "pathway.mousecyc.name": "text",
    "pathway.netpath": "object",
    "pathway.netpath.id": "text",
    "pathway.netpath.name": "text",
    "pathway.pharmgkb": "object",
    "pathway.pharmgkb.id": "text",
    "pathway.pharmgkb.name": "text",
    "pathway.pid": "object",
    "pathway.pid.id": "text",
    "pathway.pid.name": "text",
    "pathway.reactome": "object",
    "pathway.reactome.id": "text",
    "pathway.reactome.name": "text",
    "pathway.smpdb": "object",
    "pathway.smpdb.id": "text",
    "pathway.smpdb.name": "text",
    "pathway.wikipathways": "object",
    "pathway.wikipathways.id": "text",
    "pathway.wikipathways.name": "text",
    "pathway.yeastcyc": "object",
    "pathway.yeastcyc.id": "text",
    "pathway.yeastcyc.name": "text",
    "pdb": "keyword",
    "pfam": "keyword",
    "pharmgkb": "keyword",
    "pharos": "object",
    "pharos.target_id": "integer",
    "pid": "text",
    "pir": "keyword",
    "prosite": "keyword",
    "RATMAP": "keyword",
    "reactome": "text",
    "reagent": "object",
    "reagent.CM-LibrX-no-seq": "object",
    "reagent.CM-LibrX-no-seq.id": "keyword",
    "reagent.CM-LibrX-no-seq.relationship": "text",
    "reagent.CondMedia_CM_LibrAB": "object",
    "reagent.CondMedia_CM_LibrAB.id": "keyword",
    "reagent.CondMedia_CM_LibrAB.relationship": "text",
    "reagent.GNF_hs-druggable_lenti-shRNA": "object",
    "reagent.GNF_hs-druggable_lenti-shRNA.id": "keyword",
    "reagent.GNF_hs-druggable_lenti-shRNA.relationship": "text",
    "reagent.GNF_hs-druggable_plasmid-shRNA": "object",
    "reagent.GNF_hs-druggable_plasmid-shRNA.id": "keyword",
    "reagent.GNF_hs-druggable_plasmid-shRNA.relationship": "text",
    "reagent.GNF_hs-druggable_siRNA": "object",
    "reagent.GNF_hs-druggable_siRNA.id": "keyword",
    "reagent.GNF_hs-druggable_siRNA.relationship": "text",
    "reagent.GNF_hs-GPCR_IDT-siRNA": "object",
    "reagent.GNF_hs-GPCR_IDT-siRNA.id": "keyword",
    "reagent.GNF_hs-GPCR_IDT-siRNA.relationship": "text",
    "reagent.GNF_hs-oncomine_IDT-siRNA": "object",
    "reagent.GNF_hs-oncomine_IDT-siRNA.id": "keyword",
    "reagent.GNF_hs-oncomine_IDT-siRNA.relationship": "text",
    "reagent.GNF_hs-ORFeome1_1_reads": "object",
    "reagent.GNF_hs-ORFeome1_1_reads.id": "keyword",
    "reagent.GNF_hs-ORFeome1_1_reads.relationship": "text",
    "reagent.GNF_hs-Origene": "object",
    "reagent.GNF_hs-Origene.id": "keyword",
    "reagent.GNF_hs-Origene.relationship": "text",
    "reagent.GNF_hs-pkinase_IDT-siRNA": "object",
    "reagent.GNF_hs-pkinase_IDT-siRNA.id": "keyword",
    "reagent.GNF_hs-pkinase_IDT-siRNA.relationship": "text",
    "reagent.GNF_hs_LentiORF-HA-MYC": "object",
    "reagent.GNF_hs_LentiORF-HA-MYC.id": "keyword",
    "reagent.GNF_hs_LentiORF-HA-MYC.relationship": "text",
    "reagent.GNF_hs_LentiORF-Jred": "object",
    "reagent.GNF_hs_LentiORF-Jred.id": "keyword",
    "reagent.GNF_hs_LentiORF-Jred.relationship": "text",
    "reagent.GNF_mm+hs-MGC": "object",
    "reagent.GNF_mm+hs-MGC.id": "keyword",
    "reagent.GNF_mm+hs-MGC.relationship": "text",
    "reagent.GNF_mm+hs_RetroCDNA": "object",
    "reagent.GNF_mm+hs_RetroCDNA.id": "keyword",
    "reagent.GNF_mm+hs_RetroCDNA.relationship": "text",
    "reagent.GNF_mm-GIPZ_shRNA": "object",
    "reagent.GNF_mm-GIPZ_shRNA.id": "keyword",
    "reagent.GNF_mm-GIPZ_shRNA.relationship": "text",
    "reagent.GNF_mm-kinase_lenti-shRNA": "object",
    "reagent.GNF_mm-kinase_lenti-shRNA.id": "keyword",
    "reagent.GNF_mm-kinase_lenti-shRNA.relationship": "text",
    "reagent.GNF_mm-kinase_plasmid-shRNA": "object",
    "reagent.GNF_mm-kinase_plasmid-shRNA.id": "keyword",
    "reagent.GNF_mm-kinase_plasmid-shRNA.relationship": "text",
    "reagent.GNF_mm-TLR_lenti_shRNA": "object",
    "reagent.GNF_mm-TLR_lenti_shRNA.id": "keyword",
    "reagent.GNF_mm-TLR_lenti_shRNA.relationship": "text",
    "reagent.GNF_Qia_hs-genome_v1_siRNA": "object",
    "reagent.GNF_Qia_hs-genome_v1_siRNA.id": "keyword",
    "reagent.GNF_Qia_hs-genome_v1_siRNA.relationship": "text",
    "reagent.IDT_27mer_hs_ATPase_siRNAs": "object",
    "reagent.IDT_27mer_hs_ATPase_siRNAs.id": "keyword",
    "reagent.IDT_27mer_hs_ATPase_siRNAs.relationship": "text",
    "reagent.Invitrogen_IVTHSSIPKv2": "object",
    "reagent.Invitrogen_IVTHSSIPKv2.id": "keyword",
    "reagent.Invitrogen_IVTHSSIPKv2.relationship": "text",
    "reagent.MasterSecretomicsList": "object",
    "reagent.MasterSecretomicsList.id": "keyword",
    "reagent.MasterSecretomicsList.relationship": "text",
    "reagent.NIBRI_hs-Secretome_pDEST": "object",
    "reagent.NIBRI_hs-Secretome_pDEST.id": "keyword",
    "reagent.NIBRI_hs-Secretome_pDEST.relationship": "text",
    "reagent.NOVART_hs-genome_siRNA": "object",
    "reagent.NOVART_hs-genome_siRNA.id": "keyword",
    "reagent.NOVART_hs-genome_siRNA.relationship": "text",
    "reagent.Qiagen_mouse_QMIHSINHIBv1": "object",
    "reagent.Qiagen_mouse_QMIHSINHIBv1.id": "keyword",
    "reagent.Qiagen_mouse_QMIHSINHIBv1.relationship": "text",
    "reagent.Qiagen_mouse_QMIHSMIMv1": "object",
    "reagent.Qiagen_mouse_QMIHSMIMv1.id": "keyword",
    "reagent.Qiagen_mouse_QMIHSMIMv1.relationship": "text",
    "refseq": "object",
    "refseq.genomic": "text",
    "refseq.protein": "text",
    "refseq.rna": "text",
    "refseq.translation": "object",
    "reporter": "object",
    "reporter.AraGene-1_0": "keyword",
    "reporter.BovGene-1_0": "keyword",
    "reporter.CanGene-1_0": "keyword",
    "reporter.ChiGene-1_0": "keyword",
    "reporter.CynGene-1_0": "keyword",
    "reporter.CyRGene-1_0": "keyword",
    "reporter.DroGene-1_0": "keyword",
    "reporter.EleGene-1_0": "keyword",
    "reporter.EquGene-1_0": "keyword",
    "reporter.FelGene-1_0": "keyword",
    "reporter.GNF1H": "keyword",
    "reporter.GNF1M": "keyword",
    "reporter.GuiGene-1_0": "keyword",
    "reporter.HG-U133_Plus_2": "keyword",
    "reporter.HG-U95Av2": "keyword",
    "reporter.HG-U95B": "keyword",
    "reporter.HTA-2_0": "keyword",
    "reporter.HuEx-1_0": "keyword",
    "reporter.HuGene-1_1": "keyword",
    "reporter.HuGene-2_1": "keyword",
    "reporter.MarGene-1_0": "keyword",
    "reporter.MG-U74Av2": "keyword",
    "reporter.MG-U74Bv2": "keyword",
    "reporter.MoEx-1_0": "keyword",
    "reporter.MoGene-1_1": "keyword",
    "reporter.MoGene-2_1": "keyword",
    "reporter.Mouse430_2": "keyword",
    "reporter.MTA-1_0": "keyword",
    "reporter.PorGene-1_0": "keyword",
    "reporter.RabGene-1_0": "keyword",
    "reporter.RaEx-1_0": "keyword",
    "reporter.RaGene-1_1": "keyword",
    "reporter.RaGene-2_1": "keyword",
    "reporter.Rat230_2": "keyword",
    "reporter.RCnGene-1_0": "keyword",
    "reporter.RG-U34A": "keyword",
    "reporter.RG-U34B": "keyword",
    "reporter.RheGene-1_0": "keyword",
    "reporter.RJpGene-1_0": "keyword",
    "reporter.RUSGene-1_0": "keyword",
    "reporter.snowball": "keyword",
    "reporter.SoyGene-1_0": "keyword",
    "reporter.ZebGene-1_0": "keyword",
    "retired": "long",
    "RGD": "keyword",
    "SGD": "keyword",
    "smpdb": "text",
    "summary": "text",
    "symbol": "keyword",
    "TAIR": "keyword",
    "taxid": "integer",
    "type_of_gene": "keyword",
    "umls": "object",
    "umls.cui": "keyword",
    "umls.protein_cui": "keyword",
    "unigene": "keyword",
    "uniprot": "object",
    "uniprot.Swiss-Prot": "keyword",
    "uniprot.TrEMBL": "keyword",
    "Vega": "text",
    "wikipathways": "text",
    "wikipedia": "object",
    "wikipedia.url_stub": "text",
    "WormBase": "keyword",
    "Xenbase": "keyword",
    "yeastcyc": "text",
    "ZFIN": "keyword",
}