from enum import IntEnum
from IPython.core.display import Markdown
#: Convenience lists of canonical chromosome names, keyed by genome label
CANONICAL_CHROMOSOMES = {
    # 22 numbered chromosomes followed by X, Y and M
    "GRCh38": [*(f"chr{n}" for n in range(1, 23)), "chrX", "chrY", "chrM"],
    # 19 numbered chromosomes followed by X, Y and MT
    "GRCm38": [*(f"chr{n}" for n in range(1, 20)), "chrX", "chrY", "chrMT"],
    "mm39": [*(f"chr{n}" for n in range(1, 20)), "chrX", "chrY", "chrMT"],
    # these names carry no 'chr' prefix
    "dmel": ["2L", "2R", "3L", "3R", "4", "X", "Y"],
}
#: Largest value of a signed 32-bit integer (2147483647)
MAX_INT = (1 << 31) - 1
#: Maps valid feature types (e.g., 'exon', 'CDS', '3UTR') to SO terms (e.g., '3UTR' -> 'three_prime_UTR')
FTYPE_TO_SO = {
    # gene-level feature types
    **dict.fromkeys(("gene", "ncRNA_gene"), "gene"),
    # transcript-level feature types
    **dict.fromkeys(
        (
            "transcript",
            "mRNA",
            "ncRNA",
            "lnc_RNA",
            "pseudogenic_transcript",
            "pre_miRNA",
            "rRNA",
            "snRNA",
            "snoRNA",
            "tRNA",
            "miRNA",
        ),
        "transcript",
    ),
    # feature types that already are the SO term
    **{ftype: ftype for ftype in ("exon", "intron", "CDS")},
    # UTR spellings, normalised to the SO term
    **dict.fromkeys(("three_prime_UTR", "3UTR", "UTR3"), "three_prime_UTR"),
    **dict.fromkeys(("five_prime_UTR", "5UTR", "UTR5"), "five_prime_UTR"),
}
#: Maps info field names for various GFF/GTF flavours, keyed by ``(flavour name, file format)``.
#: Every entry provides the same seven keys: ``gid``, ``tid``, ``tx_gid``, ``feat_tid`` and
#: ``gene_name`` hold info-field names (or None), ``ftype_to_SO`` holds a feature-type -> SO-term
#: mapping (a None value marks a feature type that is skipped) and ``copied_fields`` holds a list
#: of info-field names.
#: NOTE(review): judging by the names, gid/tid are the id fields of gene/transcript records and
#: tx_gid/feat_tid the fields linking a transcript to its gene and a sub-feature to its
#: transcript -- confirm against the code that consumes this table.
GFF_FLAVOURS = {
    ("gencode", "gff"): {
        "gid": "ID",
        "tid": "ID",
        "tx_gid": "Parent",
        "feat_tid": "Parent",
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("gencode", "gtf"): {
        "gid": "gene_id",
        "tid": "transcript_id",
        "tx_gid": "gene_id",
        "feat_tid": "transcript_id",
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("ensembl", "gff"): {
        "gid": "ID",
        "tid": "ID",
        "tx_gid": "Parent",
        "feat_tid": "Parent",
        "gene_name": "Name",
        # 'pseudogene' maps to 'gene' in ensembl but to a transcript in flybase
        "ftype_to_SO": {**FTYPE_TO_SO, "pseudogene": "gene"},
        "copied_fields": ["source", "gene_type"],
    },
    ("flybase", "gtf"): {
        "gid": "gene_id",
        "tid": "transcript_id",
        "tx_gid": "gene_id",
        "feat_tid": "transcript_id",
        "gene_name": "gene_symbol",
        # 'pseudogene' maps to a transcript in flybase but to 'gene' in ensembl
        "ftype_to_SO": {**FTYPE_TO_SO, "pseudogene": "transcript"},
        "copied_fields": ["source", "gene_type"],
    },
    ("wormbase", "gff"): {
        "gid": "ID",
        "tid": "ID",
        "tx_gid": "Parent",
        "feat_tid": "Parent",
        "gene_name": "Name",
        # NOTE: some wormbase files contain 'pseudogenic_transcript' features that are not
        # enveloped by the referenced genes, so they are mapped to None (which skips them).
        # snRNA features have no associated 'Parent' id and are skipped for that reason.
        "ftype_to_SO": {
            **FTYPE_TO_SO,
            "pseudogenic_transcript": None,
            "snRNA": None,
        },
        "copied_fields": ["source", "biotype", "so_term_name"],
    },
    ("ucsc", "gtf"): {
        "gid": None,
        "tid": "transcript_id",
        "tx_gid": "gene_id",
        "feat_tid": "transcript_id",
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("chess", "gff"): {
        "gid": None,
        "tid": "ID",
        "tx_gid": "Parent",
        "feat_tid": "Parent",
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("chess", "gtf"): {
        "gid": None,
        "tid": "transcript_id",
        "tx_gid": "gene_id",
        "feat_tid": "transcript_id",
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": ["source", "gene_type"],
    },
    ("mirgenedb", "gff"): {
        "gid": None,
        "tid": "ID",
        "tx_gid": None,
        "feat_tid": None,
        "gene_name": "Alias",
        # this flavour uses its own two-entry mapping instead of FTYPE_TO_SO
        "ftype_to_SO": {"pre_miRNA": "transcript", "miRNA": "transcript"},
        "copied_fields": ["source", "Alias"],
    },
    ("generic", "gff"): {
        "gid": "ID",
        "tid": "ID",
        "tx_gid": "Parent",
        "feat_tid": "Parent",
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": [],  # should be passed via constructor
    },
    ("generic", "gtf"): {
        "gid": "gene_id",
        "tid": "transcript_id",
        "tx_gid": "gene_id",
        "feat_tid": "transcript_id",
        "gene_name": "gene_name",
        "ftype_to_SO": FTYPE_TO_SO,
        "copied_fields": [],  # should be passed via constructor
    },
}
class BamFlag(IntEnum):
    """Bit values of the SAM/BAM alignment FLAG field.

    Each member is a single bit; because this is an ``IntEnum``, members can be
    combined with ``|`` (yielding a plain ``int``) and tested with ``&``.

    ``BAM_SUPPLEMENTARY`` is the original name of the 0x800 bit and remains the
    canonical member. ``BAM_FSUPPLEMENTARY`` is an alias for it that follows the
    ``BAM_F*`` naming of all other members.

    @see https://broadinstitute.github.io/picard/explain-flags.html
    """

    BAM_FPAIRED = 0x1  # the read is paired in sequencing, no matter whether it is mapped in a pair
    BAM_FPROPER_PAIR = 0x2  # the read is mapped in a proper pair
    BAM_FUNMAP = 0x4  # the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
    BAM_FMUNMAP = 0x8  # the mate is unmapped
    BAM_FREVERSE = 0x10  # the read is mapped to the reverse strand
    BAM_FMREVERSE = 0x20  # the mate is mapped to the reverse strand
    BAM_FREAD1 = 0x40  # this is read1
    BAM_FREAD2 = 0x80  # this is read2
    BAM_FSECONDARY = 0x100  # not primary alignment
    BAM_FQCFAIL = 0x200  # QC failure
    BAM_FDUP = 0x400  # optical or PCR duplicate
    BAM_SUPPLEMENTARY = 0x800  # supplementary alignment
    # Same value as the member above, so the enum machinery treats it as an alias:
    # it is not listed when iterating and BamFlag(0x800).name stays 'BAM_SUPPLEMENTARY'.
    BAM_FSUPPLEMENTARY = 0x800
#: default BAM flag filter (int 3844): unmapped, secondary, QC-failed, duplicate and supplementary
#: reads; comparable to samtools view -F 3844; also used as default filter in IGV.
DEFAULT_FLAG_FILTER = 0
for _flag in (
    BamFlag.BAM_FUNMAP,
    BamFlag.BAM_FSECONDARY,
    BamFlag.BAM_FQCFAIL,
    BamFlag.BAM_FDUP,
    BamFlag.BAM_SUPPLEMENTARY,
):
    # OR-ing an int with an IntEnum member yields a plain int
    DEFAULT_FLAG_FILTER |= _flag
del _flag  # keep the loop variable out of the module namespace
#: Markdown horizontal rule ("---") for separating output in jupyter notebooks; use as `display(SEP)`
SEP = Markdown("---")
#: Known mygeneinfo field names (dotted paths for nested fields) mapped to a type label
#: ('object', 'text', 'keyword', 'date', 'integer', 'long' or 'byte')
mygeneinfo_fields = {
    "accession": "object",
    "accession.genomic": "text",
    "accession.protein": "text",
    "accession.rna": "text",
    "accession.translation": "object",
    "alias": "keyword",
    "AnimalQTLdb": "text",
    "biocarta": "text",
    "clingen": "object",
    "clingen.clinical_validity": "object",
    "clingen.clinical_validity.classification": "keyword",
    "clingen.clinical_validity.classification_date": "date",
    "clingen.clinical_validity.disease_label": "text",
    "clingen.clinical_validity.gcep": "text",
    "clingen.clinical_validity.moi": "keyword",
    "clingen.clinical_validity.mondo": "keyword",
    "clingen.clinical_validity.online_report": "text",
    "clingen.clinical_validity.sop": "keyword",
    "ec": "keyword",
    "ensembl": "object",
    "ensembl.gene": "keyword",
    "ensembl.protein": "keyword",
    "ensembl.transcript": "keyword",
    "ensembl.translation": "object",
    "ensembl.type_of_gene": "keyword",
    "entrezgene": "keyword",
    "exac": "object",
    "exac.all": "object",
    "exac.bp": "integer",
    "exac.cds_end": "integer",
    "exac.cds_start": "integer",
    "exac.n_exons": "integer",
    "exac.nonpsych": "object",
    "exac.nontcga": "object",
    "exac.transcript": "text",
    "exons": "object",
    "exons_hg19": "object",
    "exons_mm10": "object",
    "exons_mm9": "object",
    "FLYBASE": "keyword",
    "generif": "object",
    "generif.pubmed": "long",
    "generif.text": "text",
    "genomic_pos": "object",
    "genomic_pos.chr": "keyword",
    "genomic_pos.end": "long",
    "genomic_pos.start": "long",
    "genomic_pos.strand": "byte",
    "genomic_pos_hg19": "object",
    "genomic_pos_hg19.chr": "keyword",
    "genomic_pos_hg19.end": "long",
    "genomic_pos_hg19.start": "long",
    "genomic_pos_hg19.strand": "byte",
    "genomic_pos_mm9": "object",
    "genomic_pos_mm9.chr": "keyword",
    "genomic_pos_mm9.end": "long",
    "genomic_pos_mm9.start": "long",
    "genomic_pos_mm9.strand": "byte",
    "go": "object",
    "go.BP": "object",
    "go.BP.category": "text",
    "go.BP.evidence": "text",
    "go.BP.id": "keyword",
    "go.BP.pubmed": "long",
    "go.BP.term": "text",
    "go.CC": "object",
    "go.CC.category": "text",
    "go.CC.evidence": "text",
    "go.CC.id": "keyword",
    "go.CC.pubmed": "long",
    "go.CC.term": "text",
    "go.MF": "object",
    "go.MF.category": "text",
    "go.MF.evidence": "text",
    "go.MF.id": "keyword",
    "go.MF.pubmed": "long",
    "go.MF.term": "text",
    "HGNC": "keyword",
    "homologene": "object",
    "homologene.genes": "long",
    "homologene.id": "long",
    "HPRD": "keyword",
    "humancyc": "text",
    "interpro": "object",
    "interpro.desc": "text",
    "interpro.id": "keyword",
    "interpro.short_desc": "text",
    "ipi": "keyword",
    "kegg": "text",
    "locus_tag": "keyword",
    "map_location": "text",
    "MGI": "keyword",
    "MIM": "keyword",
    "miRBase": "keyword",
    "mousecyc": "text",
    "name": "text",
    "netpath": "text",
    "other_names": "text",
    "pantherdb": "object",
    "pantherdb.Araport": "keyword",
    "pantherdb.dictyBase": "keyword",
    "pantherdb.EcoGene": "keyword",
    "pantherdb.Ensembl": "keyword",
    "pantherdb.EnsemblGenome": "keyword",
    "pantherdb.FlyBase": "keyword",
    "pantherdb.Gene": "keyword",
    "pantherdb.Gene_Name": "keyword",
    "pantherdb.Gene_OrderedLocusName": "keyword",
    "pantherdb.Gene_ORFName": "keyword",
    "pantherdb.GeneID": "keyword",
    "pantherdb.HGNC": "keyword",
    "pantherdb.MGI": "keyword",
    "pantherdb.ortholog": "object",
    "pantherdb.ortholog.Araport": "keyword",
    "pantherdb.ortholog.dictyBase": "keyword",
    "pantherdb.ortholog.EcoGene": "keyword",
    "pantherdb.ortholog.Ensembl": "keyword",
    "pantherdb.ortholog.EnsemblGenome": "keyword",
    "pantherdb.ortholog.FlyBase": "keyword",
    "pantherdb.ortholog.Gene": "keyword",
    "pantherdb.ortholog.Gene_Name": "keyword",
    "pantherdb.ortholog.Gene_OrderedLocusName": "keyword",
    "pantherdb.ortholog.Gene_ORFName": "keyword",
    "pantherdb.ortholog.GeneCards": "keyword",
    "pantherdb.ortholog.GeneID": "keyword",
    "pantherdb.ortholog.HGNC": "keyword",
    "pantherdb.ortholog.MGI": "keyword",
    "pantherdb.ortholog.ortholog_type": "keyword",
    "pantherdb.ortholog.panther_family": "keyword",
    "pantherdb.ortholog.PomBase": "keyword",
    "pantherdb.ortholog.RGD": "keyword",
    "pantherdb.ortholog.SGD": "keyword",
    "pantherdb.ortholog.TAIR": "keyword",
    "pantherdb.ortholog.taxid": "integer",
    "pantherdb.ortholog.uniprot_kb": "keyword",
    "pantherdb.ortholog.WormBase": "keyword",
    "pantherdb.ortholog.ZFIN": "keyword",
    "pantherdb.PomBase": "keyword",
    "pantherdb.RGD": "keyword",
    "pantherdb.SGD": "keyword",
    "pantherdb.TAIR": "keyword",
    "pantherdb.uniprot_kb": "keyword",
    "pantherdb.WormBase": "keyword",
    "pantherdb.ZFIN": "keyword",
    "pathway": "object",
    "pathway.biocarta": "object",
    "pathway.biocarta.id": "text",
    "pathway.biocarta.name": "text",
    "pathway.humancyc": "object",
    "pathway.humancyc.id": "text",
    "pathway.humancyc.name": "text",
    "pathway.kegg": "object",
    "pathway.kegg.id": "text",
    "pathway.kegg.name": "text",
    "pathway.mousecyc": "object",
    "pathway.mousecyc.id": "text",
    "pathway.mousecyc.name": "text",
    "pathway.netpath": "object",
    "pathway.netpath.id": "text",
    "pathway.netpath.name": "text",
    "pathway.pharmgkb": "object",
    "pathway.pharmgkb.id": "text",
    "pathway.pharmgkb.name": "text",
    "pathway.pid": "object",
    "pathway.pid.id": "text",
    "pathway.pid.name": "text",
    "pathway.reactome": "object",
    "pathway.reactome.id": "text",
    "pathway.reactome.name": "text",
    "pathway.smpdb": "object",
    "pathway.smpdb.id": "text",
    "pathway.smpdb.name": "text",
    "pathway.wikipathways": "object",
    "pathway.wikipathways.id": "text",
    "pathway.wikipathways.name": "text",
    "pathway.yeastcyc": "object",
    "pathway.yeastcyc.id": "text",
    "pathway.yeastcyc.name": "text",
    "pdb": "keyword",
    "pfam": "keyword",
    "pharmgkb": "keyword",
    "pharos": "object",
    "pharos.target_id": "integer",
    "pid": "text",
    "pir": "keyword",
    "prosite": "keyword",
    "RATMAP": "keyword",
    "reactome": "text",
    "reagent": "object",
    "reagent.CM-LibrX-no-seq": "object",
    "reagent.CM-LibrX-no-seq.id": "keyword",
    "reagent.CM-LibrX-no-seq.relationship": "text",
    "reagent.CondMedia_CM_LibrAB": "object",
    "reagent.CondMedia_CM_LibrAB.id": "keyword",
    "reagent.CondMedia_CM_LibrAB.relationship": "text",
    "reagent.GNF_hs-druggable_lenti-shRNA": "object",
    "reagent.GNF_hs-druggable_lenti-shRNA.id": "keyword",
    "reagent.GNF_hs-druggable_lenti-shRNA.relationship": "text",
    "reagent.GNF_hs-druggable_plasmid-shRNA": "object",
    "reagent.GNF_hs-druggable_plasmid-shRNA.id": "keyword",
    "reagent.GNF_hs-druggable_plasmid-shRNA.relationship": "text",
    "reagent.GNF_hs-druggable_siRNA": "object",
    "reagent.GNF_hs-druggable_siRNA.id": "keyword",
    "reagent.GNF_hs-druggable_siRNA.relationship": "text",
    "reagent.GNF_hs-GPCR_IDT-siRNA": "object",
    "reagent.GNF_hs-GPCR_IDT-siRNA.id": "keyword",
    "reagent.GNF_hs-GPCR_IDT-siRNA.relationship": "text",
    "reagent.GNF_hs-oncomine_IDT-siRNA": "object",
    "reagent.GNF_hs-oncomine_IDT-siRNA.id": "keyword",
    "reagent.GNF_hs-oncomine_IDT-siRNA.relationship": "text",
    "reagent.GNF_hs-ORFeome1_1_reads": "object",
    "reagent.GNF_hs-ORFeome1_1_reads.id": "keyword",
    "reagent.GNF_hs-ORFeome1_1_reads.relationship": "text",
    "reagent.GNF_hs-Origene": "object",
    "reagent.GNF_hs-Origene.id": "keyword",
    "reagent.GNF_hs-Origene.relationship": "text",
    "reagent.GNF_hs-pkinase_IDT-siRNA": "object",
    "reagent.GNF_hs-pkinase_IDT-siRNA.id": "keyword",
    "reagent.GNF_hs-pkinase_IDT-siRNA.relationship": "text",
    "reagent.GNF_hs_LentiORF-HA-MYC": "object",
    "reagent.GNF_hs_LentiORF-HA-MYC.id": "keyword",
    "reagent.GNF_hs_LentiORF-HA-MYC.relationship": "text",
    "reagent.GNF_hs_LentiORF-Jred": "object",
    "reagent.GNF_hs_LentiORF-Jred.id": "keyword",
    "reagent.GNF_hs_LentiORF-Jred.relationship": "text",
    "reagent.GNF_mm+hs-MGC": "object",
    "reagent.GNF_mm+hs-MGC.id": "keyword",
    "reagent.GNF_mm+hs-MGC.relationship": "text",
    "reagent.GNF_mm+hs_RetroCDNA": "object",
    "reagent.GNF_mm+hs_RetroCDNA.id": "keyword",
    "reagent.GNF_mm+hs_RetroCDNA.relationship": "text",
    "reagent.GNF_mm-GIPZ_shRNA": "object",
    "reagent.GNF_mm-GIPZ_shRNA.id": "keyword",
    "reagent.GNF_mm-GIPZ_shRNA.relationship": "text",
    "reagent.GNF_mm-kinase_lenti-shRNA": "object",
    "reagent.GNF_mm-kinase_lenti-shRNA.id": "keyword",
    "reagent.GNF_mm-kinase_lenti-shRNA.relationship": "text",
    "reagent.GNF_mm-kinase_plasmid-shRNA": "object",
    "reagent.GNF_mm-kinase_plasmid-shRNA.id": "keyword",
    "reagent.GNF_mm-kinase_plasmid-shRNA.relationship": "text",
    "reagent.GNF_mm-TLR_lenti_shRNA": "object",
    "reagent.GNF_mm-TLR_lenti_shRNA.id": "keyword",
    "reagent.GNF_mm-TLR_lenti_shRNA.relationship": "text",
    "reagent.GNF_Qia_hs-genome_v1_siRNA": "object",
    "reagent.GNF_Qia_hs-genome_v1_siRNA.id": "keyword",
    "reagent.GNF_Qia_hs-genome_v1_siRNA.relationship": "text",
    "reagent.IDT_27mer_hs_ATPase_siRNAs": "object",
    "reagent.IDT_27mer_hs_ATPase_siRNAs.id": "keyword",
    "reagent.IDT_27mer_hs_ATPase_siRNAs.relationship": "text",
    "reagent.Invitrogen_IVTHSSIPKv2": "object",
    "reagent.Invitrogen_IVTHSSIPKv2.id": "keyword",
    "reagent.Invitrogen_IVTHSSIPKv2.relationship": "text",
    "reagent.MasterSecretomicsList": "object",
    "reagent.MasterSecretomicsList.id": "keyword",
    "reagent.MasterSecretomicsList.relationship": "text",
    "reagent.NIBRI_hs-Secretome_pDEST": "object",
    "reagent.NIBRI_hs-Secretome_pDEST.id": "keyword",
    "reagent.NIBRI_hs-Secretome_pDEST.relationship": "text",
    "reagent.NOVART_hs-genome_siRNA": "object",
    "reagent.NOVART_hs-genome_siRNA.id": "keyword",
    "reagent.NOVART_hs-genome_siRNA.relationship": "text",
    "reagent.Qiagen_mouse_QMIHSINHIBv1": "object",
    "reagent.Qiagen_mouse_QMIHSINHIBv1.id": "keyword",
    "reagent.Qiagen_mouse_QMIHSINHIBv1.relationship": "text",
    "reagent.Qiagen_mouse_QMIHSMIMv1": "object",
    "reagent.Qiagen_mouse_QMIHSMIMv1.id": "keyword",
    "reagent.Qiagen_mouse_QMIHSMIMv1.relationship": "text",
    "refseq": "object",
    "refseq.genomic": "text",
    "refseq.protein": "text",
    "refseq.rna": "text",
    "refseq.translation": "object",
    "reporter": "object",
    "reporter.AraGene-1_0": "keyword",
    "reporter.BovGene-1_0": "keyword",
    "reporter.CanGene-1_0": "keyword",
    "reporter.ChiGene-1_0": "keyword",
    "reporter.CynGene-1_0": "keyword",
    "reporter.CyRGene-1_0": "keyword",
    "reporter.DroGene-1_0": "keyword",
    "reporter.EleGene-1_0": "keyword",
    "reporter.EquGene-1_0": "keyword",
    "reporter.FelGene-1_0": "keyword",
    "reporter.GNF1H": "keyword",
    "reporter.GNF1M": "keyword",
    "reporter.GuiGene-1_0": "keyword",
    "reporter.HG-U133_Plus_2": "keyword",
    "reporter.HG-U95Av2": "keyword",
    "reporter.HG-U95B": "keyword",
    "reporter.HTA-2_0": "keyword",
    "reporter.HuEx-1_0": "keyword",
    "reporter.HuGene-1_1": "keyword",
    "reporter.HuGene-2_1": "keyword",
    "reporter.MarGene-1_0": "keyword",
    "reporter.MG-U74Av2": "keyword",
    "reporter.MG-U74Bv2": "keyword",
    "reporter.MoEx-1_0": "keyword",
    "reporter.MoGene-1_1": "keyword",
    "reporter.MoGene-2_1": "keyword",
    "reporter.Mouse430_2": "keyword",
    "reporter.MTA-1_0": "keyword",
    "reporter.PorGene-1_0": "keyword",
    "reporter.RabGene-1_0": "keyword",
    "reporter.RaEx-1_0": "keyword",
    "reporter.RaGene-1_1": "keyword",
    "reporter.RaGene-2_1": "keyword",
    "reporter.Rat230_2": "keyword",
    "reporter.RCnGene-1_0": "keyword",
    "reporter.RG-U34A": "keyword",
    "reporter.RG-U34B": "keyword",
    "reporter.RheGene-1_0": "keyword",
    "reporter.RJpGene-1_0": "keyword",
    "reporter.RUSGene-1_0": "keyword",
    "reporter.snowball": "keyword",
    "reporter.SoyGene-1_0": "keyword",
    "reporter.ZebGene-1_0": "keyword",
    "retired": "long",
    "RGD": "keyword",
    "SGD": "keyword",
    "smpdb": "text",
    "summary": "text",
    "symbol": "keyword",
    "TAIR": "keyword",
    "taxid": "integer",
    "type_of_gene": "keyword",
    "umls": "object",
    "umls.cui": "keyword",
    "umls.protein_cui": "keyword",
    "unigene": "keyword",
    "uniprot": "object",
    "uniprot.Swiss-Prot": "keyword",
    "uniprot.TrEMBL": "keyword",
    "Vega": "text",
    "wikipathways": "text",
    "wikipedia": "object",
    "wikipedia.url_stub": "text",
    "WormBase": "keyword",
    "Xenbase": "keyword",
    "yeastcyc": "text",
    "ZFIN": "keyword",
}