Source code for rnalib.tools

"""
Implementation of rnalib command line tools.
"""
import ast
import hashlib
import json
import logging
import os
from collections import Counter
from pathlib import Path

import intervaltree
import pandas as pd

import pysam

import rnalib as rna
from tqdm.auto import tqdm

from rnalib import DEFAULT_FLAG_FILTER



[docs]
def prune_tags(bam_in, bam_out, kept_tags=("NH", "HI", "AS")):
    """
    Parameters
    ----------
    bam_in
        input bam file
    bam_out
        output bam file
    kept_tags
        list of tags to be kept

    Example
    -------
    >>> prune_tags('small.ACTB+SOX2.bam', 'small.ACTB+SOX2.clean.bam')

    """
    with rna.ReadIterator(bam_in, flag_filter=0) as it:
        with pysam.AlignmentFile(bam_out, "wb", template=it.file) as out:
            for _, r in tqdm(it):
                for tag in r.get_tags():
                    if tag[0] not in kept_tags:
                        r.set_tag(tag[0], None)
                out.write(r)




[docs]
def tag_tc(
    bam_file,
    included_chrom=None,
    snp_vcf_file=None,
    out_prefix=None,
    fractional_counts=False,
    write_density_histogram=True,
    tags=None,
    min_mapping_quality=0,
    flag_filter=DEFAULT_FLAG_FILTER,
    min_base_quality=10,
    reverse_strand=False,
):
    """
    Determines the T/C status per read while masking SNP positions and create density histogram per chrom.
    Will output two files: a BAM file with the T/C status encoded in the XC tag and a TSV file containing the T/C
    density per chrom.

    Parameters
    ----------
    bam_file : str
        bam file to be analysed
    included_chrom : list
        list of chromosomes to be included (subset of reference dict)
    snp_vcf_file : str
        VCF file containing variant calls from the respective cell line. T/C and A/G SNPs will be masked
    out_prefix : str
        output file prefix
    fractional_counts : bool
        if set, reads will be counted by 1/NH
    write_density_histogram : bool
        if set, a (gzipped) TSV file containing T/C density data will be written. default is True
    tags : dict
        dictionary containing the BAM tags to be used for the output BAM file.
        Default is {'ntc': 'xc', 'ntt': 'xt', 'col': 'YC'}
    min_mapping_quality : int
        minimum mapping quality
    flag_filter : int
        flag filter for filtering reads.
    min_base_quality: int
        minimum base quality
    reverse_strand: bool
        if set, the reverse strand will be considered as the conversion strand. Default is False.

    """
    if tags is None:
        tags = {"ntc": "xc", "ntt": "xt", 'mmtc': "xm", "col": "YC"}  # Used BAM tags
    elif isinstance(tags, str):
        tags = ast.literal_eval(tags)  # parse from string
    # examples: "1,2,3,4", "(1, 2, 3)"
    if included_chrom is not None and isinstance(included_chrom, str):
        try:
            included_chrom = ast.literal_eval(included_chrom)
        except ValueError as e:  # noqa
            included_chrom = included_chrom.split(",")  # parse from string
    if included_chrom is not None:
        included_chrom = [str(x) for x in included_chrom]
        logging.info("Considered chromosomes: %s" % included_chrom)
    out_prefix = rna.remove_extension(bam_file) if out_prefix is None else out_prefix
    out_bam_file = f"{out_prefix}+tc.bam"
    out_tsv_file = f"{out_prefix}.density.tsv"
    profile = Counter()
    # reference dict containing canonical chrom in correct order
    if snp_vcf_file is None:
        refdict = rna.RefDict.merge_and_validate(
            rna.RefDict.load(bam_file), included_chrom=included_chrom
        )
    else:
        refdict = rna.RefDict.merge_and_validate(
            rna.RefDict.load(bam_file),
            rna.RefDict.load(snp_vcf_file),
            included_chrom=included_chrom,
        )
    # print(f'Considered chromosomes: {refdict.keys()}')
    with rna.ReadIterator(
        bam_file,
        report_mismatches=False,
        min_mapping_quality=min_mapping_quality,
        flag_filter=flag_filter,
        min_base_quality=min_base_quality,
    ) as it:
        with pysam.AlignmentFile(
            out_bam_file, "wb", template=it.file
        ) as out:  # @UndefinedVariable
            with tqdm(total=it.max_items()) as pbar:
                for chrom in refdict:
                    reg = rna.gi(chrom)
                    it.set_region(reg)
                    # get genomic positions of all T/C and A/G snps in the considered region
                    masked_pos = (
                        set(
                            [
                                loc.start
                                for loc, snp in rna.VcfIterator(
                                    snp_vcf_file, region=reg
                                )
                                if ((snp.ref, snp.alt) == ("T", "C"))
                                or ((snp.ref, snp.alt) == ("A", "G"))
                            ]
                        )
                        if snp_vcf_file is not None
                        else set()
                    )
                    pbar.set_description(
                        f"{chrom} with {len(masked_pos)} masked positions"
                    )
                    for loc, r in it:
                        pbar.update(1)
                        is_rev = not r.is_reverse if r.is_read2 else r.is_reverse
                        if reverse_strand:
                            is_rev = not is_rev
                        refc = "A" if is_rev else "T"
                        altc = "G" if is_rev else "C"
                        mm = [
                            (off, pos + 1, ref.upper(), r.query_sequence[off])
                            for (off, pos, ref) in r.get_aligned_pairs(
                                with_seq=True, matches_only=True
                            )
                            if (ref.upper() == refc)
                            and (r.query_qualities[off] >= it.min_base_quality)
                            and (pos + 1 not in masked_pos)
                        ]
                        mm_tc = [
                            (off, pos1, ref, alt)
                            for off, pos1, ref, alt in mm
                            if alt == altc
                        ]
                        # count value of this read. if fractional_counts is True: count by 1/NH
                        cnt = (
                            (1 / r.get_tag("NH") if r.has_tag("NH") else 1)
                            if fractional_counts
                            else 1
                        )
                        profile[r.reference_name, refc, len(mm), len(mm_tc)] += cnt
                        # set bam tags, etc.
                        r.set_tag(tag=tags["ntt"], value=len(mm), value_type="i")
                        r.set_tag(tag=tags["ntc"], value=len(mm_tc), value_type="i")
                        r.set_tag(
                            tag=tags["mmtc"],
                            value=','.join(
                                [str(off) for (off, pos1, ref, alt) in mm_tc]
                            ),
                            value_type="Z",
                        )
                        if len(mm_tc) > 0:
                            ftc = len(mm_tc) / len(mm)
                            r.set_tag(
                                tag=tags["col"],
                                value=f"{255 - int(255 * ftc)},0,0",
                                value_type="Z",
                            )  # set color relative to conversion rate
                        else:
                            r.set_tag(tag=tags["col"], value=None)  # delete tag
                        out.write(r)
    try:
        pysam.index(out_bam_file)  # @UndefinedVariable
    except Exception as e:
        print("error indexing bam: %s" % e)
    # filter for min number of counts
    profile = pd.DataFrame(
        [list(k) + [v] for k, v in profile.items()],
        columns=["chromosome", "ref", "convertible", "converted", "count"],
    )
    if write_density_histogram:
        profile.to_csv(out_tsv_file, index=False, sep="\t")
        rna.bgzip_and_tabix(out_tsv_file, create_index=False)  # bgzip
    else:
        out_tsv_file = None
    return profile, out_bam_file, out_tsv_file




[docs]
def quantise_values(values, bins=10):
    """
    Quantise values into bins.

    Parameters
    ----------
    values : list
        list of values to be quantised
    bins : int
        number of bins

    Returns
    -------
    list
        list of quantised values
    """
    return pd.cut(values, bins, labels=False)




[docs]
def filter_tc(bam_file, out_file=None, min_tc=1, tags=None):
    """
    Filters a T/C annotated BAM file for reads with at least min_tc T/C conversions.

    Parameters
    ----------
    bam_file : str
        T/C annotated bam file to be analysed
    out_file : str
        output file. Default is {bam_file_name}_tc-only.bam
    min_tc : int
        minimum number of T/C conversions
    tags : dict
        dictionary containing the BAM tags to be used for the output BAM file.
        Default is {'ntc': 'xc', 'ntt': 'xt', 'col': 'YC'}
    """
    if tags is None:
        tags = {"ntc": "xc", "ntt": "xt", "col": "YC"}  # Used BAM tags)
    elif isinstance(tags, str):
        tags = ast.literal_eval(tags)  # parse from string
    if min_tc is None:
        min_tc = 1
    if out_file is None:
        out_file = f"{Path(bam_file).stem}_tc-only.bam"
    with rna.ReadIterator(
        bam_file,
        report_mismatches=False,
        min_mapping_quality=0,
        flag_filter=0,
        min_base_quality=0,
    ) as it:
        with pysam.AlignmentFile(
            out_file, "wb", template=it.file
        ) as out:  # @UndefinedVariable
            for loc, r in tqdm(it, total=it.max_items()):
                if r.get_tag(tags["ntc"], 0) >= min_tc:
                    out.write(r)
    try:
        pysam.index(out_file)  # @UndefinedVariable
    except Exception as e:
        print("error indexing bam: %s" % e)
    return out_file




[docs]
def build_amplicon_resources(
    transcriptome_name, bed_file, fasta_file, out_dir, padding=100, amp_extension=100
):
    """
    Builds resources for amplicon analyses from a bed file.

    This tool creates all required genomics resources for downstream analyses, such as the FASTA file with the
    (possibly padded) sequences, a GFF file that annotates the amplicons in transcriptome coordinates and various
    indexing data structures.

    Intervals are parsed from the bed file (and possibly extended and padded= and sequences are extracted from the
    fasta file. The sequences are written to a FASTA file. For downstream analyses, a chrom.sizes file,
    a .dict file, a GFF3 file and a BED file are created. Additionally, a log file is written to the output
    directory.

    The parsed intervals are merged and only one 'amplicon' per merged interval is written to the FASTA file.
    The GFF3 file contains the amplicon coordinates and the BED file contains the amplicon coordinates and the
    original bed item score and strand information.

    The created amplicon sequences may be padded 3' and 5' with Ns (needed by some mappers). The amplicon sequences
    can also be extended 3' and 5' which is recommended to allow mappers to map reads that extend beyond the
    amplicon coordinates. The created FASTA file is indexed with samtools faidx. The GFF3 and BED files are bgzipped
    and tabix indexed.

    Parameters
    ----------
    transcriptome_name : str
        name of the transcriptome
    bed_file : str
        bed file containing the amplicon coordinates
    fasta_file : str
        fasta file containing the reference genome
    out_dir : str
        output directory
    padding : int
        padding to be added to the amplicon sequences
    amp_extension : int
        extension of the amplicon sequences

    Example
    -------
    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as tmpdir:
    >>>     # create a small BED file with two defined amplicons that map to the genomic positions in the FASTA file
    >>>     with open(f'{tmpdir}/test.bed', 'wt') as out:
    >>>         rna.MemoryIterator({'amp1.1': rna.gi('chr3:1-100'), 'amp1.2': rna.gi('chr3:50-150'),
    >>>             'amp2': rna.gi('chr7:10-50')}).to_bed(out, no_header=True)
    >>>     # build amplicon resources
    >>>     rna.tools.build_amplicon_resources('test_transcriptome', bed_file=...,
    >>>         fasta_file=rna.get_resource('ACTB+SOX2_genome'), padding=50, amp_extension=10, out_dir=tmpdir)
    >>>     rna.print_dir_tree(tmpdir) # show the created resources

    Returns
    -------
    log: dict
        a dictionary containing information about the created resources
    """
    print(f"extracting sequences for {transcriptome_name}")
    refdict = rna.RefDict.load(fasta_file)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    log = Counter()
    # get amplicon coordinates
    ampcoords = {}
    for loc, bed_item in rna.BedIterator(bed_file):
        if loc.chromosome not in refdict:
            print(f"Chromosome {loc.chromosome} not found in reference dict. Skipping.")
            continue
        ampchrom = f"{bed_item.name}_{loc.chromosome}_{loc.start}_{loc.end}"
        ampstart = max(1, loc.start - amp_extension)  # avoid negative start
        ampend = loc.end + amp_extension
        if loc.chromosome not in ampcoords:
            ampcoords[loc.chromosome] = intervaltree.IntervalTree()
        ampcoords[loc.chromosome].addi(ampstart, ampend, ampchrom)
        log["parsed_intervals"] += 1
    for chrom in ampcoords:  # merge intervals and concat names
        ampcoords[chrom].merge_overlaps(
            data_reducer=lambda a, b: "+".join([a, b]), strict=False
        )
    # create FASTA + GFF
    out_file_fasta = f"{out_dir}/{transcriptome_name}.fa"
    out_file_chrsize = f"{out_dir}/{transcriptome_name}.fa.chrom.sizes"
    out_file_dict = f"{out_dir}/{transcriptome_name}.fa.dict"
    out_file_gff3 = f"{out_dir}/{transcriptome_name}.gff3"
    out_file_bed = f"{out_dir}/{transcriptome_name}.bed"
    written_fa = set()
    with open(out_file_fasta, "w") as out_fasta:
        with open(out_file_chrsize, "w") as out_chrsize:
            with open(out_file_dict, "w") as out_dict:
                with open(out_file_gff3, "w") as out_gff3:
                    with open(out_file_bed, "w") as out_bed:
                        with pysam.FastaFile(
                            fasta_file
                        ) as genome:  # @UndefinedVariable
                            for loc, bed_item in rna.BedIterator(bed_file):
                                if loc.chromosome not in refdict:
                                    continue  # skip
                                gi = next(
                                    iter(ampcoords[loc.chromosome][loc.start : loc.end])
                                )  # get amplicon
                                # interval (genomic coords)
                                ampstart, ampend, ampchrom = gi.begin, gi.end, gi.data
                                offset = padding + (loc.start - ampstart) + 1
                                seq = (
                                    "N" * padding
                                    + genome.fetch(loc.chromosome, ampstart - 1, ampend)
                                    + "N" * padding
                                )
                                ampchrom_len = len(seq)
                                log["mean_amplicon_length"] += ampchrom_len
                                # FASTA
                                if ampchrom not in written_fa:
                                    log["written_amplicons"] += 1
                                    print(">%s" % ampchrom, file=out_fasta)
                                    print(rna.format_fasta(seq), file=out_fasta)
                                    # chrom.sizes file
                                    print(
                                        "%s\t%i" % (ampchrom, ampchrom_len),
                                        file=out_chrsize,
                                    )
                                    # DICT
                                    print(
                                        "@SQ\tSN:%s\tLN:%i\tM5:%s\tUR:file:%s"
                                        % (
                                            ampchrom,
                                            ampchrom_len,
                                            hashlib.md5(
                                                seq.encode("utf-8")
                                            ).hexdigest(),
                                            # M5 MD5 checksum of the sequence in the uppercase,
                                            # excluding spaces but including pads (as ‘*’s)
                                            os.path.abspath(out_file_fasta),
                                        ),
                                        file=out_dict,
                                    )
                                    written_fa.add(ampchrom)
                                # BED
                                print(
                                    "\t".join(
                                        [
                                            str(x)
                                            for x in [
                                                ampchrom,
                                                offset - 1,
                                                offset + loc.end - loc.start,
                                                bed_item.name,
                                                "."
                                                if bed_item.score is None
                                                else bed_item.score,
                                                "."
                                                if loc.strand is None
                                                else loc.strand,
                                            ]
                                        ]
                                    ),
                                    file=out_bed,
                                )
                                # GFF3
                                print(
                                    "\t".join(
                                        [
                                            str(x)
                                            for x in [
                                                ampchrom,
                                                "transcriptome_tools",
                                                "transcript",
                                                offset,
                                                offset + loc.end - loc.start,
                                                "."
                                                if bed_item.score is None
                                                else bed_item.score,
                                                "."
                                                if loc.strand is None
                                                else loc.strand,
                                                0,
                                                f"gene_name={bed_item.name}",
                                            ]
                                        ]
                                    ),
                                    file=out_gff3,
                                )
    log["mean_amplicon_length"] = log["mean_amplicon_length"] / log["written_amplicons"]
    out_file_log = f"{out_dir}/{transcriptome_name}.log"
    with open(out_file_log, "wt") as file:
        file.write(json.dumps(log))
    # compress + index output files
    rna.bgzip_and_tabix(
        out_file_gff3,
        sort=True,
        seq_col=0,
        start_col=3,
        end_col=4,
        line_skip=0,
        zerobased=False,
    )
    rna.bgzip_and_tabix(
        out_file_bed,
        sort=True,
        seq_col=0,
        start_col=1,
        end_col=2,
        line_skip=0,
        zerobased=True,
    )
    pysam.faidx(out_file_fasta)  # @UndefinedVariable
    print("Created resources:")
    print("FASTA file + idx:\t" + out_file_fasta)
    print("CHROMSIZE file:\t" + out_file_chrsize)
    print("DICT file:\t" + out_file_dict)
    print("GFF file + idx:\t" + out_file_gff3)
    print("BED file + idx:\t" + out_file_bed)
    return log




[docs]
def calculate_mm_profile(
    bam_file,
    gff_file,
    fasta_file,
    annotation_flavour="gencode",
    min_cov=10,
    max_mm_frac=0.1,
    max_sample=1e6,
    strand_specific=True,
    out_file=None,
):
    """
    Calculates the mismatch profile of a BAM file from random genes.

    Parameters
    ----------
    bam_file: str
        input BAM file
    gff_file: str
        gene annotation GFF file
    fasta_file:
        Reference sequence FASTA file
    annotation_flavour: str
        The annotation flavour of the GFF file. Default is 'gencode'.
    min_cov : int
        The minimum coverage required for a position to be considered.
    max_mm_frac : float
        The maximum mismatch fraction allowed for a position to be considered.
    max_sample : int
        The maximum number of mismatches to count.
    strand_specific : bool
        If True, the strand-specific profile is created.
    out_file: str
        TSV file to write the mismatch profile to. Deafult: {bam_file}.mm.tsv

    """
    t = rna.Transcriptome(
        annotation_gff=gff_file,
        annotation_flavour=annotation_flavour,
        genome_fa=fasta_file,
        load_sequence_data=True,
    )
    out_file = (
        rna.remove_extension(bam_file) + '.mm.tsv' if out_file is None else out_file
    )
    se = rna.MismatchProfile.from_bam(
        bam_file,
        features=t.genes,
        min_cov=min_cov,
        max_mm_frac=max_mm_frac,
        max_sample=max_sample,
        strand_specific=strand_specific,
    )
    se.to_tsv(out_file)