Source code for seqtools.utils

import subprocess
import tempfile
import gzip

try:
    transtab = str.maketrans('ACGTNacgtn','TGCANtgcan')
except:
    import string
    transtab = string.maketrans('ACGTNacgtn','TGCANtgcan')


revcompdict = {}
[docs]def revcomp(sequence):
    """Reverse complement a string

    :param sequence: The DNA string (all caps)

    This function includes a caching mechanism, so watch memory usage!
    """
    if(sequence in revcompdict):
        return revcompdict[sequence]
    tmp=sequence[::-1].translate(transtab)
    revcompdict[sequence]=tmp
    return(tmp)


[docs]def fileOpen(fname,mode='rt',encoding='latin-1'):
    """Open a file, including gzip files

    :param fname: The filename to open.  Gzip files are distinguished by ending in '.gz'
    :param mode: File mode for opening [default 'r']
    :returns: An open file handle

    Note: the gzip module in python is REALLY slow, so this function uses
    subprocess and command-line gzip instead.
    """
    # gzip in python is REALLY slow, so use pipes instead.
    if(fname.endswith('.gz')):
        return gzip.open(fname,mode=mode,encoding=encoding)
    else:
        return open(fname,mode)


[docs]def sortVcfBySequence(vcf,seqnames,seqmap=None):
    """Sort a tabixed VCF file by sequence names

    >>> import vcf
    >>> v = vcf.Reader('vcf.gz')
    >>> from seqtools.utils import sortVcfBySequence
    >>> w = vcf.Writer(open('sorted.vcf','w'),v)
    >>> faifile = open('ucsc.hg19.fasta.fai')
    >>> seqnames = [x.split('\t')[0] for x in faifile]
    >>> for rec in sortVcfBySequence(v,seqnames):
        w.write_record(rec)
    >>> w.close()
    >>> v.close()"""

    if(seqmap):
        revseqmap = dict([(v,k) for (k,v) in list(seqmap.items())])

    print(seqmap)
    
    for s in seqnames:
        # seqmap maps seqnames to tabix index names
        if(seqmap is not None):
            region = vcf.fetch(seqmap[s],start=0)
        else:
            try:
                region = vcf.fetch(s,start=0)
                print(region)
            except ValueError:
                print("here",s)
                continue
        print(region)
        print(s)
        for rec in region:
            if(seqmap is not None):
                rec.CHROM=revseqmap[rec.CHROM]
            yield rec