Source code for seqtools.utils

import subprocess
import tempfile
import gzip

try:
    transtab = str.maketrans('ACGTNacgtn','TGCANtgcan')
except:
    import string
    transtab = string.maketrans('ACGTNacgtn','TGCANtgcan')


revcompdict = {}
[docs]def revcomp(sequence): """Reverse complement a string :param sequence: The DNA string (all caps) This function includes a caching mechanism, so watch memory usage! """ if(sequence in revcompdict): return revcompdict[sequence] tmp=sequence[::-1].translate(transtab) revcompdict[sequence]=tmp return(tmp)
[docs]def fileOpen(fname,mode='rt',encoding='latin-1'): """Open a file, including gzip files :param fname: The filename to open. Gzip files are distinguished by ending in '.gz' :param mode: File mode for opening [default 'r'] :returns: An open file handle Note: the gzip module in python is REALLY slow, so this function uses subprocess and command-line gzip instead. """ # gzip in python is REALLY slow, so use pipes instead. if(fname.endswith('.gz')): return gzip.open(fname,mode=mode,encoding=encoding) else: return open(fname,mode)
[docs]def sortVcfBySequence(vcf,seqnames,seqmap=None): """Sort a tabixed VCF file by sequence names >>> import vcf >>> v = vcf.Reader('vcf.gz') >>> from seqtools.utils import sortVcfBySequence >>> w = vcf.Writer(open('sorted.vcf','w'),v) >>> faifile = open('ucsc.hg19.fasta.fai') >>> seqnames = [x.split('\t')[0] for x in faifile] >>> for rec in sortVcfBySequence(v,seqnames): w.write_record(rec) >>> w.close() >>> v.close()""" if(seqmap): revseqmap = dict([(v,k) for (k,v) in list(seqmap.items())]) print(seqmap) for s in seqnames: # seqmap maps seqnames to tabix index names if(seqmap is not None): region = vcf.fetch(seqmap[s],start=0) else: try: region = vcf.fetch(s,start=0) print(region) except ValueError: print("here",s) continue print(region) print(s) for rec in region: if(seqmap is not None): rec.CHROM=revseqmap[rec.CHROM] yield rec