# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def parse_ctgs(bestedges, frgtoctg):
    # Build a lookup from read IIDs to the unitig ("ctg"/"deg" prefixed)
    # each read was placed in, using posmap-style files.
    # NOTE(review): this fragment is truncated -- the remainder of the
    # function (and any use of `bestedges` or `cache`) is outside this
    # chunk, and the original indentation was lost; nesting below is
    # reconstructed and should be confirmed against the full source.
    cache = "frgtoctg.cache"
    if need_update(frgtoctg, cache):
        reads_to_ctgs = {}
        # Companion files derived from the .frgctg name:
        # .frgdeg (degenerate placements) and .iidtouid (IID<->UID table)
        frgtodeg = frgtoctg.replace(".frgctg", ".frgdeg")
        iidtouid = frgtoctg.replace(".posmap.frgctg", ".iidtouid")
        fp = open(iidtouid)
        frgstore = {}
        for row in fp:
            tag, iid, uid = row.split()
            if tag == "FRG":  # only fragment (read) records are kept
                frgstore[uid] = int(iid)
        for pf, f in zip(("ctg", "deg"), (frgtoctg, frgtodeg)):
            fp = open(f)
            logging.debug("Parse posmap file `{0}`".format(f))
            for row in fp:
                frg, ctg = row.split()[:2]
                # Translate the read UID to its integer IID
                frg = frgstore[frg]
# NOTE(review): orphaned function body -- the enclosing `def` (which puts
# `p` and `args` in scope) is outside this chunk; indentation below is
# reconstructed from context and should be confirmed against the full file.
opts, args = p.parse_args(args)
if len(args) != 2:
    sys.exit(not p.print_help())
bedfile, fastafile = args
mindepth = opts.mindepth
# Make a bedgraph from the BED, then keep only regions at or above mindepth
bedgraph = make_bedgraph(bedfile)
bedgraphfiltered = bedgraph + ".d{0}".format(mindepth)
if need_update(bedgraph, bedgraphfiltered):
    filter([bedgraph, "--minaccn={0}".format(mindepth),
            "--outfile={0}".format(bedgraphfiltered)])
merged = bedgraphfiltered + ".merge.fasta"
if need_update(bedgraphfiltered, merged):
    # Merge adjacent/overlapping filtered intervals
    mergeBed(bedgraphfiltered, sorted=True)
# NOTE(review): orphaned fragment of a routine that prepares a Celera
# Assembler .frg file via convert-fasta-to-v2.pl; the enclosing `def`
# (providing `opts`, `fastafile`) and the command execution are outside
# this chunk. Indentation is reconstructed -- confirm against full source.
plate = op.basename(fastafile).split(".")[0]
mated = (opts.size != 0)  # size of 0 means an unmated library
mean, sv = get_mean_sv(opts.size)
if mated:
    libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
else:
    libname = plate
frgfile = libname + ".frg"
if opts.clean:
    # Canonicalize the FASTA first; downstream steps use the cleaned copy
    cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
    if need_update(fastafile, cleanfasta):
        clean([fastafile, "--canonical", "-o", cleanfasta])
    fastafile = cleanfasta
if mated:
    qualfile = make_qual(fastafile, score=21)
    if opts.matefile:
        matefile = opts.matefile
        assert op.exists(matefile)
    else:
        matefile = make_matepairs(fastafile)
cmd = "convert-fasta-to-v2.pl"
# NOTE(review): `qualfile` appears to be set only in the mated branch but
# is referenced unconditionally here -- verify against the full source.
cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
if mated:
    cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)
"""
from jcvi.formats.base import DictFile
from jcvi.apps.align import blast
p = OptionParser(sizes.__doc__)
opts, args = p.parse_args(args)
if len(args) != 3:
sys.exit(not p.print_help())
gapsbed, afasta, bfasta = args
pf = gapsbed.rsplit(".", 1)[0]
extbed = pf + ".ext.bed"
extfasta = pf + ".ext.fasta"
if need_update(gapsbed, extfasta):
extbed, extfasta = flanks([gapsbed, afasta])
q = op.basename(extfasta).split(".")[0]
r = op.basename(bfasta).split(".")[0]
blastfile = "{0}.{1}.blast".format(q, r)
if need_update([extfasta, bfasta], blastfile):
blastfile = blast([bfasta, extfasta, "--wordsize=50", "--pctid=98"])
labelsfile = blast_to_twobeds(blastfile)
labels = DictFile(labelsfile, delimiter='\t')
bed = Bed(gapsbed)
for b in bed:
b.score = b.span
accn = b.accn
print("\t".join((str(x) for x in (b.seqid, b.start - 1, b.end, accn,
p.set_home("cdhit", default="/usr/local/bin/")
p.set_home("fiona", default="/usr/local/bin/")
p.set_home("jellyfish", default="/usr/local/bin/")
p.set_cpus()
opts, args = p.parse_args(args)
if len(args) != 1:
sys.exit(not p.print_help())
fastqfile, = args
cpus = opts.cpus
depth = opts.depth
pf, sf = fastqfile.rsplit(".", 1)
diginormfile = pf + ".diginorm." + sf
if need_update(fastqfile, diginormfile):
diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
keepabund = fastqfile + ".keep.abundfilt"
sh("cp -s {0} {1}".format(keepabund, diginormfile))
jf = pf + "-K23.histogram"
if need_update(diginormfile, jf):
jellyfish([diginormfile, "--prefix={0}".format(pf),
"--cpus={0}".format(cpus),
"--jellyfish_home={0}".format(opts.jellyfish_home)])
genomesize = histogram([jf, pf, "23"])
fiona = pf + ".fiona.fa"
if need_update(diginormfile, fiona):
cmd = op.join(opts.fiona_home, "fiona")
cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
format(genomesize, cpus, opts.technology)
def check_aln(dbfile, readfile, cpus=32):
    """Run `bwa aln` of `readfile` against `dbfile`, writing a .sai file.

    The alignment is skipped (and an error is logged) when the .sai file
    is already up to date with respect to its inputs.  The name of the
    .sai file is returned in either case.
    """
    from jcvi.formats.fastq import guessoffset

    saifile = readfile.rsplit(".", 1)[0] + ".sai"
    # Guard clause: nothing to do when the output is current.
    if not need_update((dbfile, readfile), saifile):
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))
        return saifile

    offset = guessoffset([readfile])
    cmd = "bwa aln {0} {1} -t {2}".format(dbfile, readfile, cpus)
    if offset == 64:
        # Phred+64 quality encoding requires bwa's -I switch
        cmd += " -I"
    sh(cmd, outfile=saifile)
    return saifile
def rmdup_bedpe(filtered, rmdup, dupwiggle=10):
    # Remove duplicate BEDPE records: after a chrom/start sort, records
    # whose start1 falls within `dupwiggle` bp of an earlier record on the
    # same seqid1 are duplicate candidates.
    # NOTE(review): this fragment is truncated -- the actual duplicate
    # test on `b` and the write-out of retained records lie outside this
    # chunk; indentation below is reconstructed.
    sortedfiltered = filtered + ".sorted"
    if need_update(filtered, sortedfiltered):
        sh("sort -k1,1 -k2,2n -i {0} -o {1}".format(filtered, sortedfiltered))
    logging.debug("Rmdup criteria: wiggle <= {0}".format(dupwiggle))
    fp = must_open(sortedfiltered)
    fw = must_open(rmdup, "w")
    data = [BedpeLine(x) for x in fp]
    retained = total = 0
    # groupby relies on the file being pre-sorted by seqid1
    for seqid, ss in groupby(data, key=lambda x: x.seqid1):
        ss = list(ss)
        for i, a in enumerate(ss):
            if a.isdup:
                continue
            for b in ss[i + 1:]:
                # Records are start-sorted, so once b is beyond the
                # wiggle window no later record can be a duplicate of a
                if b.start1 > a.start1 + dupwiggle:
                    break
                if b.isdup:
cmd += " -@ {0}".format(cpus)
if opts.unique:
cmd += " -q 1"
if samfile.endswith(".sam") and need_update(samfile, bamfile):
sh(cmd)
# Already sorted?
if bamfile.endswith(".sorted.bam"):
sortedbamfile = bamfile
else:
prefix = bamfile.replace(".bam", "")
sortedbamfile = prefix + ".sorted.bam"
if need_update(bamfile, sortedbamfile):
cmd = "samtools sort {0} -o {1}".format(bamfile, sortedbamfile)
cmd += " -@ {0}".format(cpus)
sh(cmd)
baifile = sortedbamfile + ".bai"
if need_update(sortedbamfile, baifile):
sh("samtools index {0}".format(sortedbamfile))
return sortedbamfile
# NOTE(review): orphaned fragment of a deletion/IES-calling routine; the
# loop header that defines `ies_id`, `count`, `seqid`, `start`, `end`,
# `fw`, `pf`, and the file names, plus the flanks step after the last
# line, are outside this chunk. Indentation is reconstructed.
ies_name = "{0:05d}-r{1}".format(ies_id, count)
if count < opts.mindepth:
    continue  # drop events supported by too few reads
print("\t".join(str(x) for x in \
    (seqid, start - 1, end, ies_name)), file=fw)
ies_id += 1
fw.close()
sort([countbedfile, "-i", sort_tmpdir])
# Remove deletions that contain some read depth
depthbedfile = pf + ".depth.bed"
if need_update((sortedbedfile, countbedfile), depthbedfile):
    depth([sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)])
validbedfile = pf + ".valid.bed"
if need_update(depthbedfile, validbedfile):
    fw = open(validbedfile, "w")
    logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
    bed = Bed(depthbedfile)
    all_scores = [float(b.score) for b in bed]
    # Only the upper bound is used; unusually deep regions are rejected
    lb, ub = outlier_cutoff(all_scores)
    logging.debug("Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(lb, ub))
    for b in bed:
        if float(b.score) > ub:
            continue
        print(b, file=fw)
    fw.close()
# Remove deletions that contain sequencing gaps on its flanks
selectedbedfile = pf + ".selected.bed"
if need_update(validbedfile, selectedbedfile):
    flanksbedfile = pf + ".flanks.bed"
def __init__(self, bedfile, sizesfile):
    """Sort `bedfile`, derive a bedgraph coverage file from it with
    genomeCoverageBed (rebuilt only when stale), then initialize the
    parent class on that coverage file.
    """
    bedfile = sort([bedfile])
    coveragefile = bedfile + ".coverage"
    if need_update(bedfile, coveragefile):
        cmd = "genomeCoverageBed -bg -i {0} -g {1}".format(bedfile, sizesfile)
        sh(cmd, outfile=coveragefile)
    # Chromosome sizes, keyed by sequence name
    self.sizes = Sizes(sizesfile).mapping
    assert coveragefile.endswith(".coverage")
    super(Coverage, self).__init__(coveragefile)