Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
%prog merge folder1 ...
Consolidate split contents in the folders. The folders can be generated by
the split() process and several samples may be in separate fastq files. This
program merges them.
"""
p = OptionParser(merge.__doc__)
p.set_outdir(outdir="outdir")
opts, args = p.parse_args(args)
if len(args) < 1:
sys.exit(not p.print_help())
folders = args
outdir = opts.outdir
mkdir(outdir)
files = flatten(glob("{0}/*.*.fastq".format(x)) for x in folders)
files = list(files)
key = lambda x: op.basename(x).split(".")[0]
files.sort(key=key)
for id, fns in groupby(files, key=key):
fns = list(fns)
outfile = op.join(outdir, "{0}.fastq".format(id))
FileMerger(fns, outfile=outfile).merge(checkexists=True)
Assemble each BAC separately using newbler.
"""
from jcvi.formats.fasta import join
p = OptionParser(assemble.__doc__)
p.add_option("--overwrite", default=False, action="store_true",
help="Overwrite the separate BAC assembly [default: %default]")
opts, args = p.parse_args(args)
if len(args) != 1:
sys.exit(not p.print_help())
sffdir, = args
asmdir = "newbler"
fastadir = "fasta"
mkdir(asmdir, overwrite=opts.overwrite)
mkdir(fastadir, overwrite=opts.overwrite)
cmd = "runAssembly -cpu 8 -o {0} {1}"
for sffile in glob("{0}/*.sff".format(sffdir)):
pf = op.basename(sffile).split(".")[1]
pf = pf.lower()
outdir = op.join(asmdir, pf)
if op.exists(outdir):
logging.debug("`{0}` exists. Ignored.".format(outdir))
continue
acmd = cmd.format(outdir, sffile)
sh(acmd)
ctgfile = op.join(outdir, "454LargeContigs.fna")
if not op.exists(ctgfile): # newbler failure
logging.error("File `{0}` not found (newbler failure).".\
if len(args) < 1:
sys.exit(not p.print_help())
fastq = args
tag, tagj, taglj = "frag_reads", "jump_reads", "long_jump_reads"
ploidy = opts.ploidy
haploidify = opts.haploidify
suffix = opts.suffix
assert (not haploidify) or (haploidify and ploidy == '2')
prepare(["Unknown"] + fastq + ["--norun"])
datadir = opts.dir
mkdir(datadir)
fullpath = op.join(os.getcwd(), datadir)
nthreads = " NUM_THREADS={0}".format(opts.cpus)
phred64 = (guessoffset([args[0]]) == 64)
orig = datadir + "/{0}_orig".format(tag)
origfastb = orig + ".fastb"
if need_update(fastq, origfastb):
cmd = "PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}' PLOIDY={2}".\
format(fullpath, opts.cpus, ploidy)
if phred64:
cmd += " PHRED_64=True"
sh(cmd)
if op.exists(origfastb):
correct_frag(datadir, tag, origfastb, nthreads, dedup=opts.fragsdedup,
haploidify=haploidify, suffix=suffix)
def error(args):
"""
%prog error version backup_folder
Find all errors in ../5-consensus/*.err and pull the error unitigs into
backup/ folder.
"""
p = OptionParser(error.__doc__)
opts, args = p.parse_args(args)
if len(args) != 2:
sys.exit(not p.print_help())
version, backup_folder = args
mkdir(backup_folder)
fw = open("errors.log", "w")
seen = set()
for g in glob("../5-consensus/*.err"):
if "partitioned" in g:
continue
fp = open(g)
partID = op.basename(g).rsplit(".err", 1)[0]
partID = int(partID.split("_")[-1])
for row in fp:
if row.startswith(working):
unitigID = row.split("(")[0].split()[-1]
continue
rid = list(Fasta(ctgfasta).iterkeys())
assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"
rid = rid[0]
splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)
# Run BLAST
blastfile = ctgfasta + ".blast"
run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)
# Extract contigs and merge using minimus2
closuredir = prefix + ".closure"
closure = False
if need_update(blastfile, closuredir):
mkdir(closuredir, overwrite=True)
closure = True
if closure:
idsfile = op.join(closuredir, prefix + ".ids")
cmd = "cut -f2 {0} | sort -u".format(blastfile)
sh(cmd, outfile=idsfile)
idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
sh(cmd)
# This step is a hack to weight the bases from original sequences more
# than the pulled sequences, by literally adding another copy to be used
# in consensus calls.
redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
format([ctgfasta, redundantfastafile, "--prefix=RED."])
def write_lst(bedfile):
pf = op.basename(bedfile).split(".")[0]
mkdir(pf)
bed = Bed(bedfile)
stanza = []
for seqid, bs in bed.sub_beds():
fname = op.join(pf, "{0}.lst".format(seqid))
fw = open(fname, "w")
for b in bs:
print("{0}{1}".format(b.accn.replace(" ", ""), b.strand), file=fw)
stanza.append((seqid, fname))
fw.close()
return pf, stanza
a2 = row[tred + ".2"]
fdp = row[tred + ".FDP"]
pdp = row[tred + ".PDP"]
pedp = row[tred + ".PEDP"]
dp[sk] = (a1, a2, fdp, pdp, pedp)
# Build a consolidated dataframe
ef = pd.DataFrame.from_dict(dp, orient="index")
ef.columns = [tred + ".1", tred + ".2", tred + ".FDP",
tred + ".PDP", tred + ".PEDP"]
ef.index.name = "SampleKey"
mf = df.merge(ef, how="right", left_index=True, right_index=True)
# Plot a bunch of figures
outdir = "output"
mkdir(outdir)
xlim = ylim = (0, 100)
draw_jointplot(outdir + "/A", "MeanCoverage", "HD.FDP",
data=mf, xlim=xlim, ylim=ylim, format=format)
draw_jointplot(outdir + "/B", "MeanCoverage", "HD.PDP",
data=mf, color='g', xlim=xlim, ylim=ylim, format=format)
draw_jointplot(outdir + "/C", "MeanCoverage", "HD.PEDP",
data=mf, color='m', xlim=xlim, ylim=ylim, format=format)
xlim = (0, 50)
draw_jointplot(outdir + "/D", "HD.2", "HD.FDP",
data=mf, xlim=xlim, ylim=ylim, format=format)
draw_jointplot(outdir + "/E", "HD.2", "HD.PDP",
data=mf, color='g', xlim=xlim, ylim=ylim, format=format)
draw_jointplot(outdir + "/F", "HD.2", "HD.PEDP",
data=mf, color='m', xlim=xlim, ylim=ylim, format=format)
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
"""
build maximum likelihood tree of DNA seqs with RAxML
"""
work_dir = op.join(work_dir, "work")
mkdir(work_dir)
phy_file = op.join(work_dir, "aln.phy")
AlignIO.write(alignment, file(phy_file, "w"), "phylip-relaxed")
raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
mkdir(raxml_work)
raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"), \
sequences=phy_file, algorithm="a", model="GTRGAMMA", \
parsimony_seed=12345, rapid_bootstrap_seed=12345, \
num_replicates=100, name="aln", \
working_dir=raxml_work, **kwargs)
logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
stdout, stderr = raxml_cl()
tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
if not op.exists(tree_file):
print("***RAxML failed.", file=sys.stderr)
sh("rm -rf %s" % raxml_work, log=False)
return None
sh("cp {0} {1}".format(tree_file, outfile), log=False)
if self.concurrency:
qsub += " -tc {0}".format(self.concurrency)
if self.name:
qsub += ' -N "{0}"'.format(self.name)
if self.hold_jid:
param = "-hold_jid_ad" if self.arr else "-hold_jid"
qsub += " {0} {1}".format(param, self.hold_jid)
if self.extra:
qsub += " {0}".format(self.extra)
# I/O
infile = self.infile
outfile = self.outfile
errfile = self.errfile
outdir = self.outdir
mkdir(outdir)
redirect_same = outfile and (outfile == errfile)
if infile:
qsub += " -i {0}".format(infile)
if outfile:
self.outfile = op.join(outdir, outfile)
qsub += " -o {0}".format(self.outfile)
if errfile:
if redirect_same:
qsub += " -j y"
else:
self.errfile = op.join(outdir, errfile)
qsub += " -e {0}".format(self.errfile)
cmd = " ".join((qsub, self.cmd))
return cmd
def slink(p, pf, tag, extra=None):
mkdir(pf, overwrite=True)
cwd = os.getcwd()
os.chdir(pf)
# Create sym-links for the input files
i = 1
for f in sorted(p):
gz = ".gz" if f.endswith(".gz") else ""
if "PE-0" in f:
sh("ln -sf ../{0} PE-0.fastq{1}".format(f, gz))
continue
for t in tag:
sh("ln -sf ../{0} {1}.{2}.fastq{3}".format(f, t, i, gz))
i += 1
if extra:
for e in extra: