Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def make_ortholog(blocksfile, rbhfile, orthofile):
from jcvi.formats.base import DictFile
# Generate mapping both ways
adict = DictFile(rbhfile)
bdict = DictFile(rbhfile, keypos=1, valuepos=0)
adict.update(bdict)
fp = open(blocksfile)
fw = open(orthofile, "w")
nrecruited = 0
for row in fp:
a, b = row.split()
if b == '.':
if a in adict:
b = adict[a]
nrecruited += 1
b += "'"
print("\t".join((a, b)), file=fw)
logging.debug("Recruited {0} pairs from RBH.".format(nrecruited))
noversion = opts.noversion
sequential = opts.sequential
sequentialoffset = opts.sequentialoffset
sep = opts.sep
idx = opts.index
mapfile = opts.switch
annotfile = opts.annotation
desc = not opts.nodesc
idsfile = opts.ids
idsfile = open(idsfile, "w") if idsfile else None
upper = opts.upper
if mapfile:
mapping = DictFile(mapfile, delimiter="\t")
if annotfile:
annotation = DictFile(annotfile, delimiter="\t")
fp = SeqIO.parse(must_open(infasta), "fasta")
fw = must_open(outfasta, "w")
for i, rec in enumerate(fp):
origid = rec.id
description = rec.description.replace(origid, "").strip()
if sep:
rec.id = rec.description.split(sep)[idx].strip()
if gb:
# gi|262233616|gb|GU123895.1| Coffea arabica clone BAC
atoms = rec.id.split("|")
if len(atoms) >= 3:
rec.id = atoms[3]
elif len(atoms) == 2:
rec.id = atoms[1]
if pairs:
"""
from jcvi.formats.base import DictFile
p = OptionParser(annotation.__doc__)
p.add_option("--queryids", help="Query IDS file to switch [default: %default]")
p.add_option("--subjectids", help="Subject IDS file to switch [default: %default]")
opts, args = p.parse_args(args)
if len(args) != 1:
sys.exit(not p.print_help())
blastfile, = args
d = "\t"
qids = DictFile(opts.queryids, delimiter=d) if opts.queryids else None
sids = DictFile(opts.subjectids, delimiter=d) if opts.subjectids else None
blast = Blast(blastfile)
for b in blast:
query, subject = b.query, b.subject
if qids:
query = qids[query]
if sids:
subject = sids[subject]
print("\t".join((query, subject)))
def rename(args):
"""
%prog rename in.gff3 switch.ids > reindexed.gff3
Change the IDs within the gff3.
"""
p = OptionParser(rename.__doc__)
opts, args = p.parse_args(args)
if len(args) != 2:
sys.exit(not p.print_help())
ingff3, switch = args
switch = DictFile(switch)
gff = Gff(ingff3)
for g in gff:
id, = g.attributes["ID"]
newname = switch.get(id, id)
g.attributes["ID"] = [newname]
if "Parent" in g.attributes:
parents = g.attributes["Parent"]
g.attributes["Parent"] = [switch.get(x, x) for x in parents]
g.update_attributes()
print(g)
invent_name_attr = opts.invent_name_attr
invent_protein_feat = opts.invent_protein_feat
compute_signature = False
outfile = opts.outfile
mapping = None
mod_attrs = set()
if mapfile and op.isfile(mapfile):
mapping = DictFile(mapfile, delimiter="\t", strict=strict)
mod_attrs.add("ID")
if note:
note = DictFile(note, delimiter="\t", strict=strict)
mod_attrs.add("Note")
if source and op.isfile(source):
source = DictFile(source, delimiter="\t", strict=strict)
if ftype and op.isfile(ftype):
ftype = DictFile(ftype, delimiter="\t", strict=strict)
if names:
names = DictFile(names, delimiter="\t", strict=strict)
mod_attrs.add("Name")
if attrib_files:
attr_values = {}
for fn in attrib_files:
attr_name = op.basename(fn).rsplit(".", 1)[0]
if attr_name not in reserved_gff_attributes:
attr_name = attr_name.lower()
attr_values[attr_name] = DictFile(fn, delimiter="\t", strict=strict)
mod_attrs.add(attr_name)
if dbxref_files:
dbxref_values = {}
certificates based on agpfile. At first, it seems counter-productive to
convert first agp to certificates then certificates back to agp.
The certificates provide a way to edit the overlap information, so that the
agpfile can be corrected (without changing agpfile directly).
"""
from jcvi.formats.base import DictFile
p = OptionParser(agp.__doc__)
opts, args = p.parse_args(args)
if len(args) != 3:
sys.exit(not p.print_help())
tpffile, certificatefile, agpfile = args
orientationguide = DictFile(tpffile, valuepos=2)
cert = Certificate(certificatefile)
cert.write_AGP(agpfile, orientationguide=orientationguide)
JCVI.Medtr.v4.fasta -o features.fasta
"""
from jcvi.formats.fasta import longestorf
p = OptionParser(orient.__doc__)
opts, args = p.parse_args(args)
if len(args) != 2:
sys.exit(not p.print_help())
ingff3, fastafile = args
idsfile = fastafile.rsplit(".", 1)[0] + ".orf.ids"
if need_update(fastafile, idsfile):
longestorf([fastafile, "--ids"])
orientations = DictFile(idsfile)
gff = Gff(ingff3)
flipped = 0
for g in gff:
id = None
for tag in ("ID", "Parent"):
if tag in g.attributes:
id, = g.attributes[tag]
break
assert id
orientation = orientations.get(id, "+")
if orientation == '-':
g.strand = {"+": "-", "-": "+"}[g.strand]
flipped += 1
print(g)
num_leaves = len(t.get_leaf_names())
yinterval = canvas / (num_leaves + 1)
# get exons structures, if any
structures = {}
if gffdir:
gffiles = glob("{0}/*.gff*".format(gffdir))
setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True)
structures = dict((a, (b, c)) for a, b, c in setups)
if sizes:
sizes = Sizes(sizes).mapping
if barcodefile:
barcodemap = DictFile(barcodefile, delimiter="\t")
if leafcolorfile:
leafcolors = DictFile(leafcolorfile, delimiter="\t")
coords = {}
i = 0
for n in t.traverse("postorder"):
dist = n.get_distance(t)
xx = xstart + scale * dist
if n.is_leaf():
yy = ystart - i * yinterval
i += 1
if trunc_name:
name = truncate_name(n.name, rule=trunc_name)
gb = opts.groupby
g = make_index(gff_file)
tf = "transcript.sizes"
if need_update(gff_file, tf):
fw = open(tf, "w")
for feat in g.features_of_type("mRNA"):
fid = feat.id
conf_class = feat.attributes.get(gb, "all")
tsize = sum((c.stop - c.start + 1) for c in g.children(fid, 1) \
if c.featuretype == "exon")
print("\t".join((fid, str(tsize), conf_class)), file=fw)
fw.close()
tsizes = DictFile(tf, cast=int)
conf_classes = DictFile(tf, valuepos=2)
logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))
genes = []
for feat in g.features_of_type("gene"):
fid = feat.id
transcripts = [c.id for c in g.children(fid, 1) \
if c.featuretype == "mRNA"]
transcript_sizes = [tsizes[x] for x in transcripts]
exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2) \
if c.featuretype == "exon")
conf_class = conf_classes[transcripts[0]]
gs = GeneStats(feat, conf_class, transcript_sizes, exons)
genes.append(gs)
r = {} # Report
distinct_groups = set(conf_classes.values())