Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def filter_bins(self, options):
"""Filter bins command"""
make_sure_path_exists(options.output_dir)
genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
if not self._check_nuclotide_seqs(genome_files):
self.logger.warning('All files must contain nucleotide sequences.')
sys.exit()
outliers = Outliers()
for genome_file in genome_files:
gf = remove_extension(genome_file) + '.filtered.' + options.genome_ext
out_genome = os.path.join(options.output_dir, gf)
outliers.remove_outliers(genome_file, options.filter_file, out_genome, options.modified_only)
self.logger.info('Modified genome written to: ' + options.output_dir)
sys.exit()
if options.outlier_file and options.compatible_file:
self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
sys.exit()
failed_to_add = []
failed_to_remove = []
if options.add or options.remove:
failed_to_add, failed_to_remove = genome_tk.modify(options.genome_file,
options.scaffold_file,
options.add,
options.remove,
options.output_genome)
elif options.outlier_file:
outliers = Outliers()
outliers.remove_outliers(options.genome_file,
options.outlier_file,
options.output_genome,
False)
elif options.compatible_file:
outliers = Outliers()
if options.unique_only:
outliers.add_compatible_unique(options.scaffold_file,
options.genome_file,
options.compatible_file,
options.min_len,
options.output_genome)
elif options.closest_only:
outliers.add_compatible_closest(options.scaffold_file,
options.genome_file,
options.compatible_file,
signature_file : str
Name of file to read.
Returns
-------
dict : d[seq_id] -> tetranucleotide signature in canonical order
Count of each kmer.
"""
try:
sig = {}
with open(signature_file) as f:
header = f.readline().split('\t')
kmer_order = [x.strip().upper() for x in header[1:]]
if len(kmer_order) != len(self.canonical_order()):
raise ParsingError("[Error] Tetranucleotide file must contain exactly {:,} tetranucleotide columns.".format(len(self.canonical_order())))
canonical_order_index = np.argsort(kmer_order)
canonical_order = [kmer_order[i] for i in canonical_order_index]
if canonical_order != self.canonical_order():
raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)
for line in f:
line_split = line.split('\t')
sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]
return sig
except IOError:
print('[Error] Failed to open signature file: %s' % signature_file)
sys.exit(-1)
except ParsingError:
"""Read statistics for scaffolds.
Parameters
----------
stats_file : str
File with statistics for individual scaffolds.
"""
try:
sig = {}
self.genome_ids = set()
with open(stats_file) as f:
header = f.readline().split('\t')
if 'AAAA' not in header:
raise ParsingError("[Error] Statistics file is missing tetranucleotide signature data: %s" % stats_file)
tetra_index = header.index('AAAA')
self.signature_headers = [x.strip() for x in header[tetra_index:]]
self.coverage_headers = [x.strip() for x in header[4:tetra_index]]
self.scaffolds_in_genome = defaultdict(set)
self.stats = {}
for line in f:
line_split = line.split('\t')
scaffold_id = line_split[0]
genome_id = line_split[1]
gc = float(line_split[2])
scaffold_len = int(line_split[3])
coverage = []
for cov in line_split[4:tetra_index]:
for line in f:
line_split = line.split('\t')
scaffold_id = line_split[0]
scaffold_len = int(line_split[1])
length[scaffold_id] = scaffold_len
for i, cov in enumerate(line_split[2:]):
coverage[scaffold_id][bam_ids[i]] = float(cov)
except IOError:
self.logger.error('Failed to open signature file: %s' % coverage_file)
sys.exit(-1)
except:
print(traceback.format_exc())
print('')
raise ParsingError("[Error] Failed to process coverage file: " + coverage_file)
sys.exit(-1)
return coverage, length
coverage.append(float(cov))
signature = []
for freq in line_split[tetra_index:]:
signature.append(float(freq))
self.stats[scaffold_id] = self.ScaffoldStats(genome_id, gc, scaffold_len, coverage, signature)
if genome_id != self.unbinned:
self.scaffolds_in_genome[genome_id].add(scaffold_id)
return sig
except IOError:
print('[Error] Failed to open scaffold statistics file: %s' % stats_file)
sys.exit(-1)
except ParsingError:
sys.exit(-1)
Count of each kmer.
"""
try:
sig = {}
with open(signature_file) as f:
header = f.readline().split('\t')
kmer_order = [x.strip().upper() for x in header[1:]]
if len(kmer_order) != len(self.canonical_order()):
raise ParsingError("[Error] Tetranucleotide file must contain exactly {:,} tetranucleotide columns.".format(len(self.canonical_order())))
canonical_order_index = np.argsort(kmer_order)
canonical_order = [kmer_order[i] for i in canonical_order_index]
if canonical_order != self.canonical_order():
raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)
for line in f:
line_split = line.split('\t')
sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]
return sig
except IOError:
print('[Error] Failed to open signature file: %s' % signature_file)
sys.exit(-1)
except ParsingError:
sys.exit(-1)
def split(self, options):
"""Split command"""
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.genome_file)
make_sure_path_exists(options.output_dir)
self.logger.info('Reading scaffold statistics.')
scaffold_stats = ScaffoldStats()
scaffold_stats.read(options.scaffold_stats_file)
cluster = Cluster(1)
cluster.split(scaffold_stats,
options.criteria1,
options.criteria2,
options.genome_file,
options.output_dir)
self.logger.info('Partitioned sequences written to: ' + options.output_dir)
def taxon_profile(self, options):
"""Call genes command"""
make_sure_path_exists(options.output_dir)
check_file_exists(options.scaffold_stats_file)
check_file_exists(options.taxonomy_file)
check_file_exists(options.db_file)
gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
if not self._check_protein_seqs(gene_files):
self.logger.warning('All files must contain amino acid sequences.')
sys.exit()
# build gene profile
taxon_profile = TaxonProfile(options.cpus, options.output_dir)
taxon_profile.run(gene_files,
options.scaffold_stats_file,
options.db_file,
options.taxonomy_file,
options.per_to_classify,
options.evalue,
options.per_identity,
options.per_aln_len,
options.tmpdir)
self.logger.info('Results written to: %s' % options.output_dir)
def reference(self, options):
"""Reference command"""
check_file_exists(options.scaffold_prot_file)
check_file_exists(options.scaffold_stats_file)
make_sure_path_exists(options.output_dir)
ref_gene_files = self._genome_files(options.ref_genome_prot_dir, options.protein_ext)
if not self._check_protein_seqs(ref_gene_files):
self.logger.warning('All files must contain amino acid sequences.')
sys.exit()
reference = Reference(options.cpus, options.output_dir)
reference_out = reference.run(options.scaffold_prot_file,
options.scaffold_stats_file,
ref_gene_files,
options.db_file,
options.evalue,
options.per_identity,
options.per_aln_len)
self.logger.info('Results written to: ' + reference_out)