def _quarantine_sample(self, sample):
"""Add sample name and pileup width to a list."""
# Note: the below assumes we haven't split a pileup on minor positions.
# This should be the case: chunking on minor positions only occurs for
# larger regions.
start, _ = sample.first_pos
end, _ = sample.last_pos
end += 1 # end exclusive
self._quarantined.append((
medaka.common.Region(sample.ref_name, start, end), sample.size
))
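For orientation, a minimal sketch of how the quarantined entries might be consumed later, assuming `self._quarantined` is initialised to an empty list elsewhere in the class; `_flush_quarantine` is a hypothetical helper, not part of the original code:

def _flush_quarantine(self):
    # Hypothetical: yield quarantined regions, widest pileups first,
    # then reset the list.
    for region, size in sorted(self._quarantined, key=lambda t: -t[1]):
        yield region
    self._quarantined = []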
if samples is not None:
# yield samples in the order they are asked for
    for sample, fname in samples:
        with DataStore(fname) as store:
            yield store.load_sample(sample)
else:
all_samples = self.index
if regions is None:
regions = [
medaka.common.Region.from_string(x)
for x in sorted(all_samples)]
for reg in regions:
if reg.ref_name not in self.index:
continue
for sample in self.index[reg.ref_name]:
# samples can have major.minor coords, round to end excl.
sam_reg = medaka.common.Region(
sample['ref_name'],
int(float(sample['start'])),
int(float(sample['end'])) + 1)
if sam_reg.overlaps(reg):
with DataStore(sample['filename']) as store:
yield store.load_sample(sample['sample_key'])
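An illustrative call of the generator above, assuming it is a method (named `yield_from_feature_files` here purely for illustration) on an index object whose `self.index` maps reference names to sample records; `index` and `process` are hypothetical:

import medaka.common

region = medaka.common.Region.from_string('contig1:0-10000')
for sample in index.yield_from_feature_files(regions=[region]):
    process(sample)  # e.g. feed the sample to a consensus network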
# :returns: list of `Region` objects.
with pysam.AlignmentFile(bam) as bam_fh:
ref_lengths = dict(zip(bam_fh.references, bam_fh.lengths))
if region_strs is not None:
if os.path.isfile(region_strs[0]):
with open(region_strs[0]) as fh:
            region_strs = [line.strip() for line in fh]
regions = []
for r in parse_regions(region_strs):
start = r.start if r.start is not None else 0
end = r.end if r.end is not None else ref_lengths[r.ref_name]
regions.append(Region(r.ref_name, start, end))
else:
regions = [Region(ref_name, 0, end) for ref_name, end in ref_lengths.items()]
return regions
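A usage sketch for the function above (the signature is inferred from the body as `get_regions(bam, region_strs=None)`; file names are hypothetical):

# All references in the BAM header, end to end:
regions = get_regions('calls_to_draft.bam')
# Explicit regions; missing coordinates are filled from the BAM header:
regions = get_regions('calls_to_draft.bam', region_strs=['contig1:100-2000', 'contig2'])
# region_strs may also name a file containing one region string per line:
regions = get_regions('calls_to_draft.bam', region_strs=['regions.txt'])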
"""
# Filter truth alignments to restrict ourselves to regions of the
# reference where the truth is unambiguous.
alignments = TruthAlignment.bam_to_alignments(truth_bam, region.ref_name, start=region.start, end=region.end)
filtered_alignments = TruthAlignment.filter_alignments(alignments, start=region.start, end=region.end)
if len(filtered_alignments) == 0:
logging.info("Filtering removed all alignments of truth to ref from {}.".format(region))
samples = []
for aln in filtered_alignments:
mock_compr = self.max_hp_len > 1 and not self.is_compressed
truth_chunk = aln.get_positions_and_labels(ref_compr_rle=ref_rle_fq, mock_compr=mock_compr,
is_compressed=self.is_compressed, rle_dtype=True)
sample = self.bam_to_sample(bam, Region(region.ref_name, aln.start, aln.end), ref_rle_fq, read_fraction=read_fraction)
# Create labels according to positions in pileup
pad = (encoding[_gap_], 1) if len(truth_chunk.reads[0].dtype) > 0 else encoding[_gap_]
padder = itertools.repeat(pad)
    position_to_label = defaultdict(
        padder.__next__,
        zip((tuple(p) for p in truth_chunk.positions), truth_chunk.reads[0]))
padded_labels = np.fromiter((position_to_label[tuple(p)] for p in sample.positions),
dtype=truth_chunk.reads[0].dtype, count=len(sample.positions))
sample = sample._asdict()
sample['labels'] = padded_labels
samples.append(Sample(**sample))
return tuple(samples)
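The label-padding idiom above is worth seeing in isolation: a `defaultdict` whose factory is an `itertools.repeat(...).__next__` pads any pileup position missing from the truth with the gap label. A toy, self-contained sketch (values are made up; `gap` stands in for `encoding[_gap_]`):

import itertools
from collections import defaultdict
import numpy as np

gap = 0  # stand-in for encoding[_gap_]
truth_positions = [(0, 0), (1, 0), (1, 1)]  # (major, minor) coordinates
truth_labels = [1, 2, 3]
padder = itertools.repeat(gap)
position_to_label = defaultdict(padder.__next__,
                                zip(truth_positions, truth_labels))
pileup_positions = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0)]
labels = np.fromiter((position_to_label[p] for p in pileup_positions),
                     dtype=int, count=len(pileup_positions))
# labels -> array([1, 0, 2, 3, 0]): unseen positions pad to the gap label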
"""
with pysam.AlignmentFile(bam) as bam_fh:
ref_lengths = dict(zip(bam_fh.references, bam_fh.lengths))
if region_strs is not None:
if os.path.isfile(region_strs[0]):
with open(region_strs[0]) as fh:
region_strs = [line.strip() for line in fh.readlines()]
regions = []
for r in (Region.from_string(x) for x in region_strs):
start = r.start if r.start is not None else 0
end = r.end if r.end is not None else ref_lengths[r.ref_name]
regions.append(Region(r.ref_name, start, end))
else:
regions = [
Region(ref_name, 0, end)
for ref_name, end in ref_lengths.items()]
return regions
def get_homo_regions(ref_name, pos, min_len=1000):
    """Return (regions, total length) for gaps of at least `min_len`
    between consecutive heterozygous positions."""
    regions = []
    hetero_gaps = np.ediff1d(pos)
    sort_inds = np.argsort(hetero_gaps)[::-1]  # widest gaps first
    homo_len = 0
for i in sort_inds:
if hetero_gaps[i] < min_len:
break
homo_len += hetero_gaps[i]
start = pos[i]
end = start + hetero_gaps[i]
regions.append(medaka.common.Region(ref_name, start, end))
return regions, homo_len
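A quick worked example for `get_homo_regions`: with positions [0, 5000, 5100, 9000] the gaps are [5000, 100, 3900], of which two meet `min_len`:

import numpy as np

pos = np.array([0, 5000, 5100, 9000])
regions, homo_len = get_homo_regions('contig1', pos, min_len=1000)
# regions -> contig1:0-5000 and contig1:5100-9000
# homo_len -> 8900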
if args.model is not None:
    fe_kwargs = load_yaml_data(args.model, _feature_opt_path_)
else:
    fe_kwargs = {
        k: v.default
        for k, v in inspect.signature(FeatureEncoder.__init__).parameters.items()
        if v.default is not inspect.Parameter.empty}
opt_str = '\n'.join('{}: {}'.format(k, v) for k, v in fe_kwargs.items())
logging.info('FeatureEncoder options:\n{}'.format(opt_str))
fe = FeatureEncoder(**fe_kwargs)
reg_str = '\n'.join(['\t\t\t{ref_name}:{start}-{end}'.format(**r._asdict()) for r in regions])
logging.info('Got regions:\n{}'.format(reg_str))
chunked_regions = [Region(region.ref_name, start, end)
for region in regions
for start, end in segment_limits(region.start, region.end,
segment_len=10*args.chunk_len,
overlap_len=5*args.chunk_ovlp)]
unique_refs = set(region.ref_name for region in regions)
ref_rles = {r: fe.process_ref_seq(r, args.ref_fastq) for r in unique_refs}
rg = partial(fe.bams_to_training_samples, args.truth, args.bam, read_fraction=args.read_fraction)
chunker = partial(chunk_samples, chunk_len=args.chunk_len, overlap=args.chunk_ovlp)
label_encoding, label_decoding = get_label_encoding(args.max_label_len)
s2xy = partial(sample_to_x_y, encoding=label_encoding, max_label_len=args.max_label_len)
f = partial(_process_region, rg, alphabet_filter, chunker, s2xy)
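The fragment ends with the per-region pipeline composed into `f`; what typically follows (not shown here) is mapping `f` over `chunked_regions`, e.g. in a process pool. A hedged sketch, assuming `f(region)` returns the training chunks for one region:

from concurrent.futures import ProcessPoolExecutor

with ProcessPoolExecutor() as executor:
    for chunks in executor.map(f, chunked_regions):
        for chunk in chunks:
            pass  # e.g. write (x, y) arrays to a training file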