Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""Get processed truth alignments.
:param truth_bam: (sorted indexed) bam with true sequence
aligned to reference
:param region: `medaka.common.Region` obj,
(all alignments with any overlap with the interval
start:end will be retrieved)
:param haplotag: bam tag specifying which haplotype the alignment
belongs to (for polyploid labels)
:param min_length: minimum length for valid alignments.
:returns: list of tuples where each tuple contains `TruthAlignment`
for each haplotype trimmed to common genomic window.
"""
algns = TruthAlignment._load_alignments(truth_bam, region, haplotag)
# filter truth alignments to restrict ourselves to regions of the
# ref where the truth is unambiguous in each haplotype
algns = {
h: TruthAlignment._filter_alignments(
h_algns, region=region, min_length=min_length)
for h, h_algns in algns.items()}
# Group truth alignments from the haplotypes by genomic window and trim
# to common genomic range
if len(algns) == 0:
return []
else:
grouped = TruthAlignment._group_and_trim_by_haplotype(algns)
return grouped
the reference will be parsed.
:param chunk_len: int, chunk length in reference coordinates.
:param overlap: int, overlap of adjacent chunks in reference coordinates.
:yields: tuple of `Pileup` objects (one item for each input bam) for each
chunk.
.. note:: Chunks might be missing if `truth_bam` is provided and
regions with multiple mappings were encountered.
"""
tview = MultiTView(bams, ref_fasta, columns=chunk_len)
# filter truth alignments to restrict ourselves to regions of the ref where the truth
# in unambiguous
alignments = TruthAlignment.bam_to_alignments(truth_bam, ref_name, start=start, end=end)
filtered_alignments = TruthAlignment.filter_alignments(alignments, start=start, end=end)
if len(filtered_alignments) == 0:
raise RuntimeError("Filtering removed all alignments of truth to ref, cannot continue.")
for aln in filtered_alignments:
msg = "Chunking {}: {}-{} in chunks of {} overlapping by {}"
logger.info(msg.format(ref_name, aln.start, aln.end, chunk_len, overlap))
for chunk_start, chunk_end in segment_limits(aln.start, aln.end, segment_len=chunk_len, overlap_len=overlap):
truth_chunk = aln.get_positions_and_labels(start=chunk_start, end=chunk_end)
try:
pileups, ref_seq = tview.pileup(ref_name, chunk_start, chunk_end)
except TViewException:
logger.info("Skipping region {}-{} as TView did not find any reads".format(chunk_start, chunk_end))
continue
:param truth_bam: .bam file of truth aligned to ref to generate labels.
:param bam: input .bam file.
:param region: `medaka.common.Region` instance for region to process.
:param label_scheme: a `LabelScheme` describing network outputs.
:param truth_haplotag: two letter tag name used for grouping truth
labels by haplotype.
:param min_length: minimum length for valid alignments.
:returns: tuple of `medaka.common.Sample` objects.
.. note:: Chunks might be missing if `truth_bam` is provided and
regions with multiple mappings were encountered.
"""
# Find truth alignments (with some filtering).
alns = medaka.labels.TruthAlignment.bam_to_alignments(
truth_bam, region, haplotag=truth_haplotag,
min_length=min_length)
if len(alns) == 0:
self.logger.info(
"Filtering and grouping removed all alignments "
"of truth to ref from {}.".format(region))
samples = []
for aln in alns:
# get labels from truth alignments.
truth_pos, truth_labels = label_scheme.encode(aln)
# get features from read alignment data
aln_samples = self.bam_to_sample(bam, medaka.common.Region(
region.ref_name, aln[0].start, aln[0].end))
:param chunk_len: int, chunk length in reference coordinates.
:param overlap: int, overlap of adjacent chunks in reference coordinates.
:yields: tuple of `Pileup` objects (one item for each input bam) for each
chunk.
.. note:: Chunks might be missing if `truth_bam` is provided and
regions with multiple mappings were encountered.
"""
tview = MultiTView(bams, ref_fasta, columns=chunk_len)
# filter truth alignments to restrict ourselves to regions of the ref where the truth
# in unambiguous
alignments = TruthAlignment.bam_to_alignments(truth_bam, ref_name, start=start, end=end)
filtered_alignments = TruthAlignment.filter_alignments(alignments, start=start, end=end)
if len(filtered_alignments) == 0:
raise RuntimeError("Filtering removed all alignments of truth to ref, cannot continue.")
for aln in filtered_alignments:
msg = "Chunking {}: {}-{} in chunks of {} overlapping by {}"
logger.info(msg.format(ref_name, aln.start, aln.end, chunk_len, overlap))
for chunk_start, chunk_end in segment_limits(aln.start, aln.end, segment_len=chunk_len, overlap_len=overlap):
truth_chunk = aln.get_positions_and_labels(start=chunk_start, end=chunk_end)
try:
pileups, ref_seq = tview.pileup(ref_name, chunk_start, chunk_end)
except TViewException:
logger.info("Skipping region {}-{} as TView did not find any reads".format(chunk_start, chunk_end))
continue
assert truth_chunk.positions[0]['major'] == pileups[0].positions[0]['major']
belongs to (for polyploid labels)
:returns: {haplotype: [`TruthAlignment`]}
"""
alignments = collections.defaultdict(list)
with pysam.AlignmentFile(truth_bam, 'rb') as bamfile:
aln_reads = bamfile.fetch(
reference=region.ref_name,
start=region.start, end=region.end)
for r in aln_reads:
if (r.is_unmapped or r.is_secondary):
continue
else:
hap = r.get_tag(haplotag) if haplotag is not None else None
alignments[hap].append(TruthAlignment(r))
logger = medaka.common.get_named_logger("TruthAlign")
for hap in alignments.keys():
alignments[hap].sort(key=attrgetter('start'))
logger.info("Retrieved {} alignments for haplotype {}.".format(
len(alignments[hap]), hap))
return alignments
:param chunk_len: int, chunk length in reference coordinates.
:param overlap: int, overlap of adjacent chunks in reference
coordinates.
:returns: tuple of `Sample` objects (one item for each input bam) for
each chunk.
.. note:: Chunks might be missing if `truth_bam` is provided and
regions with multiple mappings were encountered.
"""
# filter truth alignments to restrict ourselves to regions of the ref where the truth
# in unambiguous
alignments = TruthAlignment.bam_to_alignments(truth_bam, region.ref_name, start=region.start, end=region.end)
filtered_alignments = TruthAlignment.filter_alignments(alignments, start=region.start, end=region.end)
if len(filtered_alignments) == 0:
logging.info("Filtering removed all alignments of truth to ref from {}.".format(region))
samples = []
for aln in filtered_alignments:
mock_compr = self.max_hp_len > 1 and not self.is_compressed
truth_chunk = aln.get_positions_and_labels(ref_compr_rle=ref_rle_fq, mock_compr=mock_compr,
is_compressed=self.is_compressed, rle_dtype=True)
sample = self.bam_to_sample(bam, Region(region.ref_name, aln.start, aln.end), ref_rle_fq, read_fraction=read_fraction)
# Create labels according to positions in pileup
pad = (encoding[_gap_], 1) if len(truth_chunk.reads[0].dtype) > 0 else encoding[_gap_]
padder = itertools.repeat(pad)
position_to_label = defaultdict(padder.__next__,
zip([tuple(p) for p in truth_chunk.positions],
[a for a in truth_chunk.reads[0]]))
padded_labels = np.fromiter((position_to_label[tuple(p)] for p in sample.positions),
for each haplotype trimmed to common genomic window.
"""
algns = TruthAlignment._load_alignments(truth_bam, region, haplotag)
# filter truth alignments to restrict ourselves to regions of the
# ref where the truth is unambiguous in each haplotype
algns = {
h: TruthAlignment._filter_alignments(
h_algns, region=region, min_length=min_length)
for h, h_algns in algns.items()}
# Group truth alignments from the haplotypes by genomic window and trim
# to common genomic range
if len(algns) == 0:
return []
else:
grouped = TruthAlignment._group_and_trim_by_haplotype(algns)
return grouped
of `get_runs_from_fastq`
:param chunk_len: int, chunk length in reference coordinates.
:param overlap: int, overlap of adjacent chunks in reference
coordinates.
:returns: tuple of `Sample` objects (one item for each input bam) for
each chunk.
.. note:: Chunks might be missing if `truth_bam` is provided and
regions with multiple mappings were encountered.
"""
# filter truth alignments to restrict ourselves to regions of the ref where the truth
# in unambiguous
alignments = TruthAlignment.bam_to_alignments(truth_bam, region.ref_name, start=region.start, end=region.end)
filtered_alignments = TruthAlignment.filter_alignments(alignments, start=region.start, end=region.end)
if len(filtered_alignments) == 0:
logging.info("Filtering removed all alignments of truth to ref from {}.".format(region))
samples = []
for aln in filtered_alignments:
mock_compr = self.max_hp_len > 1 and not self.is_compressed
truth_chunk = aln.get_positions_and_labels(ref_compr_rle=ref_rle_fq, mock_compr=mock_compr,
is_compressed=self.is_compressed, rle_dtype=True)
sample = self.bam_to_sample(bam, Region(region.ref_name, aln.start, aln.end), ref_rle_fq, read_fraction=read_fraction)
# Create labels according to positions in pileup
pad = (encoding[_gap_], 1) if len(truth_chunk.reads[0].dtype) > 0 else encoding[_gap_]
padder = itertools.repeat(pad)
position_to_label = defaultdict(padder.__next__,
zip([tuple(p) for p in truth_chunk.positions],
[a for a in truth_chunk.reads[0]]))