Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def process_vcf(self, inputfile):
"""
Main function for parsing VCF
"""
# initialize reference genome
fasta_reader = Fasta(self.args.fastafile, read_ahead=1000000)
# initialize vcf reader
if self.args.samplefile:
keep_samples = parseSampleFile(self.args.samplefile)
vcf_reader = VCF(
inputfile,
mode='rb',
gts012=True,
lazy=True,
samples=keep_samples)
else:
vcf_reader = VCF(inputfile, mode='rb', gts012=True, lazy=True)
nbp = (self.args.length - 1) // 2
# index samples
if (self.args.samplefile and self.args.groupvar):
all_samples = vcf_reader.samples
sg_dict = indexGroups(self.args.samplefile, self.args.groupvar)
samples = sorted(list(set(sg_dict.values())))
def variants():
vcf = VCF(str(HERE.parent.joinpath('examples', 'sample.vcf')), gts012 = True)
return list(vcf)
This implies that some compounds will have the status 'not_loaded'=True.
When loading all variants for a region then all variants should
have status 'not_loaded'=False.
"""
adapter = real_populated_database
variant_type = "clinical"
category = "snv"
## GIVEN a database without any variants
assert adapter.variant_collection.find_one() is None
institute_obj = adapter.institute_collection.find_one()
institute_id = institute_obj["_id"]
## WHEN loading variants into the database without updating compound information
vcf_obj = VCF(variant_clinical_file)
rank_results_header = parse_rank_results_header(vcf_obj)
vep_header = parse_vep_header(vcf_obj)
individual_positions = {}
for i, ind in enumerate(vcf_obj.samples):
individual_positions[ind] = i
variants = []
for i, variant in enumerate(vcf_obj):
parsed_variant = parse_variant(
variant=variant,
case=case_obj,
variant_type="clinical",
rank_results_header=rank_results_header,
vep_header=vep_header,
individual_positions=individual_positions,
def run_benchmark_vcf(args):
before = time.perf_counter()
records = cyvcf2.VCF(args.input)
duration = time.perf_counter() - before
print("Read BCF header in {:.2f} seconds".format(duration))
before = time.perf_counter()
count = 0
for record in records:
count += 1
duration = time.perf_counter() - before
print("Read {} VCF records in {:.2f} seconds".format(count, duration))
def convert(
vcf_file, ancestral_states_file, pedigree_file, output_file, max_variants=None,
show_progress=False):
git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
git_provenance = {
"repo": "git@github.com:mcveanlab/treeseq-inference.git",
"hash": git_hash.decode().strip(),
"dir": "human-data",
"notes:": (
"Use the Makefile to download and process the upstream data files")}
with tsinfer.SampleData(path=output_file, num_flush_threads=2) as sample_data:
pop_id_map = add_populations(sample_data)
vcf = cyvcf2.VCF(vcf_file)
individual_names = list(vcf.samples)
vcf.close()
with open(pedigree_file, "r") as ped_file:
add_samples(ped_file, pop_id_map, individual_names, sample_data)
ancestral_states = read_ancestral_states(ancestral_states_file, show_progress)
iterator = variants(vcf_file, ancestral_states, show_progress, max_variants)
for site in iterator:
sample_data.add_site(
position=site.position, genotypes=site.genotypes,
alleles=site.alleles, metadata=site.metadata)
sample_data.record_provenance(
command=sys.argv[0], args=sys.argv[1:], git=git_provenance)
def variants(vcf_path, ancestral_states, show_progress=False, max_sites=None):
"""
Yield a tuple of position, alleles, genotypes, metadata. Ancestral_states is a
dictionary mapping RSID to the ancestral allele.
"""
tot_sites = vcf_num_rows(vcf_path)
progress = tqdm.tqdm(
total=tot_sites, desc="Read genotypes", disable=not show_progress)
vcf = cyvcf2.VCF(vcf_path)
sites_used = 0
non_biallelic = 0
no_ancestral_state = 0
missing_data = 0
indels = 0
unphased = 0
invariant = 0
num_diploids = len(vcf.samples)
num_samples = 2 * num_diploids
j = 0
for row in filter_duplicates(vcf):
progress.update()
# only use biallelic sites with data for all samples, where ancestral state is known
# and we have an RSID.
def vcf_file_geno_lines(path, mode="genotyped", variant_mapping=None, whitelist=None, skip_palindromic=False, liftover_conversion=None):
logging.log(9, "Processing vcf %s", path)
vcf_reader = VCF(path)
is_dict_mapping = variant_mapping is not None and type(variant_mapping) == dict
for variant in vcf_reader:
chr = variant.CHROM
pos = variant.POS
variant_id = variant.ID
ref = variant.REF
alts = variant.ALT
if liftover_conversion:
chr_, pos_ = chr, pos
chr, pos = liftover_conversion(chr, pos)
if chr == "NA" or pos == "NA":
continue
def _get_hits(coords, annotation, parser_type, _parsers=PARSERS):
"""Retrieve BED information, recovering if BED annotation file does have a chromosome.
"""
try:
parser = _parsers[parser_type]
except KeyError:
raise ValueError("Unexpected parser type: %s" % parser)
chrom, start, end = coords
if isinstance(annotation, pysam.VariantFile):
return annotation.fetch(chrom, start, end)
elif isinstance(annotation, cyvcf2.VCF):
return annotation("%s:%d-%d" % (chrom, start - 1, end))
try:
hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
# catch invalid region errors raised by ctabix
except ValueError:
hit_iter = []
# recent versions of pysam return KeyError
except KeyError:
hit_iter = []
except:
print(annotation.__class__, file=sys.stderr)
raise
return hit_iter
if filters.get('sv_types'):
sv_types = set(filters['sv_types'])
logger.info("Get variants from {0}".format(vcf_file_path))
if filters.get('range'):
range_str = "{0}:{1}-{2}".format(
filters['range']['chromosome'],
filters['range']['start'],
filters['range']['end'])
vcf = VCF(vcf_file_path)
handle = vcf(range_str)
else:
handle = VCF(vcf_file_path)
for variant in handle:
variant_line = str(variant)
keep_variant = True
if genes and keep_variant:
keep_variant = False
for gene in genes:
if "{0}".format(gene) in variant_line:
keep_variant = True
break
if consequences and keep_variant:
keep_variant = False
for consequence in consequences:
if consequence in variant_line: