How to use biothings - 10 common examples

To help you get started, we’ve selected a few biothings examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biothings / mygene.info / src / dataload / data_dump / dl_entrez.py View on Github external
sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': TIMESTAMP,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
github biothings / myvariant.info / src / dataload / sources / dbnsfp / dbnsfp_parser.py View on Github external
"fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
github biothings / myvariant.info / src / hub / dataload / sources / dbnsfp / dbnsfp_parser.py View on Github external
"hgvs": [i for i in df["clinvar_hgvs"].split("|") if i != "."],
                "omim": [i for i in df["clinvar_OMIM_id"].split("|") if i != "."],
                "medgen": [i for i in df["clinvar_MedGen_id"].split("|") if i != "."],
                "orphanet": [i for i in df["clinvar_Orphanet_id"].split("|") if i != "."],
                "var_source": [i for i in df["clinvar_var_source"].split("|") if i != "."]

            },
            "hgvsc": list(set(df["HGVSc_ANNOVAR"].split(';') + df["HGVSc_snpEff"].split(';') + df["HGVSc_VEP"].split(';'))),
            "hgvsp": list(set(df["HGVSp_ANNOVAR"].split(';') + df["HGVSp_snpEff"].split(';') + df["HGVSp_VEP"].split(';'))),
            "gtex": list(gtex),
            "geuvadis_eqtl_target_gene": df["Geuvadis_eQTL_target_gene"]
        }
    }
    if include_gnomad:
        one_snp_json['dbnsfp'].update(gnomad)
    one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", '-', "NA", None], remove_invalid_list=True), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
github biothings / mygene.info / src / hub / dataload / sources / pharos / parser.py View on Github external
def load_data(input_file):
    """Yield one pharos document per gene id found in *input_file*.

    Each line of the file is split on ``,`` into exactly two fields: a
    pharos id followed by a gene id.  Rows whose second field is
    ``'entrez_gene_id'`` (apparently the header row) or ``'0'`` are
    ignored.  The remaining pharos ids are converted to ``int`` and
    grouped by gene id.

    Every yielded document has the shape
    ``{'_id': <gene id>, 'pharos': {'target_id': [<pharos ids>]}}`` and is
    passed through ``unlist`` before being yielded.

    Raises ``ValueError`` when a line does not split into exactly two
    fields or when the pharos id is not an integer.
    """
    targets_by_gene = defaultdict(list)
    with open_anyfile(input_file) as in_f:
        # First pass: collect every pharos id under its gene id.
        for raw_line in in_f:
            pharos_id, _id = raw_line.strip().split(',')
            if _id in ('entrez_gene_id', '0'):
                continue
            targets_by_gene[str(_id)].append(int(pharos_id))
        # Second pass: emit one document per gene id, in insertion order.
        for gene_id, target_ids in targets_by_gene.items():
            yield unlist({'_id': str(gene_id),
                          'pharos': {"target_id": target_ids}})
github biothings / mygene.info / src / dataload / data_dump / dl_uniprot.py View on Github external
def download(no_confirm=False):
    """Download the file at DATAFILE_PATH from FTP_SERVER into DATA_FOLDER.

    The working directory is switched to DATA_FOLDER for the duration of
    the call and always restored afterwards, even on error.

    If a file with the target name already exists it is removed first,
    either unconditionally (``no_confirm=True``) or after ``ask`` returns
    ``'Y'``.  Any other answer logs "Skipped!" and returns without
    downloading.

    The download itself is delegated to the external ``wget`` program.
    The outcome is only logged; the function always returns ``None``.

    :param no_confirm: when true, replace an existing file without asking.
    """
    # Local import keeps this block self-contained; subprocess is stdlib.
    import subprocess

    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        filename = os.path.basename(DATAFILE_PATH)
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...', filename)
        url = 'ftp://{}/{}'.format(FTP_SERVER, DATAFILE_PATH)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
        # Pass an argument list instead of a shell string so that spaces or
        # shell metacharacters in the URL/filename cannot alter the command,
        # and so that the value we log is wget's real exit code rather than
        # the encoded wait status os.system() returns.
        try:
            return_code = subprocess.call(['wget', url, '-O', filename])
        except OSError as err:
            # wget is missing or not executable; keep the original
            # best-effort behavior of logging rather than raising.
            logging.error("Failed to run wget (%s).", err)
        else:
            if return_code == 0:
                logging.info("Success.")
            else:
                logging.error("Failed with return code (%s).", return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
github biothings / mygene.info / src / dataload / data_dump / dl_refseq.py View on Github external
refseq_release = get_refseq_release()
    logging.info(refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
github biothings / myvariant.info / src / dataload / contrib / dbnsfp / dbnsfp_parser.py View on Github external
"fin_ac": fields[122],
                "fin_af": fields[123],
                "nfe_ac": fields[124],
                "nfe_af": fields[125],
                "sas_ac": fields[126],
                "sas_af": fields[127]
            },
            "clinvar": {
                "rs": fields[128],
                "clinsig": fields[129],
                "trait": fields[130]
            }
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
github biothings / mygene.info / src / hub / dataload / sources / ucsc / parser.py View on Github external
[int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq,[]).append({
            'transcript' : refseq,
            'chr': chr,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons
        })

    gene2exons = {}
    reflink_file = os.path.join(data_folder, '../hgFixed/database/refLink.txt.gz')
    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: ref2exons[refseq]}
            else:
                gene2exons[geneid][exons_key].extend(ref2exons[refseq])

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
github biothings / mygene.info / src / hub / dataload / sources / exac / parser.py View on Github external
def load_broadinstitute_exac_any(one_file,key):
    logging.info("Loading file %s (%s)" % (one_file,key))
    data = tab2dict(one_file, (0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21), 0)
    exacs = {}
    for transcript in data:
        tupleexac = data[transcript]
        # remove version in key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {"exac" : 
                {
                    "transcript" : transcript,  # but keep version here
                    "n_exons" : int(tupleexac[0]),
                    "cds_start" : int(tupleexac[1]),
                    "cds_end" : int(tupleexac[2]),
                    "bp" : int(tupleexac[3]),
                    key : {
                        "mu_syn" : float(tupleexac[4]),
                        "mu_mis" : float(tupleexac[5]),
                        "mu_lof" : float(tupleexac[6]),
                        "n_syn" : float(tupleexac[7]),
github biothings / mygene.info / src / hub / dataload / sources / homologene / parser.py View on Github external
def load(self, aslist=False):
        '''
        loading ncbi "homologene.data" file
        adding "homologene" field in gene doc
        '''
        from biothings.utils.hub_db import get_src_dump
        homo_d = tab2dict(self.datafile,(2,1),0,header=0)
        entrez_doc = get_src_dump().find_one({"_id":"entrez"}) or {}
        entrez_dir = entrez_doc.get("download",{}).get("data_folder")
        assert entrez_dir, "Can't find Entrez data directory"
        DATAFILE = os.path.join(entrez_dir, 'gene_history.gz')
        assert os.path.exists(DATAFILE), "gene_history.gz is missing (entrez_dir: %s)" % entrez_dir
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,includefn=lambda ld: ld[1] != '-')
        for id in list(homo_d.keys()):
            homo_d[retired2gene.get(id,id)] = homo_d[id]

        with open(self.datafile) as df:
            homologene_d = {}
            doc_li = []
            print()
            geneid_d = get_geneid_d(entrez_dir, self.species_li,load_cache=False,save_cache=False,only_for=homo_d)

            for line in df:
                ld = line.strip().split('\t')
                hm_id, tax_id, geneid = [int(x) for x in ld[:3]]
                if (self.taxid_set is None or tax_id in self.taxid_set) and geneid in geneid_d:
                    # for selected species only
                    # and also ignore those geneid does not match any
                    # existing gene doc