How to use biopandas - 10 common examples

To help you get started, we’ve selected a few biopandas examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github rasbt / screenlamp / tools / sort_rocs_mol2.py View on Github external
if sortby:
        df.sort_values(sortby, inplace=True, ascending=False)

    if selection:
        selection_str = parse_selection_string(selection, df_name='df')
        mask = pd.eval(selection_str)
        df = df[mask]

    dbase_query_pairs = [(d, q) for d, q in
                         zip(df['Name'].values, df['ShapeQuery'].values)]
    query_names = {q for q in df['ShapeQuery'].values}

    query_mol2s = {}

    multiconf_query = False
    for idx, cont in enumerate(split_multimol2(query_path)):
        if idx >= 1:
            multiconf_query = True
            break

    cnt = -1

    if query_path.endswith('.gz'):
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            cont = b''.join(cont).decode('utf-8').split('\n')
            if multiconf_query:
                mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
            else:
                mol_idx = id_
            if mol_idx in query_names:
                if id_suffix:
github rasbt / screenlamp / tools / sort_rocs_mol2.py View on Github external
if query_path.endswith('.gz'):
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            cont = b''.join(cont).decode('utf-8').split('\n')
            if multiconf_query:
                mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
            else:
                mol_idx = id_
            if mol_idx in query_names:
                if id_suffix:
                    cont[1] = mol_idx + '\n'
                query_mol2s[mol_idx] = ''.join(cont)

    else:
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            if multiconf_query:
                mol_idx = '%s_%d' % (id_, cnt)
            else:
                mol_idx = id_
            if mol_idx in query_names:
                if id_suffix:
                    cont[1] = mol_idx + '\n'
                query_mol2s[mol_idx] = ''.join(cont)

    out_path_base = os.path.join(output_dir, os.path.basename(inp_mol2_path)
                                 .split('.mol2')[0])
    out_path_q = '%s_%s' % (out_path_base, 'query.mol2')
    out_path_d = '%s_%s' % (out_path_base, 'dbase.mol2')

    with tempfile.TemporaryDirectory() as tmpdirname:
github rasbt / screenlamp / tools / enumerate_conformers.py View on Github external
if query_path.endswith('.gz'):
        for id_, cont in split_multimol2(query_path):
            cnt += 1
            cont = b''.join(cont).decode('utf-8').split('\n')
            if multiconf_query:
                mol_idx = '%s_%d' % (id_.decode('utf-8'), cnt)
            else:
                mol_idx = id_
    """

    with open_file(out_mol2_path, write_mode) as outfile:

        prev_molecule = ''

        if inp_mol2_path.endswith('.gz'):
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1

                mol_idx = b'%s_%d' % (id_, cnt)

                cont[1] = mol_idx + b'\n'
                outfile.write(b''.join(cont))
                prev_molecule = id_

        else:
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
github rasbt / screenlamp / tools / overlay_molecules_rocs.py View on Github external
def run_rocs(source_file, target_file, n_processes, settings):

    prefix = ''.join(target_file.split('.mol2')[:-1])

    sys.stdout.write('Processing %s\n' % os.path.basename(source_file))
    sys.stdout.flush()

    for idx, mol2 in enumerate(split_multimol2(QUERY_FILE)):
        if idx >= 1:
            mcquery = 'true'
            break
    if not idx:
        mcquery = 'false'

    cmd = [EXECUTABLE,
           '-ref', QUERY_FILE,
           '-dbase', source_file,
           '-outputquery', 'false',
           '-prefix', prefix,
           '-mcquery', mcquery,
           '-mpi_np', str(n_processes),
           '-oformat', 'mol2']

    if settings:
github rasbt / screenlamp / tools / enumerate_conformers.py View on Github external
if inp_mol2_path.endswith('.gz'):
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1

                mol_idx = b'%s_%d' % (id_, cnt)

                cont[1] = mol_idx + b'\n'
                outfile.write(b''.join(cont))
                prev_molecule = id_

        else:
            for i, (id_, cont) in enumerate(split_multimol2(inp_mol2_path)):
                if prev_molecule != id_:
                    cnt = 0
                else:
                    cnt += 1

                mol_idx = '%s_%d' % (id_, cnt)

                cont[1] = mol_idx + '\n'
                outfile.write(''.join(cont))
                prev_molecule = id_

    if verbose:
        elapsed = time.time() - start
        n_molecules = i + 1
        sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                         (n_molecules, n_molecules / elapsed))
github rasbt / screenlamp / tools / funcgroup_distance_to_id.py View on Github external
for mol2_file in mol2_files:
            if verbose:
                start = time.time()
                sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
                sys.stdout.flush()

            cnt = 0

            if mol2_file.endswith('.gz'):
                data_processor_fn = data_processor_gz
            else:
                data_processor_fn = data_processor

            for chunk in lazy_imap(data_processor=data_processor_fn,
                                   data_generator=split_multimol2(mol2_file),
                                   n_cpus=n_cpus):
                _ = [f.write('%s\n' % mol2_id)for mol2_id in chunk if mol2_id]
                cnt += len(chunk)

            if verbose:
                elapsed = time.time() - start
                sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
                sys.stdout.flush()
github rasbt / screenlamp / tools / funcgroup_matching_selection.py View on Github external
dbase_open_file(output_mol2_path_dbase, dbase_write_mode) as opd:
                for i in selection_indices:

                    mol2_q_cont = ('DID NOT FIND %s\n'
                                   % (df_atom.ix[i]['query']))

                    mol2_d_cont = ('DID NOT FIND %s\n'
                                   % (df_atom.ix[i]['dbase']))

                    for idx, mol2 in enumerate(split_multimol2(
                            input_mol2_path_query)):
                        if idx == i:
                            mol2_q_cont = mol2[1]
                            break

                    for idx, mol2 in enumerate(split_multimol2(
                            input_mol2_path_dbase)):
                        if idx == i:
                            mol2_d_cont = mol2[1]
                            break

                    if query_write_mode == 'wb':
                        opq.write(b''.join(mol2_q_cont))
                    else:
                        opq.write(''.join(mol2_q_cont))

                    if dbase_write_mode == 'wb':
                        opd.write(b''.join(mol2_d_cont))
                    else:
                        opd.write(''.join(mol2_d_cont))

        if verbose:
github rasbt / screenlamp / tools / funcgroup_matching.py View on Github external
if verbose:
        start = time.time()
        sys.stdout.write('Processing %s/%s' % (d_base, q_base))
        sys.stdout.flush()

    cnt = 0

    if q_path.endswith('.gz'):
        data_processor_fn = data_processor_gz
    else:
        data_processor_fn = data_processor

    for chunk in lazy_imap(data_processor=data_processor_fn,
                           data_generator=zip(split_multimol2(d_path),
                                              split_multimol2(q_path)),
                           n_cpus=n_cpus):

        for dbase_id, query_id, atoms, charges in chunk:
            dct_results['dbase'].append(dbase_id)
            dct_results['query'].append(query_id)
            dct_results['atoms'].append(atoms)
            dct_results['charges'].append(charges)

        cnt += len(chunk)
    """

    q_pdmol = PandasMol2()
    d_pdmol = PandasMol2()

    for q_mol2, d_mol2 in zip(split_multimol2(q_path),
                              split_multimol2(d_path)):
github rasbt / screenlamp / tools / funcgroup_presence_to_id.py View on Github external
for mol2_file in mol2_files:
            if verbose:
                start = time.time()
                sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
                sys.stdout.flush()

            cnt = 0

            if mol2_file.endswith('.gz'):
                data_processor_fn = data_processor_gz
            else:
                data_processor_fn = data_processor

            for chunk in lazy_imap(data_processor=data_processor_fn,
                                   data_generator=split_multimol2(
                                      mol2_file),
                                   n_cpus=n_cpus):

                _ = [f.write('%s\n' % mol2_id) for mol2_id
                     in chunk if mol2_id]
                cnt += len(chunk)

            if verbose:
                elapsed = time.time() - start
                sys.stdout.write(' | %d mol/sec\n' % (cnt / elapsed))
                sys.stdout.flush()
github rasbt / screenlamp / tools / mol2_to_id.py View on Github external
def mol2_to_idfile(mol2_files, id_file_path, verbose=0):
    """Write the ID of every molecule found in `mol2_files` to a text file.

    Each input file is streamed through `split_multimol2`, and the first
    element of every yielded item (the molecule ID) is written to
    `id_file_path`, one ID per line, in the order encountered.

    Parameters
    ----------
    mol2_files : iterable of str
        Paths of the (multi-)MOL2 files to scan.
    id_file_path : str
        Path of the output text file; it is created or overwritten.
    verbose : int or bool (default: 0)
        If truthy, progress and throughput are printed to stdout for
        each input file.

    Returns
    -------
    None

    """
    with open(id_file_path, 'w') as f:
        for mol2_file in mol2_files:

            if verbose:
                sys.stdout.write('Processing %s' % os.path.basename(mol2_file))
                sys.stdout.flush()
                start = time.time()

            # Count explicitly instead of reusing the loop index, so the
            # summary below is correct (and defined) even if the generator
            # yields nothing for this file.
            n_molecules = 0
            for mol2 in split_multimol2(mol2_file):
                mol2_id = mol2[0]
                # For gzipped inputs the IDs arrive as bytes; the output
                # file is opened in text mode, so decode before writing.
                if isinstance(mol2_id, bytes):
                    mol2_id = mol2_id.decode('utf-8')
                f.write(mol2_id + '\n')
                n_molecules += 1

            if verbose:
                elapsed = time.time() - start
                # Guard against a zero reading from a coarse timer.
                rate = n_molecules / elapsed if elapsed > 0 else 0
                sys.stdout.write(' | scanned %d molecules | %d mol/sec\n' %
                                 (n_molecules, rate))
                sys.stdout.flush()

biopandas

Machine Learning Library Extensions

BSD-3-Clause
Latest version published 4 months ago

Package Health Score

74 / 100
Full package analysis

Similar packages