Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_base_feature_props():
class _FakeFeaturier(BaseFeaturizer):
    """Minimal ``BaseFeaturizer`` subclass used as a test fixture.

    ``featurize`` echoes its first positional argument and the only
    feature label is ``'labels'``.
    """

    def __init__(self):
        super().__init__()

    def featurize(self, *x, **kwargs):
        """Return the first positional argument unchanged; kwargs are ignored."""
        return x[0]

    @property
    def feature_labels(self):
        """Fixed single-element label list."""
        return ['labels']
bf = _FakeFeaturier()
with pytest.raises(ValueError, match='`on_errors`'):
bf.on_errors = 'illegal'
with pytest.raises(ValueError, match='`return_type`'):
# Ignore warnings whose message matches "numpy.dtype size changed" or
# "numpy.ndarray size changed".
# NOTE(review): presumably emitted when a compiled extension was built against
# a different NumPy version -- confirm before removing these filters.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ndarray size changed")
class _FakeFeaturier1(BaseFeaturizer):
    """Test featurizer that echoes its input under the label ``'label1'``."""

    def __init__(self, n_jobs=1):
        # Only ``n_jobs`` is forwarded; every other base-class option keeps
        # its default.
        super().__init__(n_jobs=n_jobs)

    def featurize(self, *x, **kwargs):
        """Return the first positional argument unchanged; kwargs are ignored."""
        return x[0]

    @property
    def feature_labels(self):
        """Fixed single-element label list."""
        return ['label1']
class _FakeFeaturier2(BaseFeaturizer):
    """Test featurizer that echoes its input under the label ``'label2'``."""

    def __init__(self, n_jobs=1):
        # Only ``n_jobs`` is forwarded; every other base-class option keeps
        # its default.
        super().__init__(n_jobs=n_jobs)

    def featurize(self, *x, **kwargs):
        """Return the first positional argument unchanged; kwargs are ignored."""
        return x[0]

    @property
    def feature_labels(self):
        """Fixed single-element label list."""
        return ['label2']
class _FakeFeaturier3(BaseFeaturizer):
def __init__(self, n_jobs=1):
super().__init__(n_jobs=n_jobs)
def test_base_feature_3(data):
    """Check that an error raised in ``featurize`` surfaces from ``fit_transform``.

    The featurizer is created with its default ``on_errors='raise'``, so the
    ``ValueError`` thrown by ``featurize`` is expected to reach the caller.

    ``data`` is accepted (presumably a pytest fixture -- confirm) but is not
    used in the body.
    """

    class _ErrorFeaturier(BaseFeaturizer):
        """Featurizer whose ``featurize`` unconditionally raises ``ValueError``."""

        def __init__(self, n_jobs=1, on_errors='raise'):
            super().__init__(n_jobs=n_jobs, on_errors=on_errors)

        def featurize(self, *x):
            raise ValueError()

        @property
        def feature_labels(self):
            return ['labels']

    featurizer = _ErrorFeaturier()
    assert isinstance(featurizer, BaseFeaturizer)
    with pytest.raises(ValueError):
        featurizer.fit_transform([1, 2, 3, 4])
# Copyright (c) 2019. stewu5. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from xenonpy.descriptor.base import BaseFeaturizer
class Mordred2DDescriptor(BaseFeaturizer):
def __init__(self, *, on_errors='raise', return_type='any'):
    """Initialise the descriptor with the base-class ``n_jobs`` pinned to 0.

    Parameters
    ----------
    on_errors:
        Forwarded unchanged to ``BaseFeaturizer.__init__``.
    return_type:
        Forwarded unchanged to ``BaseFeaturizer.__init__``.
    """
    # fix n_jobs to be 0 to skip automatic wrapper in XenonPy BaseFeaturizer class
    super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type)
    self.output = None
    self.__authors__ = ['Stephen Wu', 'TsumiNa']
def featurize(self, x):
# check if type(x) = list
if isinstance(x, pd.Series):
x = x.tolist()
if not isinstance(x, list):
x = [x]
# check input format, assume SMILES if not RDKit-MOL
if not isinstance(x[0], Chem.rdchem.Mol):
x_mol = []
# Copyright (c) 2019. yoshida-lab. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
import re
import numpy as np
import pymatgen as pm
from pymatgen.analysis.local_env import VoronoiNN
from xenonpy.descriptor.base import BaseDescriptor, BaseFeaturizer
__all__ = ['RadialDistributionFunction', 'OrbitalFieldMatrix', 'Structures']
class RadialDistributionFunction(BaseFeaturizer):
"""
Calculate pair distribution descriptor for machine learning.
"""
@property
def feature_labels(self):
    """Labels made by stringifying every entry of ``self._interval`` except the first."""
    return [str(d) for d in self._interval[1:]]
def __init__(self, n_bins=201, r_max=20.0, *, n_jobs=-1, on_errors='raise', return_type='any'):
"""
Parameters
----------
n_bins: int
Number of radial grid points.
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
if self.input_type == 'any':
if not isinstance(x, Chem.rdchem.Mol):
x_ = x
x = Chem.MolFromSmiles(x)
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
return list(rdMol.GetMorganFingerprintAsBitVect(x, self.radius, nBits=self.n_bits))
@property
def feature_labels(self):
    """One label per bit index: ``'ecfp3:<i>'`` for ``i`` in ``range(self.n_bits)``."""
    return ['ecfp3:' + str(i) for i in range(self.n_bits)]
class DescriptorFeature(BaseFeaturizer):
def __init__(self, n_jobs=-1, *, input_type='mol', on_errors='raise', return_type='any'):
"""
All descriptors in RDKit (length = 200) [may include NaN]
see https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors for the full list
Parameters
----------
n_jobs: int
The number of jobs to run in parallel for both fit and predict.
Can be -1 or the number of CPUs. Set -1 to use all CPU cores (default).
input_type: string
Set the specific type of transform input.
Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
When set to ``smiles``, the ``transform`` method can use a SMILES list as input.
Set to ``any`` to use both.
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
if self.input_type == 'any':
if not isinstance(x, Chem.rdchem.Mol):
x_ = x
x = Chem.MolFromSmiles(x)
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
return list(MAC.GenMACCSKeys(x))
@property
def feature_labels(self):
    """Fixed list of 167 labels, ``'maccs:0'`` through ``'maccs:166'``."""
    return ['maccs:' + str(i) for i in range(167)]
class FCFP(BaseFeaturizer):
def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, input_type='mol', on_errors='raise', return_type='any'):
"""
Morgan (Circular) fingerprints + feature-based (FCFP)
The algorithm used is described in the paper Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints.
JCIM 50:742-54 (2010)
Parameters
----------
n_jobs: int
The number of jobs to run in parallel for both fit and predict.
Can be -1 or the number of CPUs. Set -1 to use all CPU cores (default).
radius: int
The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in FCFP,
i.e., radius=2 is roughly equivalent to FCFP4.
n_bits: int
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
if self.input_type == 'any':
if not isinstance(x, Chem.rdchem.Mol):
x_ = x
x = Chem.MolFromSmiles(x)
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
return list(rdMol.GetHashedTopologicalTorsionFingerprintAsBitVect(x, nBits=self.n_bits))
@property
def feature_labels(self):
    """One label per bit index: ``'ttfp:<i>'`` for ``i`` in ``range(self.n_bits)``."""
    return ['ttfp:' + str(i) for i in range(self.n_bits)]
class MACCS(BaseFeaturizer):
def __init__(self, n_jobs=-1, *, input_type='mol', on_errors='raise', return_type='any'):
"""
The MACCS keys for a molecule. The result is a 167-bit vector. There are 166 public keys,
but to maintain consistency with other software packages they are numbered from 1.
Parameters
----------
n_jobs: int
The number of jobs to run in parallel for both fit and predict.
Can be -1 or the number of CPUs. Set -1 to use all CPU cores (default).
input_type: string
Set the specific type of transform input.
Set to ``mol`` (default) to use ``rdkit.Chem.rdchem.Mol`` objects as input.
When set to ``smiles``, the ``transform`` method can use a SMILES list as input.
Set to ``any`` to use both.
raise ValueError('can not convert Mol from SMILES %s' % x_)
if self.input_type == 'any':
if not isinstance(x, Chem.rdchem.Mol):
x_ = x
x = Chem.MolFromSmiles(x)
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
return list(Chem.RDKFingerprint(x, fpSize=self.fp_size))
@property
def feature_labels(self):
    """One label per bit index: ``'rdkit:<i>'`` for ``i`` in ``range(self.fp_size)``."""
    return ["rdkit:" + str(i) for i in range(self.fp_size)]
class AtomPairFP(BaseFeaturizer):
def __init__(self, n_jobs=-1, *, n_bits=2048, input_type='mol', on_errors='raise', return_type='any'):
"""
Atom Pair fingerprints.
Returns the atom-pair fingerprint for a molecule. The algorithm used is described here:
R.E. Carhart, D.H. Smith, R. Venkataraghavan;
"Atom Pairs as Molecular Features in Structure-Activity Studies: Definition and Applications"
JCICS 25, 64-73 (1985).
This is currently just in binary bits with fixed length after folding.
Parameters
----------
n_jobs: int
The number of jobs to run in parallel for both fit and predict.
Can be -1 or the number of CPUs. Set -1 to use all CPU cores (default).
n_bits: int
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
if self.input_type == 'any':
if not isinstance(x, Chem.rdchem.Mol):
x_ = x
x = Chem.MolFromSmiles(x)
if x is None:
raise ValueError('can not convert Mol from SMILES %s' % x_)
return list(rdMol.GetMorganFingerprintAsBitVect(x, self.radius, nBits=self.n_bits, useFeatures=True))
@property
def feature_labels(self):
    """One label per bit index: ``'fcfp3:<i>'`` for ``i`` in ``range(self.n_bits)``."""
    return ['fcfp3:' + str(i) for i in range(self.n_bits)]
class ECFP(BaseFeaturizer):
def __init__(self, n_jobs=-1, *, radius=3, n_bits=2048, input_type='mol', on_errors='raise', return_type='any'):
"""
Morgan (Circular) fingerprints (ECFP)
The algorithm used is described in the paper Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints.
JCIM 50:742-54 (2010)
Parameters
----------
n_jobs: int
The number of jobs to run in parallel for both fit and predict.
Can be -1 or the number of CPUs. Set -1 to use all CPU cores (default).
radius: int
The radius parameter in the Morgan fingerprints, which is roughly half of the diameter parameter in ECFP,
i.e., radius=2 is roughly equivalent to ECFP4.
n_bits: int