Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
data.columns.tolist())
# Keep one row per integer formula: the row that sorts first by 'e_above_hull'.
data['composition'] = data['structure'].apply(
    lambda structure: structure.composition
)
data['integer_formula'] = data['composition'].apply(
    lambda comp: comp.get_integer_formula_and_factor()[0]
)
# Ascending sort followed by keep='first' means the surviving row for each
# formula is the one with the smallest 'e_above_hull'.
data.sort_values(by='e_above_hull', ascending=True, inplace=True)
data.drop_duplicates(subset='integer_formula', keep='first', inplace=True)
n_unique = len(data)
print('Reduced dataset to {} unique compositions.'.format(n_unique))
data.reset_index(drop=True, inplace=True)

# Bundle the composition featurizers into a single MultipleFeaturizer.
composition_featurizers = [
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True),
]
featurizer = MultipleFeaturizer(composition_featurizers)

# Featurize every composition with n_jobs set to 1.
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Two-step pipeline: an imputer followed by a random-forest regressor.
# NOTE(review): if `Imputer` is sklearn.preprocessing.Imputer, it was removed
# in scikit-learn 0.22 in favour of sklearn.impute.SimpleImputer -- confirm
# the pinned scikit-learn version.
pipeline_steps = [
    ('imputer', Imputer()),
    ('model', RandomForestRegressor()),
]
model = Pipeline(steps=pipeline_steps)
target = data['formation_energy_per_atom']
model.fit(X, target)
print('Trained a RandomForest model')
# Report the size of the loaded table and list its columns.
loaded_summary = 'Loaded {} rows with {} columns:'.format(
    len(data), len(data.columns)
)
print(loaded_summary, data.columns.tolist())

# Derive a composition and an integer formula for every structure, then keep
# only the first row per formula after sorting by 'e_above_hull' ascending.
data['composition'] = data['structure'].apply(lambda s: s.composition)
data['integer_formula'] = data['composition'].apply(
    lambda c: c.get_integer_formula_and_factor()[0]
)
data.sort_values(by='e_above_hull', ascending=True, inplace=True)
data.drop_duplicates(subset='integer_formula', keep='first', inplace=True)
reduced_summary = 'Reduced dataset to {} unique compositions.'.format(len(data))
print(reduced_summary)
data.reset_index(drop=True, inplace=True)

# The featurizer takes the composition column as its input.
featurizer = MultipleFeaturizer(
    [
        cf.Stoichiometry(),
        cf.ElementProperty.from_preset('magpie'),
        cf.ValenceOrbital(props=['frac']),
        cf.IonProperty(fast=True),
    ]
)

# Compute the feature matrix (n_jobs is set to 1 first).
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Imputer -> RandomForestRegressor pipeline, fit on the formation energies.
# NOTE(review): `Imputer` presumably comes from sklearn.preprocessing, where it
# was removed in scikit-learn 0.22 (replaced by sklearn.impute.SimpleImputer)
# -- confirm against the file's imports and the pinned version.
imputer_step = ('imputer', Imputer())
regressor_step = ('model', RandomForestRegressor())
model = Pipeline(steps=[imputer_step, regressor_step])
model.fit(X, data['formation_energy_per_atom'])
print('Trained a RandomForest model')
def debug(self):
    """Pass a single magpie-preset ``ElementProperty`` to ``_get_featurizers``.

    Returns:
        Whatever ``self._get_featurizers`` returns for that one-element list.
    """
    magpie_only = [cf.ElementProperty.from_preset("magpie")]
    return self._get_featurizers(magpie_only)
def all(self):
    """Pass the full list of composition featurizers to ``_get_featurizers``.

    The list is assembled in a fixed order: ``AtomicOrbitals``, the four
    ``ElementProperty`` presets, then the remaining featurizers.

    Returns:
        Whatever ``self._get_featurizers`` returns for that list.
    """
    element_property_presets = ("matminer", "magpie", "matscholar_el", "deml")

    featurizers = [cf.AtomicOrbitals()]
    # One ElementProperty featurizer per preset, in the order listed above.
    featurizers.extend(
        cf.ElementProperty.from_preset(preset)
        for preset in element_property_presets
    )
    featurizers.extend(
        [
            cf.Meredig(),
            cf.ElementFraction(),
            cf.Stoichiometry(),
            cf.TMetalFraction(),
            cf.BandCenter(),
            cf.ValenceOrbital(),
            cf.YangSolidSolution(),
            cf.CationProperty.from_preset(preset_name="deml"),
            cf.OxidationStates.from_preset(preset_name="deml"),
            cf.ElectronAffinity(),
            cf.ElectronegativityDiff(),
            cf.IonProperty(fast=True),
            cf.Miedema(),
            cf.AtomicPackingEfficiency(),  # slower than the rest
            cf.CohesiveEnergy(),  # requires mpid present
        ]
    )
    return self._get_featurizers(featurizers)