"verbosity": -1}
# "seed": 8888
# folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=8888)
# idx = y_train.argsort()
# y_lab = np.repeat(list(range(50000 // 20)), 20)
# y_lab = np.asarray(sorted(list(zip(idx, y_lab))))[:, -1].astype(np.int32)
# splits = folds.split(X_train, y_lab)
oof_lgb = np.zeros(len(X_train))         # out-of-fold predictions
predictions_lgb = np.zeros(len(X_test))  # test predictions, averaged over folds
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
splits = folds.split(X_train, y_train)
for fold_, (trn_idx, val_idx) in enumerate(splits):
print("fold n°{}".format(fold_ + 1))
trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
num_round = 20000
clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
early_stopping_rounds=100)
oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
print("MAE CV score: {:<8.8f}".format(1/(mean_absolute_error(oof_lgb, y_train)+1)))
print(predictions_lgb)
np.save('val.mse_lgb.npy', oof_lgb)
np.save('test.mse_lgb.npy', predictions_lgb)
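# The saved OOF/test arrays can later be reloaded as features for a stacking
# model. A minimal sketch; the second-level Ridge model is an assumption, not
# part of the original pipeline.
from sklearn.linear_model import Ridge

oof_feat = np.load('val.mse_lgb.npy').reshape(-1, 1)
test_feat = np.load('test.mse_lgb.npy').reshape(-1, 1)
stacker = Ridge(alpha=1.0).fit(oof_feat, y_train)
final_pred = stacker.predict(test_feat)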
def suggest_learning_rate(self, X, y, max_boost_round):
lr = [0.01, 0.02, 0.03, 0.04, 0.05]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
params = self.setParams(self.default_hyper_param)
max_round = max_boost_round // 500
auc = np.zeros([len(lr), max_round])
for i in range(len(lr)):
print('learning rate: %.2f' % lr[i])
params['learning_rate'] = lr[i]
train_data = lgb.Dataset(X_train, y_train, free_raw_data=False)
clf = None
for j in range(max_round):
clf = lgb.train(params, train_data, num_boost_round=500, init_model=clf, keep_training_booster=True)
# validation AUC with a heuristic adjustment: larger learning rates are
# penalized and longer training is slightly rewarded
auc[i, j] = roc_auc_score(y_valid, clf.predict(X_valid)) - lr[i] * 0.1 + j * 0.001
print(auc)
idx = np.argmax(auc)
best_lr = lr[idx // max_round]
best_boost_round = (idx % max_round + 1) * 500
return best_lr, best_boost_round
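# Hypothetical harness for suggest_learning_rate; the LGBTuner stub and its
# members are assumptions standing in for the original (not shown) class.
from sklearn.metrics import roc_auc_score

class LGBTuner:
    default_hyper_param = {'objective': 'binary', 'verbosity': -1}
    def setParams(self, p):
        return dict(p)
    suggest_learning_rate = suggest_learning_rate  # reuse the function above

tuner = LGBTuner()
# Given some feature matrix X and binary labels y:
# best_lr, best_round = tuner.suggest_learning_rate(X, y, max_boost_round=1000)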
import lightgbm as lgb
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
X, y = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
n_estimators = 10
d_train = lgb.Dataset(X_train, label=y_train)
params = {
'boosting_type': 'dart',
'objective': 'binary',
}
clf = lgb.train(params, d_train, n_estimators)
y_pred = clf.predict(X_test)
clf.save_model('lg_dart_breast_cancer.model') # save the model in txt format
np.savetxt('lg_dart_breast_cancer_true_predictions.txt', y_pred)
np.savetxt('breast_cancer_test.tsv', X_test, delimiter='\t')
d = clf.dump_model()
import json
with open('lg_dart_breast_cancer.json', 'w') as fout:
json.dump(d, fout, indent=1)
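# Round-trip sanity check (a sketch): reload the text-format model with
# lgb.Booster(model_file=...) and compare against the predictions saved above.
bst = lgb.Booster(model_file='lg_dart_breast_cancer.model')
assert np.allclose(bst.predict(X_test),
                   np.loadtxt('lg_dart_breast_cancer_true_predictions.txt'))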
def test_plot_split_value_histogram(self):
gbm0 = lgb.train(self.params, self.train_data, num_boost_round=10)
ax0 = lgb.plot_split_value_histogram(gbm0, 27)
self.assertIsInstance(ax0, matplotlib.axes.Axes)
self.assertEqual(ax0.get_title(), 'Split value histogram for feature with index 27')
self.assertEqual(ax0.get_xlabel(), 'Feature split value')
self.assertEqual(ax0.get_ylabel(), 'Count')
self.assertLessEqual(len(ax0.patches), 2)
gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
gbm1.fit(self.X_train, self.y_train)
ax1 = lgb.plot_split_value_histogram(gbm1, gbm1.booster_.feature_name()[27], figsize=(10, 5),
title='Histogram for feature @index/name@ @feature@',
xlabel='x', ylabel='y', color='r')
self.assertIsInstance(ax1, matplotlib.axes.Axes)
self.assertEqual(ax1.get_title(),
'Histogram for feature name {}'.format(gbm1.booster_.feature_name()[27]))
self.assertEqual(ax1.get_xlabel(), 'x')
self.assertEqual(ax1.get_ylabel(), 'y')
self.assertLessEqual(len(ax1.patches), 2)
for patch in ax1.patches:
self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.)) # red
ax2 = lgb.plot_split_value_histogram(gbm0, 27, bins=10, color=['r', 'y', 'g', 'b'],
title=None, xlabel=None, ylabel=None)
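# Standalone usage sketch for plot_split_value_histogram, independent of the
# test fixtures above; the dataset and feature index are illustrative choices.
import matplotlib.pyplot as plt

X_plt, y_plt = datasets.load_breast_cancer(return_X_y=True)
bst_plt = lgb.train({'objective': 'binary', 'verbosity': -1},
                    lgb.Dataset(X_plt, y_plt), num_boost_round=10)
ax = lgb.plot_split_value_histogram(bst_plt, feature=27, bins=10)
plt.show()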
# Imports assumed for this fragment (sklearn-pandas and sklearn2pmml):
from sklearn_pandas import DataFrameMapper
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from lightgbm import LGBMClassifier

if binary:
    # collapse rear-wheel drive ("r") into "4" so the target has two classes
    data["drv"] = data["drv"].replace("r", "4")
numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []
mapper = DataFrameMapper(
[(numeric_features, [ContinuousDomain()])] +
[([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
[(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter())]) for f in text_features]
)
pipeline = PMMLPipeline([
("mapper", mapper),
("model", LGBMClassifier(n_estimators=1000))
])
pipeline.fit(data, data["drv"], model__categorical_feature=[3])
suffix = "binary" if binary else "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")
print(list(pipeline.predict(data[:10])))
print(list(pipeline.predict_proba(data[0:1])[0]))
from lightgbm import LGBMClassifier, LGBMRegressor

def _get_create_model(self, classification):
if classification:
model = LGBMClassifier()
else:
model = LGBMRegressor()
def create_model(x, y):
return model.fit(x, y)
return create_model
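# Usage sketch: _get_create_model never touches `self`, so it can be exercised
# standalone by passing None (illustrative only; normally it is a method).
create_model = _get_create_model(None, classification=True)
clf = create_model(X_train, y_train)  # fits and returns the LGBMClassifier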
def create_lightgbm_classifier(X, y):
lgbm = LGBMClassifier(boosting_type='gbdt', learning_rate=0.1,
max_depth=5, n_estimators=200, n_jobs=1, random_state=777)
model = lgbm.fit(X, y)
return model
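# Usage sketch for the helper above, reusing the breast-cancer split loaded
# earlier in this file.
clf_bc = create_lightgbm_classifier(X_train, y_train)
print(clf_bc.predict(X_test[:5]))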
# custom metric (disable default metric)
gbm = lgb.LGBMRegressor(metric='None',
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 1)
self.assertIn('error', gbm.evals_result_['training'])
# default metric for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1',
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('l1', gbm.evals_result_['training'])
self.assertIn('error', gbm.evals_result_['training'])
# non-default metric for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='mape',
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 2)
self.assertIn('mape', gbm.evals_result_['training'])
self.assertIn('error', gbm.evals_result_['training'])
# multiple metrics for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1', metric=['l1', 'gamma'],
**params).fit(eval_metric=constant_metric, **params_fit)
self.assertEqual(len(gbm.evals_result_['training']), 3)
self.assertIn('l1', gbm.evals_result_['training'])
self.assertIn('gamma', gbm.evals_result_['training'])
self.assertIn('error', gbm.evals_result_['training'])
# custom metric (disable default metric for non-default objective)
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='None',
**params).fit(eval_metric=constant_metric, **params_fit)
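# The assertions above rely on a constant_metric helper that is not shown.
# A plausible definition, matching the 'error' key and the sklearn-API custom
# eval_metric signature (name, value, is_higher_better), is assumed to be:
def constant_metric(y_true, y_pred):
    return 'error', 0.7, False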
# also report the training metric (eval_train_metric=True)
cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
nfold=3, stratified=False, shuffle=False,
metrics='l1', verbose_eval=False, eval_train_metric=True)
self.assertIn('train l1-mean', cv_res)
self.assertIn('valid l1-mean', cv_res)
self.assertNotIn('train l2-mean', cv_res)
self.assertNotIn('valid l2-mean', cv_res)
self.assertEqual(len(cv_res['train l1-mean']), 10)
self.assertEqual(len(cv_res['valid l1-mean']), 10)
# self-defined folds
tss = TimeSeriesSplit(3)
folds = tss.split(X_train)
cv_res_gen = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds,
verbose_eval=False)
cv_res_obj = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=tss,
verbose_eval=False)
np.testing.assert_allclose(cv_res_gen['l2-mean'], cv_res_obj['l2-mean'])
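# lgb.cv also accepts any iterable of (train_indices, test_indices) pairs; a
# hand-rolled two-fold sketch (the index arithmetic is illustrative):
n_rows = X_train.shape[0]
custom_folds = [(np.arange(0, n_rows // 2), np.arange(n_rows // 2, n_rows)),
                (np.arange(n_rows // 2, n_rows), np.arange(0, n_rows // 2))]
cv_res_custom = lgb.cv(params_with_metric, lgb_train, num_boost_round=10,
                       folds=custom_folds, verbose_eval=False)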
# lambdarank
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train.query'))
params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3}
lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
# ... with l2 metric
cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
metrics='l2', verbose_eval=False)
self.assertEqual(len(cv_res_lambda), 2)
self.assertFalse(np.isnan(cv_res_lambda['l2-mean']).any())
# ... with NDCG (default) metric
cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
verbose_eval=False)
self.assertEqual(len(cv_res_lambda), 2)
self.assertFalse(np.isnan(cv_res_lambda['ndcg@3-mean']).any())
# self-defined folds with lambdarank
cv_res_lambda_obj = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10,
folds=GroupKFold(n_splits=3),
verbose_eval=False)
np.testing.assert_allclose(cv_res_lambda['ndcg@3-mean'], cv_res_lambda_obj['ndcg@3-mean'])