def test_plot_split_value_histogram(self):
    gbm0 = lgb.train(self.params, self.train_data, num_boost_round=10)
    ax0 = lgb.plot_split_value_histogram(gbm0, 27)
    self.assertIsInstance(ax0, matplotlib.axes.Axes)
    self.assertEqual(ax0.get_title(), 'Split value histogram for feature with index 27')
    self.assertEqual(ax0.get_xlabel(), 'Feature split value')
    self.assertEqual(ax0.get_ylabel(), 'Count')
    self.assertLessEqual(len(ax0.patches), 2)

    gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    gbm1.fit(self.X_train, self.y_train)
    # '@index/name@' and '@feature@' are placeholders that LightGBM substitutes
    # into the title ('name' here, because the feature is passed by name).
    ax1 = lgb.plot_split_value_histogram(gbm1, gbm1.booster_.feature_name()[27], figsize=(10, 5),
                                         title='Histogram for feature @index/name@ @feature@',
                                         xlabel='x', ylabel='y', color='r')
    self.assertIsInstance(ax1, matplotlib.axes.Axes)
    self.assertEqual(ax1.get_title(),
                     'Histogram for feature name {}'.format(gbm1.booster_.feature_name()[27]))
    self.assertEqual(ax1.get_xlabel(), 'x')
    self.assertEqual(ax1.get_ylabel(), 'y')
    self.assertLessEqual(len(ax1.patches), 2)
    for patch in ax1.patches:
        self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

    ax2 = lgb.plot_split_value_histogram(gbm0, 27, bins=10, color=['r', 'y', 'g', 'b'],
                                         title=None, xlabel=None, ylabel=None)
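For reference, a self-contained sketch of the plotting API exercised above (the synthetic dataset and the feature choice are assumptions, not part of the original test):

import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
booster = lgb.train({'objective': 'binary', 'verbose': -1}, lgb.Dataset(X, y), num_boost_round=10)
# Plot the histogram of split thresholds for the most frequently split feature,
# which is guaranteed to have been used in at least one split.
feature = int(np.argmax(booster.feature_importance(importance_type='split')))
ax = lgb.plot_split_value_histogram(booster, feature, bins=10)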
if binary:
    data["drv"] = data["drv"].replace("r", "4")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
    [(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter())]) for f in text_features]
)
pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("model", LGBMClassifier(n_estimators=1000))
])
# Column 3 of the mapper output is the label-encoded "class" column
# (columns 0-2 are the numeric features), so mark it as categorical.
pipeline.fit(data, data["drv"], model__categorical_feature=[3])

suffix = "binary" if binary else "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")
print(list(pipeline.predict(data[:10])))
print(list(pipeline.predict_proba(data[0:1])[0]))
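sklearn2pmml's PMMLPipeline can also embed verification records before export, so a PMML engine can cross-check its outputs against Python's. A sketch, assuming verify() is called after fit() and before sklearn2pmml(); the sample size and seed are arbitrary:

pipeline.verify(data.sample(n=10, random_state=13))  # embeds inputs plus expected predictions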
def _get_create_model(self, classification):
    if classification:
        model = LGBMClassifier()
    else:
        model = LGBMRegressor()

    def create_model(x, y):
        return model.fit(x, y)

    return create_model
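A hypothetical call site for the factory above (all names assumed):

create_model = self._get_create_model(classification=True)
fitted = create_model(X_train, y_train)  # fit() returns the estimator itself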
def create_lightgbm_classifier(X, y):
    lgbm = LGBMClassifier(boosting_type='gbdt', learning_rate=0.1,
                          max_depth=5, n_estimators=200, n_jobs=1, random_state=777)
    model = lgbm.fit(X, y)
    return model
def test_lightgbm__classifier(self):
    model = LGBMClassifier(n_estimators=3, min_child_samples=1)
    self._test_binary_classification_core(model)
    self._test_single_output_core(model)
def lgb_predict(training, label, predict):
    # feature_list = training.columns.tolist()
    # training = training.values
    # predict = predict.values
    print(".....")
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=41, reg_alpha=0.0, reg_lambda=1.8,
        max_depth=-1, n_estimators=2000, objective='binary',
        subsample=0.6, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.08, min_child_samples=50, random_state=42, n_jobs=-1
    )
    # X_train, X_val, y_train, y_val = train_test_split(training, label, test_size=0.1, random_state=42)
    # Shuffle, then hold out the last 5% of the rows as a validation split.
    shuffle_indices = np.random.permutation(np.arange(len(label)))
    training = training[shuffle_indices]
    label = label[shuffle_indices]
    train_num = int(0.95 * len(label))
    X_train, X_val = training[:train_num], training[train_num:]
    y_train, y_val = label[:train_num], label[train_num:]
    print("split done")
    del training
    del label
    gc.collect()
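    # --- Assumed continuation; the original snippet is truncated here. ---
    # Train with early stopping on the held-out split, then score the
    # unlabeled `predict` matrix (the signature suggests that is its purpose).
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)],
            eval_metric='auc', early_stopping_rounds=100)
    return clf.predict_proba(predict)[:, 1]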
def LGB_test(train_x, train_y, test_x, test_y):
    from multiprocessing import cpu_count
    print("LGB test")
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=1000, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=2018, n_jobs=cpu_count() - 1
    )
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc', early_stopping_rounds=100)
    print(clf.feature_importances_)
    return clf, clf.best_score_['valid_1']['auc']
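A hypothetical invocation, assuming the splits already exist:

model, val_auc = LGB_test(X_train, y_train, X_val, y_val)  # second element is the 'valid_1' AUC
print('validation AUC: %.4f' % val_auc)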
"race": "category",
"gender": "category",
"native-country": "category",
}
X = pd.read_csv(url, names=names, header=None, dtype=dtypes)
X["gender"] = (
X["gender"].str.strip().astype("category")
) # Remove leading whitespace
y = X.pop("salary").map({" <=50K": False, " >50K": True})
X_train, X_test, y_train, y_test = model_selection.train_test_split(
X, y, shuffle=True, random_state=42
)
model = lgb.LGBMClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
y_pred = pd.Series(y_pred, name=">$50k")
explainer = ethik.ClassificationExplainer()
return explainer, X_test, y_pred, y_test
def LGB_predict(train_x, train_y, test_x, res, index):
    print("LGB test")
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=1500, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=2018, n_jobs=-1
    )
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y)], eval_metric='auc',
            early_stopping_rounds=100)
    res['score' + str(index)] = clf.predict_proba(test_x)[:, 1]
    # Round the predicted probabilities to six decimal places.
    res['score' + str(index)] = res['score' + str(index)].apply(lambda x: float('%.6f' % x))
    print(str(index) + ' predict finish!')
    res = res.reset_index(drop=True)
    print(res.head())
    gc.collect()
    return res['score' + str(index)]
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': num_classes,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbose': -1,
    'metric': ['multi_logloss'],
    'learning_rate': 0.2,
    'max_depth': 5,
    'num_leaves': 10,
    'reg_lambda': 0.1,
    'num_trees': 500,
    'min_data_in_leaf': 100,
}
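A dict shaped like this targets LightGBM's native training API. A minimal consumption sketch; the params name and the data variables are assumptions:

import lightgbm as lgb

train_set = lgb.Dataset(X_train, label=y_train)
valid_set = lgb.Dataset(X_val, label=y_val, reference=train_set)
booster = lgb.train(params, train_set, valid_sets=[valid_set])  # 'num_trees' caps the boosting rounds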
clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.05, objective='multiclass',
                         random_state=314, silent=True, metric='None',
                         n_jobs=4, n_estimators=5000, class_weight='balanced')
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def evaluate_macroF1_lgb(truth, predictions):
    # This follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483:
    # multiclass raw predictions arrive flattened class-major, so reshape to
    # (n_classes, n_samples) before taking the argmax over classes.
    pred_labels = predictions.reshape(len(np.unique(truth)), -1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)
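A sketch of wiring the callable metric into the scikit-learn interface, e.g. for the clf defined above (the data names and callback-style early stopping are assumptions; newer LightGBM releases may pass multiclass predictions as a 2-D array, which would make the reshape above unnecessary):

import lightgbm as lgb

X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=314)
clf.fit(X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric=evaluate_macroF1_lgb,     # reported as 'macroF1', higher is better
        callbacks=[lgb.early_stopping(100)])  # stop when macroF1 stops improving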
import lightgbm as lgb
def learning_rate_power_0997(current_iter):