# Step 2: select features
data = extract_feature_and_label(data, feature_name_list=conf['feature_name'], label_name_list=conf['label_name'])
# Step 3: preprocess
train, test = divide_train_and_test(data, conf['training_set_proportion'])
train_x, train_y = data_transform_for_xgboost(train)
test_x, test_y = data_transform_for_xgboost(test)
# Collapse the targets to class labels in {-1, 0, 1}
train_y = sign(train_y)
test_y = sign(test_y)
# Keep only the samples labelled 1 or -1
indices = find_all_indices(train_y, 1)
indices.extend(find_all_indices(train_y, -1))
train_x = np.array(train_x)[indices]
train_y = np.array(train_y)[indices]
# DMatrix for the native training API; the sklearn wrapper below takes arrays directly
dtrain = xgb.DMatrix(train_x, train_y)
param = {
    'booster': 'gbtree',
    'verbosity': 0,  # 'silent' is deprecated in recent XGBoost releases
    'eta': 0.01,
    'max_depth': 5,
    'gamma': 0.1,
    'objective': 'multi:softmax',
    'num_class': 3,
    'seed': 1000,
    'scale_pos_weight': 1
}
clf = xgb.XGBClassifier(**param)
if conf['use_previous_model'] is False:
    clf.fit(train_x, train_y)
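else:
    # Minimal sketch of the missing branch, assuming the model was previously
    # persisted with clf.save_model(conf['model_path']); 'model_path' is a
    # hypothetical config key, not part of the original snippet.
    clf.load_model(conf['model_path'])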
def test():
    data = np.random.rand(5, 10)          # 5 entities, each with 10 features
    label = np.random.randint(2, size=5)  # binary target
    dtrain = xgb.DMatrix(data, label=label)
    # Held-out data for the watchlist; the original snippet referenced an
    # undefined `test` variable, so generate labelled data the same way
    test_data = np.random.rand(5, 10)
    test_label = np.random.randint(2, size=5)
    dtest = xgb.DMatrix(test_data, label=test_label)
    # 'silent' is deprecated, so it is dropped from the params
    param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 10
    bst = xgb.train(param, dtrain, num_round, evallist)
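    # Typical follow-up (not in the original snippet): score the held-out
    # DMatrix; for binary:logistic, predict() returns probabilities
    preds = bst.predict(dtest)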
# Synthetic regression data; X, y, and tr_size were undefined in the original
# fragment, so create them here to make the comparison self-contained
X, y = np.random.rand(1000, 20), np.random.rand(1000)
tr_size = 800
X_train, y_train = X[:tr_size, :], y[:tr_size]
X_test, y_test = X[tr_size:, :], y[tr_size:]
# First with cpu_predictor
params = {'tree_method': 'gpu_hist',
          'predictor': 'cpu_predictor',
          'n_jobs': -1,
          'seed': 123}
m = xgb.XGBRegressor(**params).fit(X_train, y_train)
cpu_train_score = m.score(X_train, y_train)
cpu_test_score = m.score(X_test, y_test)
# Now with gpu_predictor
params['predictor'] = 'gpu_predictor'
m = xgb.XGBRegressor(**params).fit(X_train, y_train)
gpu_train_score = m.score(X_train, y_train)
gpu_test_score = m.score(X_test, y_test)
assert np.allclose(cpu_train_score, gpu_train_score)
assert np.allclose(cpu_test_score, gpu_test_score)
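# Note: XGBoost 2.0 deprecated the 'predictor' parameter and the 'gpu_hist'
# tree method; a rough equivalent there (an assumption, not from the original
# snippet) would be
#   params = {'tree_method': 'hist', 'device': 'cuda', 'n_jobs': -1, 'seed': 123}
# with 'device' switched between 'cuda' and 'cpu' for the same comparison.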
def test_xgboost_direct():
    try:
        import xgboost
    except ImportError:
        print("Skipping test_xgboost_direct!")
        return
    import shap

    # Random regression problem: 100 samples, 4 features
    N = 100
    M = 4
    X = np.random.randn(N, M)
    y = np.random.randn(N)

    model = xgboost.XGBRegressor()
    model.fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    assert np.allclose(shap_values[0, :], _brute_force_tree_shap(explainer.model, X[0, :]))
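    # A complementary sanity check (not in the original test): SHAP values are
    # additive, so together with the expected value they should sum to the raw
    # model output for a regression model
    assert np.allclose(shap_values.sum(axis=1) + explainer.expected_value,
                       model.predict(X), atol=1e-4)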
def test_xgboost_classifier(output_margin):
    import xgboost as xgb
    df = pd.read_csv("./open_data/creditcard.csv")
    # All columns but the last are features; the last column is the label
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    gbm = xgb.sklearn.XGBClassifier()
    gbm.fit(X, y)
    gbm.predict(X, output_margin=output_margin)
    gbm.predict_proba(X, output_margin=output_margin)
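    # Sanity relation (not in the original test): for binary:logistic models,
    # the sigmoid of the margin equals the positive-class probability
    from scipy.special import expit
    margins = gbm.predict(X, output_margin=True)
    assert np.allclose(expit(margins), gbm.predict_proba(X)[:, 1], atol=1e-6)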
def train_step(  # hypothetical name; the original def line was truncated
    train,
    validation,
    custom_input1,
    custom_input2,
    model,
    custom_output1,
):
    with train as reader:
        train_df = reader.read(concat=True)
    # Split features from the label column; the original sliced rows with
    # train_df[:-1] and built DMatrix objects, but the sklearn-style
    # XGBModel.fit expects array-like inputs
    train_x, train_y = train_df.iloc[:, :-1], train_df.iloc[:, -1]
    with validation as reader:
        validation_df = reader.read(concat=True)
    validation_x, validation_y = validation_df.iloc[:, :-1], validation_df.iloc[:, -1]
    my_model = xgb.XGBModel(**static_hyperparameters)
    my_model.fit(train_x,
                 train_y,
                 eval_set=[(validation_x, validation_y)],
                 eval_metric=sample_eval_function)
    model.set(my_model)
    custom_output1.set(my_model.evals_result())
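    # evals_result() returns per-eval-set metric histories, e.g.
    # {'validation_0': {'<metric>': [...]}} -- handy for plotting learning curves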
def test_xgboost_ranking():
    try:
        import xgboost
    except ImportError:
        print("Skipping test_xgboost_ranking!")
        return
    import shap

    # Train an XGBoost ranker model
    x_train, y_train, x_test, y_test, q_train, q_test = shap.datasets.rank()
    params = {'objective': 'rank:pairwise', 'learning_rate': 0.1,
              'gamma': 1.0, 'min_child_weight': 0.1,
              'max_depth': 4, 'n_estimators': 4}
    model = xgboost.sklearn.XGBRanker(**params)
    model.fit(x_train, y_train, group=q_train.astype(int),
              eval_set=[(x_test, y_test)], eval_group=[q_test.astype(int)])
    _validate_shap_values(model, x_test)
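    # Quick usage sketch (not in the original test): XGBRanker.predict()
    # returns one relevance score per document, used for sorting within queries
    scores = model.predict(x_test)
    assert scores.shape[0] == x_test.shape[0]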
def test_xgboost_regression(output_margin):
    import xgboost as xgb
    df = pd.read_csv("./open_data/creditcard.csv")
    # All columns but the last are features; the last column is the target
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    gbm = xgb.sklearn.XGBRegressor()
    gbm.fit(X, y)
    gbm.predict(X, output_margin=output_margin)
def run_test(name, params_fun):
    """Runs a distributed GPU test."""
    # Always call this before using the distributed module
    xgb.rabit.init()
    rank = xgb.rabit.get_rank()
    world = xgb.rabit.get_world_size()

    # Load the files; they are sharded automatically in distributed mode
    dtrain = xgb.DMatrix('../../demo/data/agaricus.txt.train')
    dtest = xgb.DMatrix('../../demo/data/agaricus.txt.test')

    params, n_rounds = params_fun(rank)

    # Specify a validation set to watch performance
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]

    # Run training; all features of the training API are available. For fault
    # recovery, this script currently supports calling train only once.
    bst = xgb.train(params, dtrain, n_rounds, watchlist, early_stopping_rounds=2)

    # Have each worker save its model
    model_name = "test.model.%s.%d" % (name, rank)
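    # A plausible completion (not shown in the original fragment): persist the
    # booster under the per-worker name and shut down the tracker
    bst.save_model(model_name)
    xgb.rabit.finalize()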