(6) Using XGBoost Cross-Validation
import numpy as np
import xgboost as xgb
### load data and do training
dtrain = xgb.DMatrix(basePath+'data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
num_round = 2
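For reference, a DMatrix can also be built from in-memory arrays instead of a file path. A minimal sketch, assuming scikit-learn is installed and basePath points at the same data directory used above:
from sklearn.datasets import load_svmlight_file
X, y = load_svmlight_file(basePath + 'data/agaricus.txt.train')  # parse the libsvm-format file
dtrain = xgb.DMatrix(X, label=y)  # equivalent DMatrix built from a sparse matrix and a label array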
print('running cross validation')
running cross validation
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
# metrics: the evaluation metric(s) to compute, e.g. rmse, error
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed=0,
callbacks=[xgb.callback.print_evaluation(show_stdv=True)])
[0] train-error:0.0506682+0.009201 test-error:0.0557316+0.0158887
[1] train-error:0.0213034+0.00205561 test-error:0.0211884+0.00365323
   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.055732        0.015889          0.050668         0.009201
1         0.021188        0.003653          0.021303         0.002056
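Because xgb.cv returns a pandas DataFrame (the table above), the per-round statistics can also be inspected programmatically instead of only printed. A minimal sketch, assuming the same call as above is captured in a variable:
cv_res = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0)
best = cv_res['test-error-mean'].idxmin()  # index of the round with the lowest mean test error
print(best, cv_res.loc[best, 'test-error-mean'])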
print('running cross validation, disable standard deviation display')
running cross validation, disable standard deviation display
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value
# this time, run for up to num_boost_round=10 rounds:
res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
metrics={'error'}, seed=0,
callbacks=[xgb.callback.print_evaluation(show_stdv=False),
xgb.callback.early_stop(3)])  # stop if test-error hasn't improved in 3 rounds
[0] train-error:0.0506682 test-error:0.0557316
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.
Will train until test-error hasn't improved in 3 rounds.
[1] train-error:0.0213034 test-error:0.0211884
[2] train-error:0.0099418 test-error:0.0099786
[3] train-error:0.0141256 test-error:0.0144336
[4] train-error:0.0059878 test-error:0.0062948
[5] train-error:0.0020344 test-error:0.0016886
[6] train-error:0.0012284 test-error:0.001228
[7] train-error:0.0012284 test-error:0.001228
[8] train-error:0.0009212 test-error:0.001228
[9] train-error:0.0006142 test-error:0.001228
Stopping. Best iteration:
[6] train-error:0.0012284+0.000260265 test-error:0.001228+0.00104094
print(res)
   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.055732        0.015889          0.050668         0.009201
1         0.021188        0.003653          0.021303         0.002056
2         0.009979        0.004828          0.009942         0.006076
3         0.014434        0.003517          0.014126         0.001706
4         0.006295        0.003123          0.005988         0.001878
5         0.001689        0.000574          0.002034         0.001470
6         0.001228        0.001041          0.001228         0.000260
print('running cross validation, with preprocessing function')
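Because the early_stop callback truncates the returned DataFrame at the best iteration (rows 0-6 above), its length gives a reasonable round count for a final model. A minimal sketch, assuming res from the call above:
best_num_round = len(res)  # 7 rounds here, matching the reported best iteration [6]
final_model = xgb.train(param, dtrain, num_boost_round=best_num_round)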
running cross validation, with preprocessing function
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as an example, we try to set scale_pos_weight
# for each fold, the function receives (dtrain, dtest, param) and returns the modified triple
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio  # rebalance classes: sum(negatives) / sum(positives)
    return (dtrain, dtest, param)
# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'auc'}, seed=0, fpreproc=fpreproc)  # auc: area under the ROC curve
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.958232      0.005778        0.958228       0.001442
1       0.981431      0.002595        0.981414       0.000647
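Outside of cross-validation, the same rebalancing is typically applied once to the full training set before a plain xgb.train call. A minimal sketch reusing the ratio computation from fpreproc above:
label = dtrain.get_label()
param['scale_pos_weight'] = float(np.sum(label == 0)) / np.sum(label == 1)  # sum(negatives) / sum(positives)
bst = xgb.train(param, dtrain, num_round)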
###
# you can also do cross validation with customized loss function
# See custom_objective.py
##
print('running cross validation, with customized loss function')
running cross validation, with customized loss function
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # map raw margin scores to probabilities with the sigmoid
    grad = preds - labels                 # first-order gradient of the logistic loss
    hess = preds * (1.0 - preds)          # second-order derivative (hessian)
    return grad, hess
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # preds here are raw margin scores, so classify at the 0.0 threshold
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
param = {'max_depth':2, 'eta':1, 'silent':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
obj=logregobj, feval=evalerror)
   test-error-mean  test-error-std  test-rmse-mean  test-rmse-std  train-error-mean  train-error-std  train-rmse-mean  train-rmse-std
0         0.055732        0.015889        1.598043       0.012826          0.050668         0.009201         1.595072        0.003868
1         0.021188        0.003653        2.449282       0.080900          0.021303         0.002056         2.442600        0.076834
# note: the rmse columns appear because param no longer sets an objective,
# so xgboost's default rmse metric is reported alongside the custom error metric
# rmse: root mean squared error
# mae: mean absolute error
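The same objective/metric pair also works outside of cross-validation, as in the custom_objective.py demo referenced above. A minimal sketch, assuming the companion agaricus.txt.test file sits next to the training file:
dtest = xgb.DMatrix(basePath + 'data/agaricus.txt.test')
watchlist = [(dtest, 'eval'), (dtrain, 'train')]  # report metrics on both sets each round
bst = xgb.train(param, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)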