その8 ボストンの住宅価格をXGBoostで予測してみた(Grid Searchでパラメータチューニング有)
12月に入って初めての投稿です。hinomarucです。 今回はXGBoostのパラメータチューニングをGrid Searchで行いました。
事前に試したいパラメータを定義しておき、一番精度のよい組み合わせを発見する方法です。
最適なパラメータを見つける方法はGrid Searchの他に下記のような探索方法もあるようです。 ・Random Search ・Bayesian Search
今回はデータ量も少ないので、すべての組み合わせを試すGrid Searchで最適なパラメータを見つけることにしました。
まずは定番な手法からやって徐々に高度化していくのが技術力向上への近道です。
今回にてBoston housingのデータを活用し、 一通りのアルゴリズムを試したので、 次回からは他のデータセットを見ていく予定です。
Checking Boston housing prices corrected dataset 8
パッケージのインポート
import packages
import seaborn as sns import statsmodels.api as sm import pandas as pd
データの読み込み
reading data
df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston_corrected.txt",skiprows=9,sep="\t")
※ 22年2月1日更新: UnicodeDecodeErrorが発生する方はencoding='Windows-1252'を追加してください。 詳細は下記新ブログをご参照ください。
list(df.columns)
Out[0]
['OBS.', 'TOWN', 'TOWN#', 'TRACT', 'LON', 'LAT', 'MEDV', 'CMEDV', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
モデリング
Xgboostをgrid searchを利用して最適なパラメータを発掘しモデリング
サンプリング (訓練データ、テストデータ分割)
# 訓練データとテストデータに分割する。 from sklearn.model_selection import train_test_split train, test = train_test_split(df, test_size=0.20,random_state=100)
訓練データ、テストデータ作成
FEATURE_COLS=[ #'OBS.', #'TOWN', #'TOWN#', #'TRACT', #'LON', #'LAT', #'MEDV', #'CMEDV', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] X_train = train[FEATURE_COLS] Y_train = train["CMEDV"] X_test = test[FEATURE_COLS] Y_test = test["CMEDV"]
モデル作成
import numpy as np import xgboost as xgb from sklearn.model_selection import GridSearchCV from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold import sklearn;
sorted(sklearn.metrics.SCORERS.keys())
Out[0]
['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'max_error', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
# Grid Search用のパラメータ作成。 # あまり組み合わせが多いと時間がかかる(time consuming) params = { 'eta': [0.01], # default = 0.3 'gamma': [1,2,3], # default = 0 'max_depth': [7,8,9], # default = 6 'min_child_weight': [1], # default = 1 'subsample': [0.8,1.0], # default = 1 'colsample_bytree': [0.8,1.0], # default = 1 } kf = KFold(n_splits=5, shuffle = True, random_state = 1)
model = xgb.XGBRegressor(objective ='reg:squarederror') grid = GridSearchCV(estimator=model, param_grid=params, scoring='neg_mean_squared_error', n_jobs=2, cv=kf.split(X_train,Y_train), verbose=3)
grid.fit(X_train,Y_train)
Out[0]
Fitting 5 folds for each of 36 candidates, totalling 180 fits [Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers. [Parallel(n_jobs=2)]: Done 52 tasks | elapsed: 4.1s [Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed: 13.7s finished GridSearchCV(cv=, error_score='raise-deprecating', estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, importance_type='gain', learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthre... random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=None, subsample=1, verbosity=1), iid='warn', n_jobs=2, param_grid={'colsample_bytree': [0.8, 1.0], 'eta': [0.01], 'gamma': [1, 2, 3], 'max_depth': [7, 8, 9], 'min_child_weight': [1], 'subsample': [0.8, 1.0]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring='neg_mean_squared_error', verbose=3)
print('ベストスコア:',grid.best_score_, sep="\n") print('\n') print('ベストestimator:',grid.best_estimator_,sep="\n") print('\n') print('ベストparams:',grid.best_params_,sep="\n")
Out[0]
ベストスコア: -9.530778242337254 ベストestimator: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0, eta=0.01, gamma=1, importance_type='gain', learning_rate=0.1, max_delta_step=0, max_depth=8, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=None, subsample=0.8, verbosity=1) ベストparams: {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma': 1, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.8}
pd.DataFrame(grid.cv_results_)
Out[0]
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_colsample_bytree | param_eta | param_gamma | param_max_depth | param_min_child_weight | param_subsample | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.148316 | 0.017969 | 0.005488 | 0.003220 | 0.8 | 0.01 | 1 | 7 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.076630 | -14.470374 | -14.204951 | -7.658387 | -5.533450 | -9.999786 | 3.652858 | 18 |
1 | 0.137329 | 0.018536 | 0.002986 | 0.000138 | 0.8 | 0.01 | 1 | 7 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.458042 | -14.724980 | -11.222454 | -9.884966 | -6.001298 | -10.068390 | 2.901155 | 20 |
2 | 0.148533 | 0.020026 | 0.005267 | 0.004438 | 0.8 | 0.01 | 1 | 8 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -7.976184 | -15.908916 | -14.020251 | -7.590577 | -6.088544 | -10.327361 | 3.892094 | 28 |
3 | 0.170525 | 0.023704 | 0.003612 | 0.000521 | 0.8 | 0.01 | 1 | 8 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.700472 | -14.353754 | -10.628294 | -10.099057 | -6.085330 | -9.983005 | 2.693402 | 16 |
4 | 0.256356 | 0.099531 | 0.003434 | 0.000663 | 0.8 | 0.01 | 1 | 9 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -7.466963 | -16.184669 | -14.151406 | -7.588601 | -5.805110 | -10.250326 | 4.122926 | 23 |
5 | 0.154640 | 0.018473 | 0.004230 | 0.002807 | 0.8 | 0.01 | 1 | 9 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.592687 | -15.144381 | -10.524692 | -9.715390 | -6.090526 | -10.023246 | 2.966572 | 19 |
6 | 0.126797 | 0.021668 | 0.003181 | 0.000809 | 0.8 | 0.01 | 2 | 7 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -7.251086 | -15.427656 | -14.231703 | -7.982824 | -5.617347 | -10.113224 | 3.952004 | 21 |
7 | 0.115606 | 0.003012 | 0.002741 | 0.000028 | 0.8 | 0.01 | 2 | 7 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.905719 | -13.969786 | -10.536954 | -10.434475 | -5.592526 | -9.898524 | 2.707726 | 10 |
8 | 0.120392 | 0.002409 | 0.002760 | 0.000027 | 0.8 | 0.01 | 2 | 8 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -7.830935 | -15.098665 | -13.385085 | -7.751377 | -5.576456 | -9.939276 | 3.652089 | 13 |
9 | 0.124836 | 0.003660 | 0.002725 | 0.000068 | 0.8 | 0.01 | 2 | 8 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.518925 | -14.474478 | -10.967876 | -9.879880 | -5.929837 | -9.964160 | 2.814920 | 15 |
10 | 0.138423 | 0.016395 | 0.004200 | 0.002745 | 0.8 | 0.01 | 2 | 9 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -7.863751 | -15.339218 | -13.011845 | -7.456237 | -5.971402 | -9.938286 | 3.599526 | 12 |
11 | 0.140609 | 0.006163 | 0.004117 | 0.002776 | 0.8 | 0.01 | 2 | 9 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.431798 | -14.521303 | -10.523572 | -9.698426 | -6.102612 | -9.864832 | 2.766658 | 9 |
12 | 0.136987 | 0.019530 | 0.004518 | 0.002256 | 0.8 | 0.01 | 3 | 7 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.477851 | -15.477659 | -14.236324 | -7.876438 | -5.801074 | -10.385188 | 3.784912 | 31 |
13 | 0.114759 | 0.002845 | 0.002859 | 0.000327 | 0.8 | 0.01 | 3 | 7 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -9.715860 | -14.904595 | -11.108844 | -9.834932 | -5.802163 | -10.284346 | 2.915425 | 25 |
14 | 0.125141 | 0.002459 | 0.002807 | 0.000104 | 0.8 | 0.01 | 3 | 8 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -7.778931 | -15.650952 | -13.272735 | -7.911699 | -6.233541 | -10.179314 | 3.631852 | 22 |
15 | 0.124319 | 0.003002 | 0.002665 | 0.000014 | 0.8 | 0.01 | 3 | 8 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -9.749970 | -14.799285 | -10.943012 | -10.627325 | -6.091604 | -10.453008 | 2.776222 | 32 |
16 | 0.134001 | 0.003087 | 0.002731 | 0.000015 | 0.8 | 0.01 | 3 | 9 | 1 | 0.8 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -8.234727 | -15.679617 | -13.935103 | -7.690596 | -5.961844 | -10.311116 | 3.794549 | 27 |
17 | 0.143320 | 0.008923 | 0.002685 | 0.000053 | 0.8 | 0.01 | 3 | 9 | 1 | 1 | {'colsample_bytree': 0.8, 'eta': 0.01, 'gamma'... | -9.216786 | -15.147090 | -10.915934 | -10.313346 | -6.190086 | -10.366962 | 2.891744 | 30 |
18 | 0.127510 | 0.002232 | 0.002838 | 0.000018 | 1 | 0.01 | 1 | 7 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -8.791845 | -13.182757 | -13.814405 | -6.023290 | -6.317940 | -9.634236 | 3.310944 | 2 |
19 | 0.140026 | 0.006731 | 0.002717 | 0.000024 | 1 | 0.01 | 1 | 7 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -10.376528 | -13.634383 | -11.121636 | -7.712917 | -6.680718 | -9.913218 | 2.480296 | 11 |
20 | 0.145129 | 0.005851 | 0.002972 | 0.000266 | 1 | 0.01 | 1 | 8 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -9.020084 | -12.558632 | -14.145986 | -6.070754 | -5.812531 | -9.530778 | 3.361392 | 1 |
21 | 0.147489 | 0.001379 | 0.002723 | 0.000068 | 1 | 0.01 | 1 | 8 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -10.512789 | -14.492950 | -9.860562 | -7.648759 | -6.193250 | -9.750445 | 2.831896 | 7 |
22 | 0.157881 | 0.003713 | 0.002874 | 0.000023 | 1 | 0.01 | 1 | 9 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -9.980800 | -13.817187 | -14.439687 | -5.746679 | -5.665099 | -9.940447 | 3.770678 | 14 |
23 | 0.161794 | 0.003368 | 0.002732 | 0.000039 | 1 | 0.01 | 1 | 9 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -11.125359 | -15.305603 | -9.954281 | -8.072668 | -7.170193 | -10.333431 | 2.849112 | 29 |
24 | 0.128149 | 0.001751 | 0.002745 | 0.000026 | 1 | 0.01 | 2 | 7 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -8.593110 | -13.942909 | -14.714939 | -5.746816 | -5.523899 | -9.714682 | 3.934955 | 4 |
25 | 0.135409 | 0.003699 | 0.002627 | 0.000037 | 1 | 0.01 | 2 | 7 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -10.453405 | -15.911814 | -11.522422 | -8.073868 | -7.530489 | -10.706241 | 2.994296 | 35 |
26 | 0.142641 | 0.003469 | 0.002767 | 0.000023 | 1 | 0.01 | 2 | 8 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -8.612592 | -13.288846 | -14.711817 | -5.893449 | -5.870810 | -9.684921 | 3.696475 | 3 |
27 | 0.151855 | 0.004353 | 0.002685 | 0.000044 | 1 | 0.01 | 2 | 8 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -11.044064 | -15.661659 | -11.737650 | -7.638802 | -7.204178 | -10.665818 | 3.077828 | 34 |
28 | 0.158111 | 0.005961 | 0.002775 | 0.000028 | 1 | 0.01 | 2 | 9 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -8.796288 | -14.036172 | -14.441856 | -5.854810 | -5.824649 | -9.800572 | 3.790769 | 8 |
29 | 0.163354 | 0.002897 | 0.002638 | 0.000009 | 1 | 0.01 | 2 | 9 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -11.300870 | -15.772229 | -12.538039 | -7.744553 | -7.266909 | -10.933574 | 3.152548 | 36 |
30 | 0.131469 | 0.004056 | 0.002720 | 0.000041 | 1 | 0.01 | 3 | 7 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -9.222455 | -13.179842 | -14.321178 | -6.399286 | -5.524574 | -9.739875 | 3.519981 | 5 |
31 | 0.135513 | 0.003887 | 0.002967 | 0.000712 | 1 | 0.01 | 3 | 7 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -9.747451 | -16.628352 | -10.104485 | -7.958219 | -7.046601 | -10.305067 | 3.360617 | 26 |
32 | 0.146825 | 0.005332 | 0.002743 | 0.000036 | 1 | 0.01 | 3 | 8 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -8.770543 | -13.577960 | -14.226023 | -6.410230 | -5.706538 | -9.748238 | 3.552605 | 6 |
33 | 0.148833 | 0.002246 | 0.002593 | 0.000032 | 1 | 0.01 | 3 | 8 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -10.949157 | -15.691583 | -10.437168 | -7.647688 | -6.576437 | -10.269525 | 3.173566 | 24 |
34 | 0.156493 | 0.004974 | 0.002990 | 0.000460 | 1 | 0.01 | 3 | 9 | 1 | 0.8 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -9.347635 | -13.852955 | -14.797973 | -6.217224 | -5.728991 | -9.999500 | 3.762762 | 17 |
35 | 0.165337 | 0.006189 | 0.002599 | 0.000030 | 1 | 0.01 | 3 | 9 | 1 | 1 | {'colsample_bytree': 1.0, 'eta': 0.01, 'gamma'... | -10.565500 | -16.010136 | -11.188954 | -7.585731 | -6.920068 | -10.462825 | 3.228405 | 33 |
# Grid Searchで一番精度が良かったモデル
bestmodel = grid.best_estimator_
精度確認
# 自由度調整済みr2を算出 def adjusted_r2(X,Y,model): from sklearn.metrics import r2_score import numpy as np r_squared = r2_score(Y, model.predict(X)) adjusted_r2 = 1 - (1-r_squared)*(len(Y)-1)/(len(Y)-X.shape[1]-1) #yhat = model.predict(X) \ #SS_Residual = sum((Y-yhat)**2) \ #SS_Total = sum((Y-np.mean(Y))**2) #r_squared = 1 - (float(SS_Residual))/ SS_Total return adjusted_r2 # 予測モデルの精度確認の各種指標を算出 def get_model_evaluations(X_train,Y_train,X_test,Y_test,model): from sklearn.metrics import explained_variance_score from sklearn.metrics import mean_absolute_error from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_log_error from sklearn.metrics import median_absolute_error # 評価指標確認 # 参考: https://funatsu-lab.github.io/open-course-ware/basic-theory/accuracy-index/ yhat_test = model.predict(X_test) return "adjusted_r2(train) :" + str(adjusted_r2(X_train,Y_train,model)) \ , "adjusted_r2(test) :" + str(adjusted_r2(X_test,Y_test,model)) \ , "平均誤差率(test) :" + str(np.mean(abs(Y_test / yhat_test - 1))) \ , "MAE(test) :" + str(mean_absolute_error(Y_test, yhat_test)) \ , "MedianAE(test) :" + str(median_absolute_error(Y_test, yhat_test)) \ , "RMSE(test) :" + str(np.sqrt(mean_squared_error(Y_test, yhat_test))) \ , "RMSE(test) / MAE(test) :" + str(np.sqrt(mean_squared_error(Y_test, yhat_test)) / mean_absolute_error(Y_test, yhat_test)) #better if result = 1.253
get_model_evaluations(X_train,Y_train,X_test,Y_test,bestmodel)
Out[0]
('adjusted_r2(train) :0.9979849458983184', 'adjusted_r2(test) :0.9007293205261151', '平均誤差率(test) :0.10292127436810407', 'MAE(test) :2.0740456263224285', 'MedianAE(test) :1.506885433197021', 'RMSE(test) :2.8888856581918403', 'RMSE(test) / MAE(test) :1.3928746897020952')
デフォルト値より精度があがった。 平均誤差率が約10%近くになった。(前回約11%) この1ポイント差は大きい。
# 描画設定 from matplotlib import rcParams rcParams['xtick.labelsize'] = 12 # x軸のラベルのフォントサイズ rcParams['ytick.labelsize'] = 12 # y軸のラベルのフォントサイズ rcParams['figure.figsize'] = 18,8 # 画像サイズの変更(inch) import matplotlib.pyplot as plt from matplotlib import ticker sns.set_style("whitegrid") # seabornのスタイルセットの一つ sns.set_color_codes() # デフォルトカラー設定 (deepになってる) plt.figure() ax = sns.regplot(x=Y_test, y=bestmodel.predict(X_test), fit_reg=False,color='#4F81BD') ax.set_xlabel(u"CMEDV") ax.set_ylabel(u"(Predicted) CMEDV") ax.get_xaxis().set_major_formatter(ticker.FuncFormatter(lambda x, p: format(int(x), ','))) ax.get_yaxis().set_major_formatter(ticker.FuncFormatter(lambda y, p: format(int(y), ','))) ax.plot([0,10,20,30,40,50],[0,10,20,30,40,50], linewidth=2, color="#C0504D",ls="--")
xgb.plot_importance(bestmodel)
xgb.to_graphviz(bestmodel, num_trees=5)