その9 ボストンの住宅価格をアンサンブル学習(Voting regressor)で予測してみた
どうもhinomarucです。複数のモデルを組み合わせて精度がよいアウトプットを作ることは、kaggleなどのコンペをみて、やってみたいと思っていました。
アンサンブル学習の一つである、voting regressorで住宅価格を予測してみました。
A voting regressor is an ensemble meta-estimator that fits base regressors each on the whole dataset. It, then, averages the individual predictions to form a final prediction. 参考: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html
sklearnのvoting regressorだと説明変数が全てのアルゴリズムで、 共通になってしまうので、各アルゴリズムごとに説明変数 の組み合わせを変更してモデル作成したい場合は自分で各モデルから算出した予測値の平均を取る必要があります。
- Checking Boston housing prices corrected dataset 9
- 他の組み合わせを試してみる
- VotingRegressor()を使わずに、各モデルを作成し予測数値の平均を取る
- まとめ
Checking Boston housing prices corrected dataset 9
パッケージのインポート
import packages
import seaborn as sns import statsmodels.api as sm import pandas as pd
データの読み込み
reading data
# Load the corrected Boston housing dataset (tab-separated; the first 9 lines
# are a free-text preamble and are skipped).
# encoding="Windows-1252" prevents the UnicodeDecodeError noted below: the
# remote file contains non-UTF-8 bytes.
df = pd.read_csv(
    "http://lib.stat.cmu.edu/datasets/boston_corrected.txt",
    skiprows=9,
    sep="\t",
    encoding="Windows-1252",
)
※ 22年2月1日更新: pandasのread_csvでUnicodeDecodeErrorが発生する方はencoding='Windows-1252'を追加してください。 詳細は下記新ブログをご参照ください。
# Display every column name of the loaded DataFrame.
df.columns.tolist()
['OBS.', 'TOWN', 'TOWN#', 'TRACT', 'LON', 'LAT', 'MEDV', 'CMEDV', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
モデリング
Voting regressorでやってみる
サンプリング (訓練データ、テストデータ分割)
# Hold out 20% of the rows as a test set; fix the seed so the split is reproducible.
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.20, random_state=100)
訓練データ、テストデータ作成
# Explanatory variables: all census attributes, excluding row identifiers,
# coordinates, and the target columns (MEDV / CMEDV).
FEATURE_COLS = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

# Target is CMEDV, the corrected median home value.
X_train = train[FEATURE_COLS]
Y_train = train["CMEDV"]
X_test = test[FEATURE_COLS]
Y_test = test["CMEDV"]
モデル作成
import numpy as np import xgboost as xgb from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression from sklearn.ensemble import VotingRegressor
# Ensemble of GradientBoosting, RandomForest and multiple regression,
# averaged by VotingRegressor (same setup as the scikit-learn example).
# NOTE(review): the label 'xgb' actually wraps a GradientBoostingRegressor,
# not XGBoost — kept as-is to preserve the fitted estimator keys.
reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10)
reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
# NOTE(review): normalize= was deprecated/removed in newer scikit-learn;
# kept because this transcript ran on an older version.
reg3 = LinearRegression(normalize=True)
ereg = VotingRegressor(estimators=[('xgb', reg1), ('rf', reg2), ('lr', reg3)])
ereg = ereg.fit(X_train, Y_train)
# Inspect the (name, estimator) pairs that make up the ensemble.
ereg.estimators
[('xgb', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_iter_no_change=None, presort='auto', random_state=1, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)), ('rf', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1, verbose=0, warm_start=False)), ('lr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True))]
精度確認
def adjusted_r2(X, Y, model):
    """Degrees-of-freedom adjusted R^2 of ``model`` on (X, Y).

    adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the
    number of samples and p = X.shape[1] the number of explanatory variables.
    """
    yhat = np.asarray(model.predict(X))
    y = np.asarray(Y)
    # Plain R^2 = 1 - SS_residual / SS_total (the same value r2_score returns).
    ss_residual = np.sum((y - yhat) ** 2)
    ss_total = np.sum((y - np.mean(y)) ** 2)
    r_squared = 1 - ss_residual / ss_total
    return 1 - (1 - r_squared) * (len(y) - 1) / (len(y) - X.shape[1] - 1)


def get_model_evaluations(X_train, Y_train, X_test, Y_test, model):
    """Return a tuple of formatted evaluation metrics for ``model``.

    Adjusted R^2 is reported for both splits; the remaining metrics
    (mean error rate, MAE, MedianAE, RMSE, RMSE/MAE) use the test set only.
    参考: https://funatsu-lab.github.io/open-course-ware/basic-theory/accuracy-index/
    """
    yhat_test = np.asarray(model.predict(X_test))
    abs_err = np.abs(np.asarray(Y_test) - yhat_test)
    mae = np.mean(abs_err)
    medae = np.median(abs_err)
    rmse = np.sqrt(np.mean(abs_err ** 2))
    return (
        "adjusted_r2(train) :" + str(adjusted_r2(X_train, Y_train, model)),
        "adjusted_r2(test) :" + str(adjusted_r2(X_test, Y_test, model)),
        # NOTE(review): the error rate is measured relative to the prediction
        # (Y/yhat - 1), not the ground truth — kept as in the original.
        "平均誤差率(test) :" + str(np.mean(abs(Y_test / yhat_test - 1))),
        "MAE(test) :" + str(mae),
        "MedianAE(test) :" + str(medae),
        "RMSE(test) :" + str(rmse),
        # RMSE/MAE is ~1.253 when the errors are normally distributed.
        "RMSE(test) / MAE(test) :" + str(rmse / mae),
    )
# Evaluate the fitted VotingRegressor: adjusted R^2 on both splits plus
# test-set error metrics.
get_model_evaluations(X_train,Y_train,X_test,Y_test,ereg)
('adjusted_r2(train) :0.8864926311777143', 'adjusted_r2(test) :0.8177146097869532', '平均誤差率(test) :0.1304160387830404', 'MAE(test) :2.7898130860175834', 'MedianAE(test) :1.9059579212992723', 'RMSE(test) :3.91467567052991', 'RMSE(test) / MAE(test) :1.4032035659127442')
コメント
# Plot settings.
from matplotlib import rcParams
rcParams['xtick.labelsize'] = 12   # font size of x-axis tick labels
rcParams['ytick.labelsize'] = 12   # font size of y-axis tick labels
rcParams['figure.figsize'] = 18, 8  # figure size in inches

import matplotlib.pyplot as plt
from matplotlib import ticker

sns.set_style("whitegrid")  # one of seaborn's style presets
sns.set_color_codes()       # default color codes (currently "deep")

plt.figure()
# Scatter of actual vs. predicted CMEDV on the test set (no fitted line).
ax = sns.regplot(x=Y_test, y=ereg.predict(X_test), fit_reg=False, color='#4F81BD')
ax.set_xlabel(u"CMEDV")
ax.set_ylabel(u"(Predicted) CMEDV")
# Thousands separators on both axes.
ax.get_xaxis().set_major_formatter(ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
ax.get_yaxis().set_major_formatter(ticker.FuncFormatter(lambda y, p: format(int(y), ',')))
# Reference diagonal: points on this dashed line are perfect predictions.
ax.plot([0, 10, 20, 30, 40, 50], [0, 10, 20, 30, 40, 50], linewidth=2, color="#C0504D", ls="--")
他の組み合わせを試してみる
# Second attempt: XGBoost + random forest + multiple regression.
# Every XGBRegressor argument below is the library default, written out explicitly.
reg1 = xgb.XGBRegressor(
    objective='reg:squarederror',
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='gain',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)
reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
reg3 = LinearRegression(normalize=True)
ereg = VotingRegressor(estimators=[('xgb', reg1), ('rf', reg2), ('lr', reg3)])
ereg = ereg.fit(X_train, Y_train)
# Evaluate the refit ensemble on both splits.
get_model_evaluations(X_train, Y_train, X_test, Y_test, ereg)
('adjusted_r2(train) :0.941411489447292', 'adjusted_r2(test) :0.8725883749878736', '平均誤差率(test) :0.10861120144413837', 'MAE(test) :2.2296969721803177', 'MedianAE(test) :1.520569355831178', 'RMSE(test) :3.27283705041899', 'RMSE(test) / MAE(test) :1.467839393089651')
XgBoost単体の方が精度がよかったように思える。
# List the (name, estimator) pairs of the refit ensemble; the 'xgb' entry
# really is an XGBRegressor this time.
ereg.estimators
[('xgb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, importance_type='gain', learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=None, subsample=1, verbosity=1)), ('rf', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=1, verbose=0, warm_start=False)), ('lr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True))]
VotingRegressor()を使わずに、各モデルを作成し予測数値の平均を取る
# Feature sets per model: XGBoost gets every feature, while the multiple and
# polynomial regressions use a hand-picked four-column subset.
X_train = train[FEATURE_COLS]
X_test = test[FEATURE_COLS]

# Subset: rooms, % lower-status population, pupil-teacher ratio, race score.
X_train2 = train[["RM", "LSTAT", "PTRATIO", "B"]]
X_test2 = test[["RM", "LSTAT", "PTRATIO", "B"]]

# Target: corrected median home value.
Y_train = train["CMEDV"]
Y_test = test["CMEDV"]

# Expand the subset into degree-2 polynomial features for the polynomial model.
# NOTE(review): the original calls fit_transform on the test split too; for
# PolynomialFeatures the output is identical, but transform() is conventional.
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train2)
X_test_poly = poly_features.fit_transform(X_test2)
各モデルの作成
# Multiple linear regression on the four-feature subset.
from sklearn.linear_model import LinearRegression

multiple_regression = LinearRegression()
multiple_regression.fit(X_train2, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Polynomial regression: a plain LinearRegression fit on the degree-2
# expanded feature matrix.
from sklearn.linear_model import LinearRegression

polynomial_regression = LinearRegression()
polynomial_regression.fit(X_train_poly, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Standalone XGBoost model on the full feature set.
# All arguments are the library defaults, written out explicitly.
import xgboost as xgb

xgboost = xgb.XGBRegressor(
    objective='reg:squarederror',
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    importance_type='gain',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)
xgboost.fit(X_train, Y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, importance_type='gain', learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=None, subsample=1, verbosity=1)
各モデルの予測値の平均を取る (テストデータ)
# Predict the test set with each model, then average the three predictions
# to get the hand-rolled ensemble output.
yhat_test_1 = multiple_regression.predict(X_test2)        # multiple regression
yhat_test_2 = polynomial_regression.predict(X_test_poly)  # polynomial regression
yhat_test_3 = xgboost.predict(X_test)                     # XGBoost
yhat_test_average = (yhat_test_1 + yhat_test_2 + yhat_test_3) / 3
各モデルの予測値の平均を取る (訓練データ)
# Same averaging for the training split (used for the train-side adjusted R^2).
yhat_train_1 = multiple_regression.predict(X_train2)        # multiple regression
yhat_train_2 = polynomial_regression.predict(X_train_poly)  # polynomial regression
yhat_train_3 = xgboost.predict(X_train)                     # XGBoost
yhat_train_average = (yhat_train_1 + yhat_train_2 + yhat_train_3) / 3
精度確認
def adjusted_r2(X, Y, Yhat):
    """Degrees-of-freedom adjusted R^2 of predictions ``Yhat`` against ``Y``.

    adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the
    number of samples and p = X.shape[1] the number of explanatory variables.
    Unlike the earlier model-based variant, this one takes precomputed
    predictions so it works for the hand-averaged ensemble.
    """
    y = np.asarray(Y)
    yhat = np.asarray(Yhat)
    # Plain R^2 = 1 - SS_residual / SS_total (the same value r2_score returns).
    ss_residual = np.sum((y - yhat) ** 2)
    ss_total = np.sum((y - np.mean(y)) ** 2)
    r_squared = 1 - ss_residual / ss_total
    return 1 - (1 - r_squared) * (len(y) - 1) / (len(y) - X.shape[1] - 1)


def get_model_evaluations(X_train, Y_train, X_test, Y_test, Yhat_train, Yhat_test):
    """Return a tuple of formatted evaluation metrics for precomputed predictions.

    Adjusted R^2 is reported for both splits; the remaining metrics
    (mean error rate, MAE, MedianAE, RMSE, RMSE/MAE) use the test set only.
    参考: https://funatsu-lab.github.io/open-course-ware/basic-theory/accuracy-index/
    """
    abs_err = np.abs(np.asarray(Y_test) - np.asarray(Yhat_test))
    mae = np.mean(abs_err)
    medae = np.median(abs_err)
    rmse = np.sqrt(np.mean(abs_err ** 2))
    return (
        "adjusted_r2(train) :" + str(adjusted_r2(X_train, Y_train, Yhat_train)),
        "adjusted_r2(test) :" + str(adjusted_r2(X_test, Y_test, Yhat_test)),
        # NOTE(review): the error rate is measured relative to the prediction
        # (Y/yhat - 1), not the ground truth — kept as in the original.
        "平均誤差率(test) :" + str(np.mean(abs(Y_test / Yhat_test - 1))),
        "MAE(test) :" + str(mae),
        "MedianAE(test) :" + str(medae),
        "RMSE(test) :" + str(rmse),
        # RMSE/MAE is ~1.253 when the errors are normally distributed.
        "RMSE(test) / MAE(test) :" + str(rmse / mae),
    )
# Evaluate the hand-rolled ensemble (mean of the three models' predictions).
get_model_evaluations(X_train,Y_train,X_test,Y_test,yhat_train_average,yhat_test_average)
('adjusted_r2(train) :0.8711757920412861', 'adjusted_r2(test) :0.841610495090336', '平均誤差率(test) :0.11883711395386347', 'MAE(test) :2.440346734822397', 'MedianAE(test) :1.8652607788449629', 'RMSE(test) :3.649077240737884', 'RMSE(test) / MAE(test) :1.4953109689977953')
重回帰と多項式回帰のみでモデリング
# Drop XGBoost: average only the multiple and polynomial regressions,
# then evaluate the two-model ensemble.
yhat_test_average = (yhat_test_1 + yhat_test_2) / 2
yhat_train_average = (yhat_train_1 + yhat_train_2) / 2
get_model_evaluations(X_train, Y_train, X_test, Y_test, yhat_train_average, yhat_test_average)
('adjusted_r2(train) :0.758751320817442', 'adjusted_r2(test) :0.769456566440293', '平均誤差率(test) :0.1377369999749322', 'MAE(test) :2.8939164700575666', 'MedianAE(test) :1.9465828734264088', 'RMSE(test) :4.4024682653269895', 'RMSE(test) / MAE(test) :1.5212838072134869')
まとめ
今までの結果を振り返ると、XgBoostのみでのモデリングが平均誤差率が一番低いという結果になりました。
また、重回帰や多項式回帰を組み合わせた結果よりも、多項式回帰単体の方が平均誤差率が低いという結果にもなりました。
ただ結果を組み合わせるだけではよいモデルはできないのですね。