Part 9: Predicting Boston Housing Prices with Ensemble Learning (Voting Regressor)

Hi, this is hinomaruc. Ever since watching competitions like Kaggle, I had wanted to try combining multiple models to produce more accurate output.

So in this post I predict housing prices with a voting regressor, one form of ensemble learning.

A voting regressor is an ensemble meta-estimator that fits several base regressors, each on the whole dataset. It then averages the individual predictions to form a final prediction. Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html

With sklearn's voting regressor, all algorithms must share the same explanatory variables. If you want to build each model with its own combination of features, you have to average the predictions from the individual models yourself, as I do later in this post.
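A minimal sketch of that manual workaround, on synthetic data (the models and feature splits here are toy examples, not the ones used later in this post):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Two models trained on different feature subsets, combined by hand.
X, y = make_regression(n_samples=200, n_features=6, random_state=0)
m1 = RandomForestRegressor(random_state=0).fit(X[:, :4], y)  # uses features 0-3
m2 = LinearRegression().fit(X[:, 4:], y)                     # uses features 4-5
yhat = (m1.predict(X[:, :4]) + m2.predict(X[:, 4:])) / 2     # manual "vote" = plain average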


Import packages

import seaborn as sns
import statsmodels.api as sm
import pandas as pd

Read the data

df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston_corrected.txt",skiprows=9,sep="\t")

Note (updated 2022-02-01): if pandas' read_csv raises a UnicodeDecodeError, add encoding='Windows-1252'.
See the new blog for details.
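Applied to the call above, the fix from the note looks like this:

df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston_corrected.txt",
                 skiprows=9, sep="\t", encoding="Windows-1252")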


list(df.columns)
Out[0]
    ['OBS.',
     'TOWN',
     'TOWN#',
     'TRACT',
     'LON',
     'LAT',
     'MEDV',
     'CMEDV',
     'CRIM',
     'ZN',
     'INDUS',
     'CHAS',
     'NOX',
     'RM',
     'AGE',
     'DIS',
     'RAD',
     'TAX',
     'PTRATIO',
     'B',
     'LSTAT']

Modeling

Trying the Voting regressor

Sampling (train/test split)

# Split into training and test data.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.20,random_state=100)

Create the training and test datasets

FEATURE_COLS=[
 #'OBS.',
 #'TOWN',
 #'TOWN#',
 #'TRACT',
 #'LON',
 #'LAT',
 #'MEDV',
 #'CMEDV',
 'CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT']
X_train = train[FEATURE_COLS]
Y_train = train["CMEDV"]
X_test = test[FEATURE_COLS]
Y_test = test["CMEDV"]

Build the model

import numpy as np
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
# Try GradientBoosting, RandomForest, and multiple linear regression
# (same as the sklearn example)
reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10)
reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
reg3 = LinearRegression(normalize=True)  # note: normalize= was removed in scikit-learn 1.2; standardize features beforehand on newer versions
ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])
ereg = ereg.fit(X_train, Y_train)
ereg.estimators
Out[0]
    [('gb',
      GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                learning_rate=0.1, loss='ls', max_depth=3,
                                max_features=None, max_leaf_nodes=None,
                                min_impurity_decrease=0.0, min_impurity_split=None,
                                min_samples_leaf=1, min_samples_split=2,
                                min_weight_fraction_leaf=0.0, n_estimators=10,
                                n_iter_no_change=None, presort='auto', random_state=1,
                                subsample=1.0, tol=0.0001, validation_fraction=0.1,
                                verbose=0, warm_start=False)),
     ('rf', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                            max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=10,
                            n_jobs=None, oob_score=False, random_state=1, verbose=0,
                            warm_start=False)),
     ('lr',
      LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True))]
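As a quick sanity check (a sketch using the fitted ereg above): with no weights argument, VotingRegressor's prediction is just the arithmetic mean of the fitted base estimators' predictions.

manual_avg = np.mean([est.predict(X_test) for est in ereg.estimators_], axis=0)
print(np.allclose(manual_avg, ereg.predict(X_test)))  # expected: True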

Check the accuracy

# Compute adjusted R-squared
def adjusted_r2(X,Y,model):
    from sklearn.metrics import r2_score
    import numpy as np
    r_squared = r2_score(Y, model.predict(X))
    adjusted_r2 = 1 - (1-r_squared)*(len(Y)-1)/(len(Y)-X.shape[1]-1)
    # Manual alternative for reference:
    # yhat = model.predict(X)
    # SS_Residual = sum((Y-yhat)**2)
    # SS_Total = sum((Y-np.mean(Y))**2)
    # r_squared = 1 - float(SS_Residual)/SS_Total
    return adjusted_r2
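To make the formula concrete, a quick hand check with made-up numbers (hypothetical values, not from this dataset):

n, p, r2 = 100, 13, 0.85  # hypothetical sample size, feature count, and R-squared
print(1 - (1 - r2) * (n - 1) / (n - p - 1))  # 1 - 0.15 * 99 / 86 ≈ 0.8273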

# Compute various metrics to evaluate the predictive model
def get_model_evaluations(X_train,Y_train,X_test,Y_test,model):
    from sklearn.metrics import explained_variance_score
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import mean_squared_log_error
    from sklearn.metrics import median_absolute_error

    # Evaluation metrics
    # Reference: https://funatsu-lab.github.io/open-course-ware/basic-theory/accuracy-index/
    yhat_test = model.predict(X_test)
    return "adjusted_r2(train)     :" + str(adjusted_r2(X_train,Y_train,model)) \
         , "adjusted_r2(test)      :" + str(adjusted_r2(X_test,Y_test,model)) \
         , "Mean error rate(test)  :" + str(np.mean(abs(Y_test / yhat_test - 1))) \
         , "MAE(test)              :" + str(mean_absolute_error(Y_test, yhat_test)) \
         , "MedianAE(test)         :" + str(median_absolute_error(Y_test, yhat_test)) \
         , "RMSE(test)             :" + str(np.sqrt(mean_squared_error(Y_test, yhat_test))) \
         , "RMSE(test) / MAE(test) :" + str(np.sqrt(mean_squared_error(Y_test, yhat_test)) / mean_absolute_error(Y_test, yhat_test)) # ~1.253 expected for normally distributed errors (see note below)
get_model_evaluations(X_train,Y_train,X_test,Y_test,ereg)
Out[0]
    ('adjusted_r2(train)     :0.8864926311777143',
     'adjusted_r2(test)      :0.8177146097869532',
     'Mean error rate(test)  :0.1304160387830404',
     'MAE(test)              :2.7898130860175834',
     'MedianAE(test)         :1.9059579212992723',
     'RMSE(test)             :3.91467567052991',
     'RMSE(test) / MAE(test) :1.4032035659127442')
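About the 1.253 rule of thumb: for zero-mean, normally distributed errors, E|e| = σ·√(2/π) while RMSE = σ, so RMSE/MAE ≈ √(π/2) ≈ 1.253; a ratio well above that suggests heavy-tailed errors, i.e. a few large misses dominating. A quick simulation sketch:

import numpy as np
rng = np.random.default_rng(0)
e = rng.normal(0, 1, 1_000_000)  # synthetic Gaussian errors
print(np.sqrt(np.mean(e**2)) / np.mean(np.abs(e)))  # ≈ 1.253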

Plotting predictions against actual values

# Plot settings
from matplotlib import rcParams
rcParams['xtick.labelsize'] = 12       # font size of x-axis tick labels
rcParams['ytick.labelsize'] = 12       # font size of y-axis tick labels
rcParams['figure.figsize'] = 18,8      # figure size (inches)

import matplotlib.pyplot as plt
from matplotlib import ticker
sns.set_style("whitegrid")             # one of seaborn's style presets
sns.set_color_codes()                  # default color codes (set to "deep")

plt.figure()
ax = sns.regplot(x=Y_test, y=ereg.predict(X_test), fit_reg=False,color='#4F81BD')
ax.set_xlabel(u"CMEDV")
ax.set_ylabel(u"(Predicted) CMEDV")
ax.get_xaxis().set_major_formatter(ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
ax.get_yaxis().set_major_formatter(ticker.FuncFormatter(lambda y, p: format(int(y), ',')))
ax.plot([0,10,20,30,40,50],[0,10,20,30,40,50], linewidth=2, color="#C0504D",ls="--")  # y = x reference line

Trying another combination

# try xgboost, random forest, and multiple linear regression

reg1 = xgb.XGBRegressor( objective ='reg:squarederror'
                        , base_score=0.5
                        , booster='gbtree'
                        , colsample_bylevel=1
                        , colsample_bynode=1
                        , colsample_bytree=1
                        , gamma=0
                        , importance_type='gain'
                        , learning_rate=0.1
                        , max_delta_step=0
                        , max_depth=3
                        , min_child_weight=1
                        , missing=None
                        , n_estimators=100
                        , n_jobs=1
                        , nthread=None
                        , random_state=0
                        , reg_alpha=0
                        , reg_lambda=1
                        , scale_pos_weight=1
                        , seed=None
                        , silent=None
                        , subsample=1
                        , verbosity=1)
reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
reg3 = LinearRegression(normalize=True)
ereg = VotingRegressor(estimators=[('xgb', reg1), ('rf', reg2), ('lr', reg3)])
ereg = ereg.fit(X_train, Y_train)
get_model_evaluations(X_train,Y_train,X_test,Y_test,ereg)
Out[0]
    ('adjusted_r2(train)     :0.941411489447292',
     'adjusted_r2(test)      :0.8725883749878736',
     'Mean error rate(test)  :0.10861120144413837',
     'MAE(test)              :2.2296969721803177',
     'MedianAE(test)         :1.520569355831178',
     'RMSE(test)             :3.27283705041899',
     'RMSE(test) / MAE(test) :1.467839393089651')

It looks like XGBoost on its own had better accuracy...

ereg.estimators
Out[0]
    [('xgb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                   colsample_bynode=1, colsample_bytree=1, gamma=0,
                   importance_type='gain', learning_rate=0.1, max_delta_step=0,
                   max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                   n_jobs=1, nthread=None, objective='reg:squarederror',
                   random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                   seed=None, silent=None, subsample=1, verbosity=1)),
     ('rf', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                            max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=10,
                            n_jobs=None, oob_score=False, random_state=1, verbose=0,
                            warm_start=False)),
     ('lr',
      LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True))]
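One knob not tuned above (a sketch, not part of the original experiment): VotingRegressor also accepts a weights argument, so a stronger base model such as XGBoost can count for more than an equal share of the average. The 2/1/1 weights below are hypothetical, not optimized.

ereg_w = VotingRegressor(estimators=[('xgb', reg1), ('rf', reg2), ('lr', reg3)],
                         weights=[2, 1, 1])  # hypothetical weights favoring XGBoost
ereg_w = ereg_w.fit(X_train, Y_train)
get_model_evaluations(X_train,Y_train,X_test,Y_test,ereg_w)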

Building each model without VotingRegressor() and averaging their predictions

# for XGBoost
X_train = train[FEATURE_COLS]
X_test = test[FEATURE_COLS]

# narrow down the features (for multiple and polynomial regression)
X_train2 = train[["RM","LSTAT","PTRATIO","B"]] # rooms, % lower-status population, pupil-teacher ratio, B (race score)
X_test2 = test[["RM","LSTAT","PTRATIO","B"]]

# target variable
Y_train = train["CMEDV"] # corrected median home value
Y_test = test["CMEDV"]
# transform the dataset for polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree = 2) # degree 2

X_train_poly = poly_features.fit_transform(X_train2)
X_test_poly = poly_features.transform(X_test2) # fit on train only; just transform the test set
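For reference, degree-2 PolynomialFeatures on 4 columns produces 15 features: the bias term, the 4 original columns, and the 10 squares and pairwise interactions. A quick check (a sketch):

print(X_train_poly.shape[1])  # 15 = 1 (bias) + 4 (linear) + 10 (squares + interactions)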

Build each model

# multiple linear regression
import numpy as np
from sklearn.linear_model import LinearRegression
multiple_regression = LinearRegression()
multiple_regression.fit(X_train2,Y_train)
Out[0]
    LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# polynomial regression
import numpy as np
from sklearn.linear_model import LinearRegression
polynomial_regression = LinearRegression()
polynomial_regression.fit(X_train_poly,Y_train)
Out[0]
    LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# XGBoost
import xgboost as xgb
xgboost = xgb.XGBRegressor( objective ='reg:squarederror'
                        , base_score=0.5
                        , booster='gbtree'
                        , colsample_bylevel=1
                        , colsample_bynode=1
                        , colsample_bytree=1
                        , gamma=0
                        , importance_type='gain'
                        , learning_rate=0.1
                        , max_delta_step=0
                        , max_depth=3
                        , min_child_weight=1
                        , missing=None
                        , n_estimators=100
                        , n_jobs=1
                        , nthread=None
                        , random_state=0
                        , reg_alpha=0
                        , reg_lambda=1
                        , scale_pos_weight=1
                        , seed=None
                        , silent=None
                        , subsample=1
                        , verbosity=1)
xgboost.fit(X_train,Y_train) 
Out[0]
    XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                 colsample_bynode=1, colsample_bytree=1, gamma=0,
                 importance_type='gain', learning_rate=0.1, max_delta_step=0,
                 max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                 n_jobs=1, nthread=None, objective='reg:squarederror',
                 random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                 seed=None, silent=None, subsample=1, verbosity=1)

Average the models' predictions (test data)

yhat_test_1 = multiple_regression.predict(X_test2)
yhat_test_2 = polynomial_regression.predict(X_test_poly)
yhat_test_3 = xgboost.predict(X_test)
yhat_test_average = (yhat_test_1 + yhat_test_2 + yhat_test_3) / 3
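The equal-weight average is just one choice; a weighted variant is equally easy by hand (the weights below are hypothetical, not tuned):

yhat_test_weighted = 0.25*yhat_test_1 + 0.25*yhat_test_2 + 0.5*yhat_test_3  # hypothetical weights leaning on XGBoost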

Average the models' predictions (training data)

yhat_train_1 = multiple_regression.predict(X_train2)
yhat_train_2 = polynomial_regression.predict(X_train_poly)
yhat_train_3 = xgboost.predict(X_train)
yhat_train_average = (yhat_train_1 + yhat_train_2 + yhat_train_3) / 3

Check the accuracy

# Compute adjusted R-squared
def adjusted_r2(X,Y,Yhat):
    from sklearn.metrics import r2_score
    import numpy as np
    r_squared = r2_score(Y, Yhat)
    adjusted_r2 = 1 - (1-r_squared)*(len(Y)-1)/(len(Y)-X.shape[1]-1)
    return adjusted_r2

# Compute various metrics to evaluate the predictive model
def get_model_evaluations(X_train,Y_train,X_test,Y_test,Yhat_train,Yhat_test):
    from sklearn.metrics import explained_variance_score
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import mean_squared_log_error
    from sklearn.metrics import median_absolute_error

    # Evaluation metrics
    # Reference: https://funatsu-lab.github.io/open-course-ware/basic-theory/accuracy-index/
    return "adjusted_r2(train)     :" + str(adjusted_r2(X_train,Y_train,Yhat_train)) \
         , "adjusted_r2(test)      :" + str(adjusted_r2(X_test,Y_test,Yhat_test)) \
         , "Mean error rate(test)  :" + str(np.mean(abs(Y_test / Yhat_test - 1))) \
         , "MAE(test)              :" + str(mean_absolute_error(Y_test, Yhat_test)) \
         , "MedianAE(test)         :" + str(median_absolute_error(Y_test, Yhat_test)) \
         , "RMSE(test)             :" + str(np.sqrt(mean_squared_error(Y_test, Yhat_test))) \
         , "RMSE(test) / MAE(test) :" + str(np.sqrt(mean_squared_error(Y_test, Yhat_test)) / mean_absolute_error(Y_test, Yhat_test)) # ~1.253 expected for normally distributed errors (see earlier note)
get_model_evaluations(X_train,Y_train,X_test,Y_test,yhat_train_average,yhat_test_average)
Out[0]
    ('adjusted_r2(train)     :0.8711757920412861',
     'adjusted_r2(test)      :0.841610495090336',
     'Mean error rate(test)  :0.11883711395386347',
     'MAE(test)              :2.440346734822397',
     'MedianAE(test)         :1.8652607788449629',
     'RMSE(test)             :3.649077240737884',
     'RMSE(test) / MAE(test) :1.4953109689977953')

Modeling with only multiple and polynomial regression

yhat_test_average = (yhat_test_1 + yhat_test_2) / 2
yhat_train_average = (yhat_train_1 + yhat_train_2) / 2
get_model_evaluations(X_train,Y_train,X_test,Y_test,yhat_train_average,yhat_test_average)
Out[0]
    ('adjusted_r2(train)     :0.758751320817442',
     'adjusted_r2(test)      :0.769456566440293',
     'Mean error rate(test)  :0.1377369999749322',
     'MAE(test)              :2.8939164700575666',
     'MedianAE(test)         :1.9465828734264088',
     'RMSE(test)             :4.4024682653269895',
     'RMSE(test) / MAE(test) :1.5212838072134869')

Summary

Looking back over all the results, modeling with XGBoost alone produced the lowest mean error rate.

Also, combining multiple regression and polynomial regression turned out worse, in terms of mean error rate, than polynomial regression on its own.

Simply combining results does not automatically produce a better model.
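If plain averaging is not enough, one natural next step (a sketch, not part of this post's experiments) is stacking: sklearn's StackingRegressor trains a meta-model that learns how to combine the base regressors instead of weighting them equally.

from sklearn.ensemble import StackingRegressor
stack = StackingRegressor(estimators=[('xgb', reg1), ('rf', reg2), ('lr', reg3)],
                          final_estimator=LinearRegression())
stack = stack.fit(X_train, Y_train)
get_model_evaluations(X_train,Y_train,X_test,Y_test,
                      stack.predict(X_train), stack.predict(X_test))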