"California Housing Data"を用いて、Linear RegressionとRandom Forestの出力を比較してみる。
Boston Dataよりデータ数が相当多いので、学習に時間がかかることに注意
# --- California Housing data ------------------------------------------------
# Fetch the dataset and wrap the feature matrix in a DataFrame so the
# column names travel with the data; targets stay as a plain array.
from sklearn.datasets import fetch_california_housing
import pandas as pd

housing = fetch_california_housing()
features = pd.DataFrame(housing.data, columns=housing.feature_names)
targets = housing.target

# Peek at the first five rows of the features.
features.head()
MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|
0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 |
1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 |
2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 |
3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 |
4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 |
# Split into training and test sets: 90% train / 10% test.
# The fixed random_state makes the split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    train_size=0.9,
    random_state=0,
)
# Standardize features to zero mean / unit variance, fitting the scaler on
# the training split only so no test-set statistics leak into it.
# NOTE(review): the *_scaled frames are never consumed below — the fit and
# predict calls use the raw X_train/X_test — confirm this is intentional.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index.values,
    columns=X_train.columns.values,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index.values,
    columns=X_test.columns.values,
)
# Model selection.
# (1) Ordinary least-squares multiple linear regression.
from sklearn.linear_model import LinearRegression

lm_model = LinearRegression()

# (2) Random-forest regression: 500 trees, out-of-bag scoring enabled,
#     seeded for reproducibility.
from sklearn.ensemble import RandomForestRegressor

rfr_model = RandomForestRegressor(
    n_estimators=500,
    oob_score=True,
    random_state=0,
)
# Train both models on the (unscaled) training split; the linear model is
# fit first, then the random forest, matching the original order.
for model in (lm_model, rfr_model):
    model.fit(X_train, y_train)
# Predict the held-out test split with each fitted model.
lm_predicted_test = lm_model.predict(X_test)
rfr_predicted_test = rfr_model.predict(X_test)
# Evaluation metrics for both models.
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

_preds = (lm_predicted_test, rfr_predicted_test)

# (1) Coefficient of determination (R^2) against the observed targets.
lm_test_score, rfr_test_score = (r2_score(y_test, p) for p in _preds)

# (2) Spearman rank correlation; each result holds (statistic, p-value).
lm_spearman, rfr_spearman = (spearmanr(y_test, p) for p in _preds)

# (3) Pearson linear correlation; each result holds (statistic, p-value).
lm_pearson, rfr_pearson = (pearsonr(y_test, p) for p in _preds)
# Print each metric for both models. spearmanr/pearsonr results index like
# (statistic, p-value), so [0] extracts the correlation coefficient.
_report_rows = (
    ("Test data R-2 score: (Linear) %5.3f (Random Forest) %5.3f",
     lm_test_score, rfr_test_score),
    ("Test data Spearman correlation: (Linear) %.3f (Random Forest) %.3f",
     lm_spearman[0], rfr_spearman[0]),
    ("Test data Pearson correlation: (Linear) %.3f (Random Forest) %.3f",
     lm_pearson[0], rfr_pearson[0]),
)
for fmt, lm_val, rfr_val in _report_rows:
    print(fmt % (lm_val, rfr_val))
Test data R-2 score: (Linear) 0.610 (Random Forest) 0.818 Test data Spearman correlation: (Linear) 0.817 (Random Forest) 0.910 Test data Pearson correlation: (Linear) 0.781 (Random Forest) 0.905
import matplotlib.pyplot as plt

# Predicted-vs-observed scatter plots for the two models, side by side.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

_panels = (
    (lm_predicted_test, "Predicted value by Linear Model", "Linear Regression"),
    (rfr_predicted_test, "Predicted value by Random Forest", "Random Forest Regression"),
)
for ax, (predicted, xlabel, title) in zip(axes, _panels):
    ax.scatter(predicted, y_test)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Observed value")
    ax.set_title(title)
    # Red y = x guide line: points on it are perfect predictions.
    diag = [0.0, 5.0]
    ax.plot(diag, diag, "r")

plt.show()