Machine Learning Preprocessing, EDA, and Model-Building Workflow (Regression)

These are my personal notes; I'll update them as I go.

Visualization is done with the matplotlib family (seaborn). Other options such as plotly exist, so pick whichever you like.

Regression

Libraries to import

# Data analysis libraries
import pandas as pd
import numpy as np 
import pandas_profiling as pdp

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline 

# Scikit-learn
# common
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

# Estimators (and helper functions)
from sklearn.linear_model import LinearRegression # linear regression
from sklearn.linear_model import SGDRegressor # stochastic gradient descent
from sklearn.linear_model import Ridge, Lasso, ElasticNet # regularized linear regression
from sklearn.tree import DecisionTreeRegressor # decision tree
from sklearn.preprocessing import PolynomialFeatures # polynomial features (for polynomial regression)

# scipy
from scipy import stats

# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor

# Misc
from math import sqrt

Loading the data

df = pd.read_csv("hoge.csv")

EDA

Inspect the basic characteristics of the loaded data

print(df.shape, df.dtypes)
df.head()
df.describe()
pdp.ProfileReport(df)
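
A quick tally of missing and unique values per column is also worth doing at this stage (a minimal sketch with plain pandas):

# Count missing values per column
print(df.isnull().sum())
# Count unique values per column (helps spot categorical columns)
print(df.nunique())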

Quantitative data

Correlations among features (including the target variable)

Drop features with low correlation to the target; when two features are highly correlated with each other they carry similar information, so drop one of the pair; etc. (see the filtering sketch after the code below).

# Scatter plot with Pearson correlation coefficient
g = sns.jointplot(<some_feature>, <target>, data=concrete, kind='scatter', color='darkslategray').annotate(stats.pearsonr)
# Regression fit with Pearson correlation coefficient
g = sns.jointplot(<some_feature>, <target>, data=concrete, kind='reg', color='darkslategray').annotate(stats.pearsonr)
# Correlation heatmap
df_corr = df[["peak-rpm", "highway-mpg", "price"]].corr()
g = sns.heatmap(df_corr, square=True, vmax=1, vmin=-1, center=0, annot=True)
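
To apply the rule above programmatically, something like this sketch works (the 0.1 threshold and the target column name "price" are illustrative assumptions):

# Absolute correlation of each numeric column with the target
# (numeric_only= needs pandas >= 1.5; older pandas skips non-numeric columns automatically)
corr_with_target = df.corr(numeric_only=True)["price"].abs().drop("price")
# Keep only features whose correlation with the target exceeds the threshold
selected = corr_with_target[corr_with_target > 0.1].index.tolist()
print(selected)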

Qualitative data

Pair plot

Scatter plots between features, with histograms on the diagonal

sns.pairplot(glass[['RI', 'Na', 'Mg', 'Al', 'Type']], hue='Type', diag_kind='hist')

Box plot and swarm plot (which shows the distribution density)

A feature looks usable if its per-class distributions don't overlap.

features = glass.iloc[:, 0:4].columns

plt.figure(figsize=(20, 9*5))
gs = gridspec.GridSpec(4, 1)
for i, col in enumerate(features):
    ax = plt.subplot(gs[i])
    ax.set_title('Glass features')
    sns.boxplot(x=glass['Type'], y=glass[col], palette='Set2', linewidth=1.0)
    sns.swarmplot(x=glass['Type'], y=glass[col], color=".5")

Preprocessing

Replace non-NaN placeholder values with NaN

# Example: missing values are represented as "?"
# With inplace=True, df is modified in place
df.replace("?", np.nan, inplace=True)

Handling NaN: replace (mean, mode, interpolation) or drop the column/row

# Replace with the mean value.
avg_norm_loss = df["normalized-losses"].astype("float").mean(axis=0)
print("Average of normalized-losses:", avg_norm_loss)
df["normalized-losses"].replace(np.nan, avg_norm_loss, inplace=True)
# Replace with the mode (most frequent value).
most_common = df['num-of-doors'].value_counts().idxmax()
df["num-of-doors"].replace(np.nan, most_common, inplace=True)

Normalization

# If values are positive, divide by the max to rescale into the range 0-1
df['length'] = df['length']/df['length'].max()
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df = scaler.fit_transform(df)  # fit first, then transform (returns an ndarray)

Standardization

# Subtract the mean and divide by the standard deviation: zero mean, unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_std = scaler.fit_transform(df)

Bringing distributions closer to normal (Box-Cox transform, Yeo-Johnson transform)

Useful because many modeling techniques assume normally distributed data.

# https://note.com/mikiokubo/n/n42417e5d0f6c
from scipy import stats

# Box-Cox requires strictly positive values, hence the +0.001 shift
data, lmbda = stats.boxcox(df["pm2.5"] + 0.001)
df["boxcox"] = data
df["boxcox"].hist()

Dimensionality reduction

PCA, for example

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)
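
To decide n_components, check how much variance the components retain:

# Fraction of total variance captured by each principal component
print(pca.explained_variance_ratio_)
print(np.cumsum(pca.explained_variance_ratio_))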

Binning (probably not used very often)

Group values into classes such as high (=2), medium (=1), and low (=0).

bins = np.linspace(min(df["horsepower"]), max(df["horsepower"]), 4)
group_names = ['Low', 'Medium', 'High']
df['horsepower-binned'] = pd.cut(df['horsepower'], bins, labels=group_names, include_lowest=True)

# reindex so the bars appear in Low/Medium/High order
plt.bar(group_names, df["horsepower-binned"].value_counts().reindex(group_names))

# set x/y labels and plot title
plt.xlabel("horsepower")
plt.ylabel("count")
plt.title("horsepower bins")

Encoding categorical features

# One-hot encoding (dummy variables)
# e.g.: fuel-type is "gas" or "diesel"
dummy_variable_1 = pd.get_dummies(df["fuel-type"])

# merge data frame "df" and "dummy_variable_1" 
df = pd.concat([df, dummy_variable_1], axis=1)

# drop original column "fuel-type" from "df"
df.drop("fuel-type", axis = 1, inplace=True)
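
The three steps above can be collapsed into a single call:

# get_dummies with columns= encodes the column and drops the original in one step
df = pd.get_dummies(df, columns=["fuel-type"])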

Building a baseline model

The first model, used as the point of comparison. XGBoost for now.

xgb_base = XGBRegressor(random_state=1)
xgb_base.fit(X_train, y_train)
y_pred_base = xgb_base.predict(X_train)
# e.g. when the metric is RMSE (Root Mean Squared Error)
rmse_base = sqrt(mean_squared_error(y_train.values, y_pred_base))
print(rmse_base)
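
Note that this evaluates on the training data, which is optimistic. A fairer baseline scores a held-out split (a sketch assuming X and y hold the full features and target):

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=1)
xgb_base = XGBRegressor(random_state=1)
xgb_base.fit(X_tr, y_tr)
# RMSE on data the model has not seen
print(sqrt(mean_squared_error(y_te, xgb_base.predict(X_te))))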

Evaluating model performance

Cross-validation

from sklearn.model_selection import StratifiedKFold

# k-fold iterator. StratifiedKFold preserves class ratios and is meant for
# classification; for regression, use KFold instead (see the sketch below).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for train_idx, val_idx in kfold.split(train_features, train_labels):
    history = model.fit(
        train_features[train_idx], 
        train_labels[train_idx],
        epochs=30,
        batch_size=20,
        validation_data=(train_features[val_idx], train_labels[val_idx])
    )
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
)
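
For regression, KFold is the appropriate splitter, and RMSE can be recovered from the neg_mean_squared_error scorer (a sketch):

from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=True, random_state=1)
# sklearn scorers are maximized, so MSE is returned negated
scores = cross_val_score(XGBRegressor(random_state=1), X_train, y_train,
                         scoring="neg_mean_squared_error", cv=kf)
rmse_scores = np.sqrt(-scores)
print(rmse_scores.mean(), rmse_scores.std())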

Hyperparameter tuning

Grid search

from sklearn.model_selection import GridSearchCV

# Run grid search CV
test_params = {
    'n_estimators': [100, 200, 300, 400, 500]
}
# Note: early_stopping_rounds is not a GridSearchCV argument; it is a parameter
# of XGBRegressor.fit (see the sketch below)
gs = GridSearchCV(
    estimator=XGBRegressor(seed=42),
    scoring='explained_variance',
    param_grid=test_params,
    cv=5,
    return_train_score=False
)
gs.fit(X_train,y_train)

# Best score and parameters
print(gs.best_score_)
print(gs.best_params_)

# Refit with the best parameters and predict
clf = gs.best_estimator_
clf.fit(X_train, y_train)
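
To keep the original early-stopping intent: GridSearchCV.fit forwards extra keyword arguments to the estimator's fit, so early stopping can be passed there. A sketch (X_val/y_val are an assumed validation split; the exact fit signature varies across xgboost versions):

gs.fit(
    X_train, y_train,
    early_stopping_rounds=50,  # stop when the eval metric stops improving for 50 rounds
    eval_set=[(X_val, y_val)],
    verbose=False,
)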

Pipeline

Streamlines the workflow: chains transformers that implement fit/transform with a final estimator.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

pipe_lr = Pipeline([
    ("scl", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("clf", LogisticRegression())
])
pipe_lr.fit(X_train, y_train)
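
Parameters inside a pipeline are addressed in grid search as <step name>__<parameter name>, e.g.:

# "clf__C" reaches the C parameter of the "clf" step (LogisticRegression)
param_grid = {"pca__n_components": [2, 5], "clf__C": [0.1, 1.0, 10.0]}
gs = GridSearchCV(estimator=pipe_lr, param_grid=param_grid, cv=5)
gs.fit(X_train, y_train)
print(gs.best_params_)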

Plotting learning curves

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html#sklearn.model_selection.learning_curve
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py
# https://qiita.com/koichi_hiphopdream/items/9ad75d184aba9626c09b
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):

    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    """
    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")
    """

    return plt

# No cross-validation; plots RMSE as the training set grows
def plot_simple_learning_curve(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    train_errors, val_errors = [], []
    for m in range(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.plot(np.sqrt(train_errors), "r-", label="train")
    plt.plot(np.sqrt(val_errors), "b-", label="val")
    plt.legend()
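
Example call for the cv-aware version (a sketch; the estimator and cv splitter are whatever you are evaluating):

from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
plot_learning_curve(LinearRegression(), "Learning curve", X_train, y_train, cv=cv, n_jobs=4)
plt.show()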

Evaluation metrics

model.fit(X_train, y_train)
y_pred = model.predict(X_train)

# MSE
mean_squared_error(y_train, y_pred)
# R2 score: 1.0 is the maximum, meaning zero error.
r2_score(y_train, y_pred)
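
RMSE (used for the baseline above) and MAE round this out:

from sklearn.metrics import mean_absolute_error

# RMSE: square root of MSE, in the same units as the target
np.sqrt(mean_squared_error(y_train, y_pred))
# MAE: less sensitive to outliers than RMSE
mean_absolute_error(y_train, y_pred)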