Machine Learning Preprocessing, EDA, and Model-Building Workflow (Regression)
Personal notes for my own use; updated from time to time.
Visualization is done with the matplotlib family (seaborn). Other candidates such as plotly exist, so pick whichever you like.
Regression
Libraries to import
```python
# Data analysis libraries
import pandas as pd
import numpy as np
import pandas_profiling as pdp

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

# Scikit-learn
# common
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed long ago
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

# Estimators (and helper functions)
from sklearn.linear_model import LinearRegression           # linear regression
from sklearn.linear_model import SGDRegressor               # stochastic gradient descent
from sklearn.linear_model import Ridge, Lasso, ElasticNet   # regularized linear regression
from sklearn.tree import DecisionTreeRegressor              # decision tree
# from sklearn.preprocessing import PolynomialFeatures      # polynomial regression

# scipy
from scipy import stats

# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor

# misc
from math import sqrt
```
Loading the data
```python
df = pd.read_csv("hoge.csv")
```
EDA
Check the basic characteristics of the loaded data.
```python
print(df.shape, df.dtypes)
df.head()
df.describe()
pdp.ProfileReport(df)
```
Quantitative data
Correlation between features (including the target)
Features with low correlation to the target can be dropped; pairs of highly correlated features carry similar information, so one of each pair can be removed; etc. (A sketch of the dropping rule follows the plotting code below.)
```python
# Scatter plot with the Pearson correlation coefficient
# (note: JointGrid.annotate() was removed in newer seaborn;
#  on recent versions compute stats.pearsonr separately)
g = sns.jointplot(<some_feature>, <target>, data=concrete,
                  kind='scatter', color='darkslategray').annotate(stats.pearsonr)

# Regression plot with the Pearson correlation coefficient
g = sns.jointplot(<some_feature>, <target>, data=concrete,
                  kind='reg', color='darkslategray').annotate(stats.pearsonr)

# Correlation heatmap
df_corr = df[["peak-rpm", "highway-mpg", "price"]].corr()
g = sns.heatmap(df_corr, square=True, vmax=1, vmin=-1, center=0, annot=True)
```
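Acting on the dropping rule might look like the following; a minimal sketch, where the "price" target (reused from the heatmap example) and the 0.1 threshold are arbitrary assumptions, and a recent pandas is assumed for `numeric_only`:

```python
# Hypothetical sketch: drop features that correlate weakly with the target
# ("price" and the 0.1 threshold are arbitrary example choices)
corr_with_target = df.corr(numeric_only=True)["price"].abs()
weak_features = corr_with_target[corr_with_target < 0.1].index.tolist()
df = df.drop(columns=weak_features)
```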
Qualitative data
Pair plot
Scatter plots for each feature pair plus histograms.
```python
sns.pairplot(glass[['RI', 'Na', 'Mg', 'Al', 'Type']], hue='Type', diag_kind='hist')
```
Box plot and swarm plot (visualizes the distribution density)
If the distributions do not overlap across types, the feature looks usable.
```python
features = glass.iloc[:, 0:4].columns
plt.figure(figsize=(20, 9*5))
gs = gridspec.GridSpec(4, 1)
for i, col in enumerate(glass[features]):
    ax = plt.subplot(gs[i])
    plt.title('Glass features')
    sns.boxplot(x=glass['Type'], y=glass[col], palette='Set2', linewidth=1.0)
    sns.swarmplot(x=glass['Type'], y=glass[col], color=".5")
```
Preprocessing
Replacing non-NaN placeholder values with NaN
```python
# Example: unknown values are represented as "?"
# With inplace=True, the DataFrame is modified in place
df.replace("?", np.nan, inplace=True)
```
Handling NaNs (replace with the mean, the mode, or interpolated values; or drop the column/row). Replacement examples below; interpolation and dropping in the sketch after them.
```python
# Replace with the mean value
avg_norm_loss = df["normalized-losses"].astype("float").mean(axis=0)
print("Average of normalized-losses:", avg_norm_loss)
df["normalized-losses"].replace(np.nan, avg_norm_loss, inplace=True)

# Replace with the mode (most frequent value, not the median);
# value_counts().idxmax() returns it -- "four" in this dataset
df['num-of-doors'].value_counts().idxmax()
df["num-of-doors"].replace(np.nan, "four", inplace=True)
```
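The remaining options, interpolation and dropping, in sketch form (the column names reuse the automobile-dataset examples above):

```python
# Linearly interpolate missing numeric values
df["normalized-losses"] = df["normalized-losses"].astype("float").interpolate()

# Or drop rows that have NaN in a given column (here: the target "price")
df.dropna(subset=["price"], axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)
```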
Normalization
```python
# If the values are positive, divide by the max to scale into the 0-1 range
df['length'] = df['length'] / df['length'].max()
```
```python
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df)  # fit before transforming; returns a NumPy array
```
Standardization
```python
# Subtract the mean and divide by the standard deviation:
# gives a distribution with zero mean and unit variance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)  # fit alone only learns the statistics; transform as well
```
Bringing the distribution closer to normal (Box-Cox and Yeo-Johnson transforms)
Many modeling techniques assume normally distributed data, so this can help. Box-Cox below; a Yeo-Johnson sketch follows it.
```python
# https://note.com/mikiokubo/n/n42417e5d0f6c
from scipy import stats

# Box-Cox requires strictly positive input, hence the +0.001 offset
data, lmbda = stats.boxcox(df["pm2.5"] + 0.001)
df["boxcox"] = data
df["boxcox"].hist()
```
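Yeo-Johnson, also named in the heading, accepts zero and negative values, so no offset is needed; a minimal sketch mirroring the above (assumes scipy >= 1.2 for `stats.yeojohnson`):

```python
from scipy import stats

# Yeo-Johnson also works on zero/negative values, so no +0.001 offset is needed
data, lmbda = stats.yeojohnson(df["pm2.5"])
df["yeojohnson"] = data
df["yeojohnson"].hist()
```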
Dimensionality reduction
PCA, for example.
```python
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)
```
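To pick `n_components`, looking at the cumulative explained variance ratio is a common guide; a minimal sketch:

```python
# Cumulative share of variance retained by the first k principal components
pca_full = PCA().fit(X_train)
plt.plot(np.cumsum(pca_full.explained_variance_ratio_), 'o-')
plt.xlabel("number of components")
plt.ylabel("cumulative explained variance")
```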
Binning (probably not used that often)
Bucket values into categories such as high (=2), medium (=1), and low (=0).
```python
# Split the horsepower range into three equal-width bins
bins = np.linspace(min(df["horsepower"]), max(df["horsepower"]), 4)
group_names = ['Low', 'Medium', 'High']
df['horsepower-binned'] = pd.cut(df['horsepower'], bins,
                                 labels=group_names, include_lowest=True)

# Plot the bin counts (reindex so the bars follow the group_names order)
plt.bar(group_names, df["horsepower-binned"].value_counts().reindex(group_names))

# set x/y labels and plot title
plt.xlabel("horsepower")
plt.ylabel("count")
plt.title("horsepower bins")
```
Encoding categorical features
```python
# One-hot encoding (dummy variables)
# e.g. fuel-type is "gas" or "diesel"
dummy_variable_1 = pd.get_dummies(df["fuel-type"])

# merge data frame "df" and "dummy_variable_1"
df = pd.concat([df, dummy_variable_1], axis=1)

# drop original column "fuel-type" from "df"
df.drop("fuel-type", axis=1, inplace=True)
```
Building a baseline model
The first model to compare against; XGBoost for now.
```python
xgb_base = XGBRegressor(random_state=1)
xgb_base.fit(X_train, y_train)
y_pred_base = xgb_base.predict(X_train)

# e.g. when the evaluation metric is RMSE (Root Mean Squared Error)
rmse_base = sqrt(mean_squared_error(y_train.values, y_pred_base))
print(rmse_base)
```
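Note that this RMSE is measured on the training data, which flatters the model. A sketch of scoring the baseline on a held-out split instead (the 20% split size is an arbitrary choice; `train_test_split` is imported above):

```python
# Hold out 20% of the data and evaluate the baseline on it
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
                                            test_size=0.2, random_state=1)
xgb_base = XGBRegressor(random_state=1)
xgb_base.fit(X_tr, y_tr)
rmse_val = sqrt(mean_squared_error(y_val, xgb_base.predict(X_val)))
print(rmse_val)
```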
Evaluating model performance
Cross-validation
```python
from sklearn.model_selection import StratifiedKFold

# k-fold iterator (note: StratifiedKFold expects class labels;
# see the KFold sketch below for regression targets)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for train_idx, val_idx in kfold.split(train_features, train_labels):
    history = model.fit(
        train_features[train_idx],
        train_labels[train_idx],
        epochs=30,
        batch_size=20,
        validation_data=(train_features[val_idx], train_labels[val_idx]),
    )
```
```python
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation no longer exists

scores = cross_val_score(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
)
```
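Since this memo is about regression, plain KFold plus an RMSE scorer is the more natural setup; a minimal sketch (the scorer name assumes scikit-learn >= 0.22):

```python
from sklearn.model_selection import KFold, cross_val_score

# 5-fold CV with an RMSE scorer for a regression target
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(XGBRegressor(random_state=1), X_train, y_train,
                         cv=kfold, scoring="neg_root_mean_squared_error")
print(-scores.mean())  # scores are negated errors, so flip the sign
```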
Hyperparameter tuning
Grid search
```python
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search no longer exists

# Run grid search with cross-validation
test_params = {
    'n_estimators': [100, 200, 300, 400, 500]
}
gs = GridSearchCV(
    estimator=XGBRegressor(seed=42),
    scoring='explained_variance',
    param_grid=test_params,
    cv=5,
    return_train_score=False,
)
# Note: early_stopping_rounds (stop after 50 rounds without improvement) is an
# XGBoost fit() option, not a GridSearchCV argument; pass it through gs.fit()
# together with an eval_set if early stopping is wanted
gs.fit(X_train, y_train)

# Best score and parameters
print(gs.best_score_)
print(gs.best_params_)

# Predict with the best model parameters
clf = gs.best_estimator_
clf.fit(X_train, y_train)
```
pipeline
Streamlines the workflow by chaining transformers (objects with a fit_transform method) and a final estimator.
```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

pipe_lr = Pipeline([
    ("scl", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("clf", LogisticRegression()),
])
pipe_lr.fit(X_train, y_train)
```
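The example above is the classic classification version; for the regression setting of this memo, the same pattern with a regressor at the end might look like this (Ridge is an arbitrary choice):

```python
# Same pattern with a regression estimator as the final step
pipe_reg = Pipeline([
    ("scl", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("reg", Ridge()),
])
pipe_reg.fit(X_train, y_train)
print(pipe_reg.score(X_train, y_train))  # R^2 for regressors
```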
Plotting a learning curve
```python
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html#sklearn.model_selection.learning_curve
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py
# https://qiita.com/koichi_hiphopdream/items/9ad75d184aba9626c09b
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes, return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    """
    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")
    """

    return plt


# Simple version without cross-validation
def plot_simple_learning_curve(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    train_errors, val_errors = [], []
    for m in range(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)  # evaluate on the whole validation set
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.plot(np.sqrt(train_errors), "r-", label="train")
    plt.plot(np.sqrt(val_errors), "b-", label="val")
    plt.legend()
```
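A hypothetical call to the helper above (the estimator and cv value are arbitrary choices):

```python
# Learning curve for a Ridge model with 5-fold CV
plot_learning_curve(Ridge(), "Ridge learning curve", X_train, y_train,
                    cv=5, n_jobs=-1)
plt.show()
```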
Evaluation metrics
```python
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

# MSE
mean_squared_error(y_train, y_pred)

# R2 score: 1 is the maximum and means zero error
r2_score(y_train, y_pred)
```