What are hyperparameters?
Hyperparameters are the parameters of a machine learning algorithm that a human has to tune.
e.g. the number of layers in a deep neural network
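For example, with XGBoost's scikit-learn interface, hyperparameters are the arguments you pass to the model's constructor (a minimal illustration, not code from the article):
import xgboost as xgb

# max_depth and learning_rate are hyperparameters: the algorithm does not
# learn them from the data, so a human (or a search procedure) must set them.
clf = xgb.XGBClassifier(max_depth=5, learning_rate=0.1)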
This time, we will search for hyperparameters in two ways:
1. Grid search (try every combination of a set of candidate values and pick the best one)
2. Parameter search with Bayesian optimization
The search space:
all_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'min_child_weight': [3, 5, 10],
    'n_estimators': [10000],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'colsample_bylevel': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1],
    'random_state': [0],
    'n_jobs': [1],
}
Parameter search with grid search
If you give the dictionary above to [sklearn.model_selection.ParameterGrid](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html), it generates every combination of the parameters.
Here is a mini-sized example for verification (the dictionary above yields 3 × 2 × 3 × 1 × 3 × 3 × 2 × 1 × 1 = 324 combinations).
from sklearn.model_selection import ParameterGrid
all_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'min_child_weight': [3, 5, 10],
    'n_estimators': [10000],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'colsample_bylevel': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1],
    'random_state': [0],
    'n_jobs': [1],
}

for params in ParameterGrid(all_params):
    print(params)
    break  # If you comment out this line, you can see that all the parameter combinations are printed.
Result
{'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 10000, 'n_jobs': 1, 'random_state': 0, 'reg_alpha': 0}
Let's try it with the actual code.
%matplotlib inline
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pylab as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid

# Since there are many combinations, I added a tool to visualize the progress.
from tqdm import tqdm_notebook as tqdm

warnings.filterwarnings('ignore')

# all_params is declared as a global variable
all_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'min_child_weight': [3, 5, 10],
    'n_estimators': [10000],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'colsample_bylevel': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1],
    'random_state': [0],
    'n_jobs': [1],
}


def validate(train_x, train_y, params):
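    # Evaluates one hyperparameter setting with 3-fold stratified cross-validation
    # and returns the per-fold accuracies and feature importances.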
    accuracies = []
    feature_importances = []

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    for train_idx, test_idx in cv.split(train_x, train_y):
        trn_x = train_x.iloc[train_idx, :]
        val_x = train_x.iloc[test_idx, :]

        trn_y = train_y.iloc[train_idx]
        val_y = train_y.iloc[test_idx]

        clf = xgb.XGBClassifier(**params)
        clf.fit(trn_x, trn_y)

        pred_y = clf.predict(val_x)
        feature_importances.append(clf.feature_importances_)
        accuracies.append(accuracy_score(val_y, pred_y))
    print(np.mean(accuracies))
    return accuracies, feature_importances


def plot_feature_importances(feature_importances, cols):
    df_fimp = pd.DataFrame(feature_importances, columns=cols)
    df_fimp.plot(kind="box", rot=90)


def preprocess_df(df):
    # Cabin is dropped below, so the code that filled it was removed
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode())
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # Drop unused columns
    df.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)

    # Encode Sex as 0/1 and one-hot encode Embarked
    df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
    df = pd.get_dummies(df)

    return df


# Predict on the test data
def predict_df(train_x, train_y, test_x, df_test_raw, path_output="result.csv"):
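    # These parameter values are hard-coded; presumably they come from an
    # earlier hyperparameter search run (note the non-round sampled values).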
    params = {'learning_rate': 0.008306052798923729, 'max_depth': 7, 'min_child_weight': 3, 'colsample_bytree': 0.8210307463506532, 'colsample_bylevel': 0.8061816543590015}
    clf = xgb.XGBClassifier(**params)
    clf.fit(train_x, train_y)
    preds = clf.predict(test_x)

    _df = pd.DataFrame()
    _df["PassengerId"] = df_test_raw["PassengerId"]
    _df["Survived"] = preds
    _df.to_csv(path_output, index=False)


def main():
    df_train = pd.read_csv("train.csv")

    # Preprocessing
    train_y = df_train["Survived"]
    train_x = df_train.drop("Survived", axis=1)

    train_x = preprocess_df(train_x)
    accuracies, feature_importances = validate(train_x, train_y, {})
    print(np.mean(accuracies))
    plot_feature_importances(feature_importances, train_x.columns)

    flag_product = True
    if flag_product:
        df_test = pd.read_csv("test.csv")
        df_test_raw = df_test.copy()
        test_x = preprocess_df(df_test)
        predict_df(train_x, train_y, test_x, df_test_raw, "result.csv")

# Since the main function is being rewritten, this is defined as a separate function
def main_parametersearch():
    df_train = pd.read_csv("train.csv")
    train_y = df_train["Survived"]
    train_x = df_train.drop("Survived", axis=1)
    train_x = preprocess_df(train_x)

    # Up to this point it is the same as main
    # Wrapping the loop in tqdm visualizes the progress.
    best_score = 0
    best_params = {}
    for params in tqdm(ParameterGrid(all_params)):
        accuracies, feature_importances = validate(train_x, train_y, params)

        # If the mean accuracy is the best so far,
        # update best_score and best_params.
        if np.mean(accuracies) > best_score:
            best_score = np.mean(accuracies)
            best_params = params
    print(best_score, best_params)

# Changed which function is called
if __name__ == '__main__':
    main_parametersearch()
Parameter search with Bayesian optimization
Bayesian optimization is an algorithm that searches for hyperparameters more efficiently (https://datachemeng.com/bayesianoptimization/).
Intuitively, it searches for the optimal parameters by combining two behaviors:
1. Searching more finely around regions where the accuracy is high
2. Occasionally changing the hyperparameters to explore regions not yet examined
This time we use Optuna (https://optuna.org/).
!pip install optuna  # command to install the library
import optuna
import numpy as np
import pandas as pd
import xgboost as xgb

from tqdm import tqdm_notebook as tqdm
from IPython.display import display
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold


# Suppress Optuna's log output
# https://optuna.readthedocs.io/en/stable/faq.html#how-to-suppress-log-messages-of-optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial):
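    # Note: train_x and train_y are globals defined in the main section below.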
    params = {
        'seed': 0,
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-2),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_child_weight': trial.suggest_int('min_child_weight', 3, 10),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.8, 1.0),
        'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.8, 1.0),
    }

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    accuracies = []
    for train_idx, test_idx in cv.split(train_x, train_y):
        trn_x = train_x.iloc[train_idx, :]
        val_x = train_x.iloc[test_idx, :]

        trn_y = train_y.iloc[train_idx]
        val_y = train_y.iloc[test_idx]

        # Train and predict
        clf = xgb.XGBClassifier(**params)
        clf.fit(trn_x, trn_y)

        pred_y = clf.predict(val_x)
        accuracies.append(accuracy_score(val_y, pred_y))

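    # Optuna minimizes the objective by default, so return the error rate.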
    return 1.0 - np.mean(accuracies)


def preprocess_df(df):
    # Cabin is dropped below, so the code that filled it was removed
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode())
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # Drop unused columns
    df.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)

    # Encode Sex as 0/1 and one-hot encode Embarked
    df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
    df = pd.get_dummies(df)

    return df


# main
df_train = pd.read_csv("train.csv")
train_y = df_train["Survived"]
train_x = df_train.drop("Survived", axis=1)
train_x = preprocess_df(train_x)

# Fix the random seed
# Not actually needed, but included here because this is a tutorial.
# https://optuna.readthedocs.io/en/stable/faq.html#how-can-i-obtain-reproducible-optimization-results
sampler = optuna.samplers.TPESampler(seed=100) # Make the sampler behave in a deterministic way.
study = optuna.create_study(sampler=sampler)
study.optimize(objective, n_trials=100, n_jobs=1)
print(study.best_trial.value)
print(study.best_trial.params)
Basically, you can use Optuna by:
1. Writing a function that returns the score (the objective function)
2. Handing the function to Optuna (study.optimize)
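As a minimal sketch of this pattern (a toy example that is not part of the original article), here is Optuna minimizing (x - 2)^2:
import optuna

def toy_objective(trial):
    # 1. A function that returns the value to be minimized
    x = trial.suggest_uniform('x', -10, 10)
    return (x - 2) ** 2

# 2. Hand the function to Optuna
toy_study = optuna.create_study()
toy_study.optimize(toy_objective, n_trials=50)
print(toy_study.best_params)  # should be close to {'x': 2}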
Let's also predict on the actual test.csv.
params = study.best_trial.params

def main():
    df_train = pd.read_csv("train.csv")

    train_y = df_train["Survived"]
    train_x = df_train.drop("Survived", axis=1)

    train_x = preprocess_df(train_x)
    accuracies, feature_importances = validate(train_x, train_y, params)  # changed to pass params
    print(np.mean(accuracies))
    plot_feature_importances(feature_importances, train_x.columns)

    flag_product = True
    if flag_product:
        df_test = pd.read_csv("test.csv")
        df_test_raw = df_test.copy()
        test_x = preprocess_df(df_test)
        predict_df(train_x, train_y, test_x, df_test_raw, "result.csv")

if __name__ == "__main__":
    main()
Result
0.8249158249158249

Improving accuracy with ensembling
In machine learning, the technique of combining multiple models to improve accuracy, rather than using a single model directly, is called ensemble learning.
Here we check whether combining LightGBM and XGBoost improves the accuracy.
# This time everything except computing the cross-validation accuracy has been trimmed.
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings('ignore')

# main
df_train = pd.read_csv("train.csv")
train_y = df_train["Survived"]
train_x = df_train.drop("Survived", axis=1)

train_x = preprocess_df(train_x)

accuracies = []

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
for train_idx, test_idx in cv.split(train_x, train_y):
    trn_x = train_x.iloc[train_idx, :]
    val_x = train_x.iloc[test_idx, :]

    trn_y = train_y.iloc[train_idx]
    val_y = train_y.iloc[test_idx]

    clf_xgb = xgb.XGBClassifier(**params)
    clf_lgb = lgb.LGBMClassifier(**params)

    clf_xgb.fit(trn_x, trn_y)
    clf_lgb.fit(trn_x, trn_y)

    # Output probabilities so they can be averaged
    pred_proba_y_xgb = clf_xgb.predict_proba(val_x)[:, 1]
    pred_proba_y_lgb = clf_lgb.predict_proba(val_x)[:, 1]

    # 1 if the mean probability exceeds 0.50, otherwise 0
    pred_proba_y = pd.DataFrame({"xgb": pred_proba_y_xgb, "lgb": pred_proba_y_lgb}).mean(axis=1)
    pred_y = [1 if proba > 0.50 else 0 for proba in pred_proba_y]
    accuracies.append(accuracy_score(val_y, pred_y))

print(np.mean(accuracies))
Result
0.8215488215488215
This time the accuracy went down.
Here we simply took the mean, but in practice there are various approaches, such as stacking, where the values output by multiple models are fed into another machine learning model as its input; a sketch of that idea follows below.
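For example, here is a minimal stacking sketch (illustrative only, not from the original article) using scikit-learn's StackingClassifier (available in scikit-learn 0.22+): the out-of-fold predictions of XGBoost and LightGBM become the input features of a logistic-regression meta-model. It reuses train_x and train_y from the script above.
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# The base models' out-of-fold predictions are fed to the meta-model.
stack = StackingClassifier(
    estimators=[('xgb', xgb.XGBClassifier()), ('lgb', lgb.LGBMClassifier())],
    final_estimator=LogisticRegression(),
    cv=3,
)
print(cross_val_score(stack, train_x, train_y, cv=3).mean())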
