[Data Science Through Projects] Finding Models and Parameters

💎 Finding Models and Parameters

🔼 Finding the Optimal Model and Parameters

# Split into train/test sets with scikit-learn
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
        'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
        'Insulin_nan']]
y = df['Outcome']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# Find the best max_depth value
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

for max_depth in range(3, 12):
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    y_predict = model.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, y_predict) * 100
    print(max_depth, score)
    
# Accuracy turns out to be highest when max_depth = 4

  • cross validation: split the train set into several folds, train and evaluate on each fold in turn, and average the scores (a quick sketch follows below)
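
A minimal sketch of that idea with scikit-learn's cross_val_score; the 5-fold count here is an assumption, matching the cv=5 used later:

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Score each max_depth by 5-fold cross validation on the train set only,
# so the test set stays untouched until the final evaluation
for max_depth in range(3, 12):
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(max_depth, scores.mean())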

🔼 Finding the Best Hyperparameter Values with GridSearchCV

from sklearn.model_selection import GridSearchCV

model = DecisionTreeClassifier(random_state=42)

# max_features: fraction of features to consider at each split
# (a float is a fraction; a bare int 1 would mean a single feature, hence 1.0)
# cv: number of cross-validation folds
param_grid = {"max_depth": range(3, 12), "max_features": [0.3, 0.5, 0.7, 0.9, 1.0]}
clf = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, cv=5, verbose=2)
clf.fit(X_train, y_train)

clf.best_params_
# Best combination: max_depth = 4, max_features = 0.7

clf.best_score_
# Mean cross-validation accuracy on the train set: 0.8779

clf.score(X_test, y_test)
# Accuracy on the held-out test set: 0.8831
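
GridSearchCV also records every combination it tried in cv_results_; a small sketch of inspecting it as a DataFrame (assuming pandas is already imported as pd, as the rest of the notebook implies):

# All tried parameter combinations, best-ranked first
pd.DataFrame(clf.cv_results_)[
    ["params", "mean_test_score", "rank_test_score"]
].sort_values("rank_test_score").head()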

🔼 Finding the Best Hyperparameter Values with RandomizedSearchCV

import numpy as np

# Sample candidate values for each hyperparameter
max_depth = np.random.randint(3, 20, 10)
max_features = np.random.uniform(0.7, 1.0, 100)

param_distributions = {"max_depth": max_depth,
                       "max_features": max_features,
                       "min_samples_split": list(range(2, 7))}
           
from sklearn.model_selection import RandomizedSearchCV

clf = RandomizedSearchCV(model, param_distributions,
    n_iter=1000, scoring="accuracy",
    n_jobs=-1, cv=5, random_state=42)
clf.fit(X_train, y_train)

clf.best_params_
# Best combination: max_depth = 4, max_features = 0.749

clf.best_score_
# Mean cross-validation accuracy on the train set: 0.8779

clf.score(X_test, y_test)
# Accuracy on the held-out test set: 0.8831
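
The fitted search object refits the best parameters on the whole train set and exposes the result as best_estimator_; a sketch of pulling feature importances out of it (this display step is an illustration, not from the original):

best_model = clf.best_estimator_

# Which features the refit tree relies on most
pd.Series(best_model.feature_importances_,
          index=X.columns).sort_values(ascending=False)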

🔼 Comparing DecisionTree, RandomForest, and GradientBoosting Models

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42),
              RandomForestClassifier(random_state=42),
              GradientBoostingClassifier(random_state=42)]

# Preliminary pass: collect each estimator's class name
results = []
for estimator in estimators:
    result = []
    result.append(estimator.__class__.__name__)
    results.append(result)

from sklearn.model_selection import RandomizedSearchCV

# Candidate values shared by all three models
max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {"max_depth": max_depth,
                       "max_features": max_features}

results = []
for estimator in estimators:
    result = []
    # The ensemble models also take n_estimators; DecisionTree does not accept it.
    # DecisionTree runs first, so mutating the shared dict here is safe.
    if estimator.__class__.__name__ != 'DecisionTreeClassifier':
        param_distributions["n_estimators"] = np.random.randint(100, 200, 10)
        
    clf = RandomizedSearchCV(estimator, param_distributions,
                             n_iter=100, scoring="accuracy",
                             n_jobs=-1, cv=5, verbose=2)

    clf.fit(X_train, y_train)
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test, y_test))
    result.append(clf.cv_results_)
    results.append(result)
    
# Summarize the comparison (note: this reuses the name df that held the dataset)
df = pd.DataFrame(results,
                  columns=["estimator", "best_params",
                           "train_score", "test_score", "cv_result"])
             
df
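
To rank the three models at a glance, the summary frame can be sorted by test accuracy (a small convenience step, not in the original):

# Drop the bulky cv_result column and sort by test accuracy
df.drop(columns="cv_result").sort_values("test_score", ascending=False)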
