sklearn의 모델 선택과 평가: 교차 검증

import numpy as np

교차 검증: 추정기(estimator)의 성능을 평가한다
데이터를 훈련 집합과 테스트 집합으로 나누기

# Toy data set: five samples with two features each, and one label per sample.
X = np.arange(10).reshape(5, 2)
y = range(5)

print(X)
print(y)

[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
range(0, 5)

from sklearn.model_selection import train_test_split
# Hold out test_size=0.33 of the samples for testing; the fixed seed keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

print(X_train)
print(y_train)

[[4 5]
 [0 1]
 [6 7]]
[2, 0, 3]

# Show the held-out features and labels, separated by a newline.
# The separator must be the escape sequence "\n": a raw line break inside a
# single-quoted string literal is a SyntaxError.
print(X_test, "\n", y_test)

[[2 3]
 [8 9]] 
 [1, 4]

교차 검증 지표 계산
cross_val_score

from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score

# Use only the first 150 samples of the diabetes data set to keep the example small.
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]

lasso = linear_model.Lasso()
# Pin the number of folds: the result listed below has three scores, i.e. it
# was produced with 3 folds.  The library default for `cv` changed from 3 to 5
# folds in scikit-learn 0.22, so relying on the default would no longer
# reproduce that listing.
cross_val_score(lasso, X, y, cv=3)

array([ 0.33150734,  0.08022311,  0.03531764])

cross_validate 함수와 다중 지표 평가

여러 개의 지표를 지정하여 평가할 수 있다.

테스트 점수 외에도 학습 시간(fit_time)과 평가 시간(score_time), 그리고 (선택적으로) 훈련 점수를 포함하는 딕셔너리를 반환한다.
교차 검증을 통해 예측 얻기

from sklearn.model_selection import cross_val_predict
# Out-of-fold prediction for every sample: each value comes from a model that
# did not see that sample during fitting.
# `cv` is pinned so the result does not depend on the library default, which
# differs between scikit-learn releases (3 folds before 0.22, 5 folds since).
# NOTE(review): assumes the listing below was produced with the then-default
# 3 folds -- confirm against the scikit-learn version originally used.
y_pred = cross_val_predict(lasso, X, y, cv=3)
y_pred

array([ 174.26933996,  117.6539241 ,  164.60228641,  155.65049088,
        132.68647979,  128.49511245,  120.76146877,  141.069413  ,
        164.18904498,  182.37394949,  111.04181265,  127.94311443,
        135.0869234 ,  162.83066014,  135.3573514 ,  157.64516523,
        178.95843326,  163.3919841 ,  143.85237903,  144.29748882,
        133.58117218,  124.77928571,  132.90918003,  208.52927   ,
        153.61908967,  154.16616341,  118.95351821,  163.50467541,
        145.89406196,  168.3308101 ,  155.87411031,  123.45960148,
        185.70459144,  133.38468582,  117.2789469 ,  150.27895019,
        174.1541028 ,  160.03235091,  192.31389633,  161.58568256,
        154.2224809 ,  119.35517679,  146.15706413,  133.82056934,
        179.68118754,  137.96619936,  146.07788398,  126.77579723,
        123.32101099,  166.26710247,  146.41559964,  161.67261029,
        147.47731459,  138.44595305,  144.85421048,  113.77990664,
        185.54970402,  115.31624749,  142.23672103,  171.07792136,
        132.5394716 ,  177.80524864,  116.5616502 ,  134.25230846,
        142.88707475,  173.2830912 ,  154.31273504,  149.16680759,
        144.88238997,  121.97783103,  110.38457621,  180.25559631,
        199.06141058,  151.1195546 ,  161.14217698,  153.96960812,
        150.77179755,  113.30903579,  165.15755771,  115.85735727,
        174.19267171,  150.12027233,  115.47891783,  153.38967232,
        115.31573467,  156.49909623,   92.62211515,  178.15649994,
        131.59320715,  134.46166754,  116.97678633,  190.00790119,
        166.01173292,  126.25944471,  134.29256991,  144.71971963,
        190.9769591 ,  182.39199466,  154.45325308,  148.30325558,
        151.72036937,  124.12825466,  138.6011155 ,  137.75891286,
        123.0917243 ,  131.74735403,  112.07367481,  124.56956904,
        156.78432061,  128.63135591,   93.68260079,  130.54324394,
        131.8693231 ,  154.5708257 ,  179.81343019,  165.78130755,
        150.04779033,  162.37974736,  143.92996797,  143.15645843,
        125.20161377,  145.99590279,  155.3505536 ,  145.97574185,
        134.66120515,  163.92450638,  101.92329396,  139.33014324,
        122.71377023,  152.20573113,  153.36931089,  116.76545147,
        131.96936127,  109.74817383,  132.57453994,  159.38030328,
        109.31343881,  147.69926269,  156.3664255 ,  161.12509958,
        128.16523686,  156.78446286,  154.04375702,  124.83705022,
        143.85606595,  143.23651701,  147.76316913,  154.21572891,
        129.07895017,  157.79644923])

교차 검증 반복자
교차 검증 반복자 - 데이터 순회
K-폴드 (KFold)

from sklearn.model_selection import KFold
# Four samples with two features each, plus one target value per sample.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

# Split the samples into two folds.
kf = KFold(n_splits=2)
kf.get_n_splits(X)

# Printing the splitter shows its configuration.
print(kf)

KFold(n_splits=2, random_state=None, shuffle=False)

# Walk over every train/test split and show the selected rows and targets.
# The "\n" separators must be written as escape sequences: a raw line break
# inside a single-quoted string literal is a SyntaxError.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Training portion: feature rows, then the matching targets.
    print("   ：", X[train_index], "\n", y[train_index])
    # Test portion: feature rows, then the matching targets.
    print("   ：", X[test_index], "\n", y[test_index])

TRAIN: [2 3] TEST: [0 1]
   ： [[1 2]
 [3 4]] 
 [3 4]
   ： [[1 2]
 [3 4]] 
 [1 2]
TRAIN: [0 1] TEST: [2 3]
   ： [[1 2]
 [3 4]] 
 [1 2]
   ： [[1 2]
 [3 4]] 
 [3 4]

반복 K-폴드 (RepeatedKFold)

from sklearn.model_selection import RepeatedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])

# Two folds, repeated twice; the seed fixes the splits that are drawn.
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)

for train_index, test_index in rkf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Materialise the actual train/test arrays for this split.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

TRAIN: [0 1] TEST: [2 3]
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]

Leave-One-Out 교차 검증 (LOO)

from sklearn.model_selection import LeaveOneOut
# Minimal example with two samples.
X = np.array([[1, 2], [3, 4]])
y = np.array([1, 2])

loo = LeaveOneOut()
loo.get_n_splits(X)

# Bare expression: shows the splitter's repr when run interactively.
loo

LeaveOneOut()

# Each split's index arrays are printed, then the arrays they select.
for train_index, test_index in loo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [1] TEST: [0]
[[3 4]] [[1 2]] [2] [1]
TRAIN: [0] TEST: [1]
[[1 2]] [[3 4]] [1] [2]

Leave-P-Out 교차 검증 (LPO)

from sklearn.model_selection import LeavePOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])

# Leave two samples out per split.
lpo = LeavePOut(2)
lpo.get_n_splits(X)
print(lpo)

for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

LeavePOut(p=2)
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 1] TEST: [2 3]

무작위 순열 교차 검증 (ShuffleSplit)

from sklearn.model_selection import ShuffleSplit
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])

# Three random splits; a quarter of the samples goes to the test side.
rs = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
rs.get_n_splits(X)

for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [3 1 0] TEST: [2]
TRAIN: [2 1 3] TEST: [0]
TRAIN: [0 2 1] TEST: [3]

# Same seed and test fraction as before, but the training side is now
# limited to half of the samples via train_size.
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=0.25, random_state=0)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [3 1] TEST: [2]
TRAIN: [2 1] TEST: [0]
TRAIN: [0 2] TEST: [3]

클래스 레이블 기반 층화(stratified) 교차 검증 반복자
층화 K-폴드 (StratifiedKFold)
각 폴드에서 클래스별 샘플 비율은 전체 데이터 집합과 대체로 같게 유지된다.

from sklearn.model_selection import StratifiedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])  # two samples per class

# The labels are passed to split() so the folds can be stratified on them.
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)

for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]

층화 랜덤 분할 (StratifiedShuffleSplit)
무작위 분할을 만들되, 각 분할의 클래스 비율은 전체 데이터 집합과 같게 유지한다.

from sklearn.model_selection import StratifiedShuffleSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])  # two samples per class

# Three random splits with half of the samples on the test side; the seed
# makes the drawn splits repeatable.
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
sss.get_n_splits(X, y)

for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

TRAIN: [1 2] TEST: [3 0]
TRAIN: [0 2] TEST: [1 3]
TRAIN: [0 2] TEST: [3 1]

그룹 데이터에 사용되는 교차 검증 반복자
그룹 K-폴드 (GroupKFold)

from sklearn.model_selection import GroupKFold
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
# One group id per sample: the first two samples share a group, as do the last two.
groups = np.array([0, 0, 2, 2])

group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(X, y, groups)

for train_index, test_index in group_kfold.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [0 1] TEST: [2 3]
[[1 2]
 [3 4]] [[5 6]
 [7 8]] [1 2] [3 4]
TRAIN: [2 3] TEST: [0 1]
[[5 6]
 [7 8]] [[1 2]
 [3 4]] [3 4] [1 2]

Leave-One-Group-Out 교차 검증

from sklearn.model_selection import LeaveOneGroupOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
# One group id per sample: two groups of two samples each.
groups = np.array([1, 1, 2, 2])

logo = LeaveOneGroupOut()
logo.get_n_splits(X, y, groups)

# The split count can also be queried from the group labels alone.
logo.get_n_splits(groups=groups)

for train_index, test_index in logo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [2 3] TEST: [0 1]
[[5 6]
 [7 8]] [[1 2]
 [3 4]] [1 2] [1 2]
TRAIN: [0 1] TEST: [2 3]
[[1 2]
 [3 4]] [[5 6]
 [7 8]] [1 2] [1 2]

Leave-P-Groups-Out 교차 검증

from sklearn.model_selection import LeavePGroupsOut
X = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array([1, 2, 1])
# Every sample is in its own group here.
groups = np.array([1, 2, 3])

# Hold out two groups per split.
lpgo = LeavePGroupsOut(n_groups=2)
lpgo.get_n_splits(X, y, groups)

# The split count can also be queried from the group labels alone.
lpgo.get_n_splits(groups=groups)

for train_index, test_index in lpgo.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train, X_test, y_train, y_test)

TRAIN: [2] TEST: [0 1]
[[5 6]] [[1 2]
 [3 4]] [1] [1 2]
TRAIN: [1] TEST: [0 2]
[[3 4]] [[1 2]
 [5 6]] [2] [1 1]
TRAIN: [0] TEST: [1 2]
[[1 2]] [[3 4]
 [5 6]] [1] [2 1]

그룹 셔플 분할 (GroupShuffleSplit)

from sklearn.model_selection import GroupShuffleSplit

# Eight samples, given as plain Python lists, with one group id per sample.
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]

# Four random splits with test_size=0.5; the seed keeps them repeatable.
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)

for train, test in gss.split(X, y, groups=groups):
    # Print the train indices followed by the test indices.
    print("%s %s" % (train, test))

[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]

미리 정의된 폴드 분할 / 검증 세트
시계열 데이터에 대한 교차 검증
시계열 분할 (TimeSeriesSplit)

from sklearn.model_selection import TimeSeriesSplit
# Four samples, taken to be in time order.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

tscv = TimeSeriesSplit(n_splits=3)
# Printing the splitter shows its configuration.
print(tscv)

TimeSeriesSplit(max_train_size=None, n_splits=3)

# Print the index arrays of every split and build the matching arrays.
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]

이 내용에 흥미가 있습니까?

현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:

쓰쿠바 대학의 기계 학습 강좌 : 과제의 파이썬 스크립트 부분을 만들면서 sklearn 공부 (10)

지난번 이번에는 이상치가 있는 경우 Youtube의 해설은 제6회(1)55분 30초당 프로그램은 2행 더 강좌에서는 최초의 값을 바꾸고 있는 것 같지만, 하나 추가해도 거기까지 변화는 없을 것 같다. 이것은 실행 결...

텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.

CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.

sklearn의 모델 선택과 평가의 교차 검증

좋은 웹페이지 즐겨찾기