SW과정 머신러닝 1022(13)

SW과정 머신러닝 1022(13)

1. Titanic 분석

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic training data and do a first round of cleaning / EDA.
titanic_df = pd.read_csv('titanic_train.csv')
titanic_df.head(2)

titanic_df.info()

# Fill missing values. Column assignment is used instead of
# Series.fillna(..., inplace=True): the inplace form on a selected column
# is the deprecated chained-assignment pattern in pandas and will stop
# working in pandas 3.0.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')  # plain 'N' placeholder string

titanic_df.isnull().sum()        # per-column NaN counts
titanic_df.isnull().sum().sum()  # total NaN count (should be 0 after the fills)

titanic_df['Sex'].value_counts()

titanic_df['Cabin'].value_counts()

titanic_df['Embarked'].value_counts()

# Keep only the deck letter (first character) of the cabin code.
titanic_df['Cabin'] = titanic_df['Cabin'].str[:1]

titanic_df.groupby(['Sex','Survived'])['Survived'].count()

sns.barplot(x='Sex', y='Survived', data=titanic_df)

plt.figure(figsize=(10,6))
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=titanic_df)

titanic_df.columns

def get_category(age):
    """Map a numeric age to a named age-bracket label.

    Negative ages (the '-1 means unknown' convention) and missing (NaN)
    ages both map to 'Unknown'.
    """
    # NaN compares unequal to itself; without this guard a missing age
    # would fall through every <= test below and be mislabeled 'Elderly'.
    if age != age or age <= -1:
        return 'Unknown'
    if age <= 5:
        return 'Baby'
    if age <= 12:
        return 'Child'
    if age <= 18:
        return 'Teenager'
    if age <= 25:
        return 'Student'
    if age <= 35:
        return 'Young Adult'
    if age <= 60:
        return 'Adult'
    return 'Elderly'
    
# Bucket each passenger's age into a named category, then plot survival
# rate per bucket, split by sex.
titanic_df['Age_cat'] = titanic_df['Age'].apply(get_category)
titanic_df.head(1)

plt.figure(figsize=(10,5))
group_names=['Unknown','Baby','Child','Teenager','Student','Young Adult', 'Adult', 'Elderly']
# order= pins the x-axis to the logical age progression instead of data order.
sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=titanic_df, order=group_names)

from sklearn.preprocessing import LabelEncoder


def encode_features(dataDF):
    """Integer-encode the categorical columns in place and return the frame."""
    for col in ('Sex', 'Cabin', 'Embarked'):
        # fit_transform == fit followed by transform on the same column.
        dataDF[col] = LabelEncoder().fit_transform(dataDF[col])
    return dataDF
    
titanic_df = encode_features(titanic_df)
titanic_df.head(1)

def fillna(df):
    """Fill the missing values the Titanic pipeline expects.

    Age -> column mean, Cabin/Embarked -> 'N' placeholder, Fare -> 0.
    Mutates df in place and returns it.
    """
    # Column assignment instead of Series.fillna(..., inplace=True): the
    # inplace form on a selected column is deprecated chained assignment
    # in pandas and will stop working in pandas 3.0.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df
    
def drop_features(df):
    """Drop identifier-like columns that carry no predictive signal."""
    df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)
    return df


def format_features(df):
    """Truncate Cabin to its deck letter, then label-encode the categoricals."""
    df['Cabin'] = df['Cabin'].str[:1]
    for column in ('Sex', 'Cabin', 'Embarked'):
        df[column] = LabelEncoder().fit_transform(df[column])
    return df
    
def tranform_features(df):
    """Run the full preprocessing pipeline: fill NaNs, drop ids, encode.

    NOTE(review): the name keeps the original 'tranform' spelling because
    later cells call it by this exact name.
    """
    return format_features(drop_features(fillna(df)))
    
# Reload the raw data and build the feature matrix / target vector.
titanic_df = pd.read_csv('titanic_train.csv')

y_titanic_df = titanic_df['Survived']
# Bug fix: the original assigned lowercase x_titanic_df here but passed
# X_titanic_df to train_test_split below, raising NameError.
X_titanic_df = titanic_df.drop('Survived', axis=1)
X_titanic_df = tranform_features(X_titanic_df)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=121)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit and score three classifiers on the same split.
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression()

dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print('dt 정확도:', accuracy_score(y_test, dt_pred))

# Bug fix: predictions must come from the model that was just fitted.
# The original called dt_clf.predict for both rf and lr, so their reported
# accuracies were really the decision tree's.
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print('rf 정확도:', accuracy_score(y_test, rf_pred))

lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print('lr 정확도:', accuracy_score(y_test, lr_pred))

from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5, X=None, y=None):
    """Run manual K-fold cross-validation and print per-fold accuracy.

    Args:
        clf: sklearn-style classifier with fit/predict.
        folds: number of KFold splits.
        X, y: feature DataFrame / target Series. Default to the
            module-level X_titanic_df / y_titanic_df so existing
            exec_kfold(clf) calls keep working.

    Returns:
        Mean accuracy across the folds.
    """
    # Fall back to the notebook's global data for backward compatibility.
    if X is None:
        X = X_titanic_df
    if y is None:
        y = y_titanic_df
    # NOTE(review): shuffle=True without random_state makes results vary
    # between runs; pass a seeded KFold if reproducibility matters.
    kfold = KFold(n_splits=folds, shuffle=True)
    scores = []
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X)):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print(f'교차검증 {iter_count} 정확도:{accuracy:.4f}')
    mean_score = np.mean(scores)
    print(f'평균정확도 : {mean_score:.4f}')
    return mean_score
    
# Manual K-fold cross-validation for each of the three classifiers.
exec_kfold(dt_clf)
exec_kfold(rf_clf)
exec_kfold(lr_clf)

from sklearn.model_selection import cross_val_score

# cross_val_score runs the fold loop for us; cv = number of folds (default 5).
scores = cross_val_score(dt_clf,X_titanic_df,y_titanic_df,cv=5)
print(scores)
print(np.mean(scores))

scores = cross_val_score(rf_clf,X_titanic_df,y_titanic_df,cv=5)
print(scores)
print(np.mean(scores))

scores = cross_val_score(lr_clf,X_titanic_df,y_titanic_df,cv=5)
print(scores)
print(np.mean(scores))

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree.
param_grid = {
    'max_depth': [2, 3, 4, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# Exhaustive search over the grid with 5-fold CV, scored on accuracy.
grid_dclf = GridSearchCV(dt_clf, param_grid=param_grid, scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)
print(grid_dclf.best_params_)
print(grid_dclf.best_score_)

# Evaluate the best estimator on the held-out test split.
best_dclf = grid_dclf.best_estimator_
pred = best_dclf.predict(X_test)
accuracy_score(y_test, pred)

from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
    """Baseline classifier: predict death for males, survival for females."""

    def fit(self, X, y=None):
        # Nothing to learn.
        pass

    def predict(self, X):
        # Sex is label-encoded with 1 == male. Vectorized equivalent of a
        # row-by-row loop: male -> 0.0 (died), female -> 1.0 (survived),
        # returned as a float (n_samples, 1) column vector.
        return np.where(X['Sex'].to_numpy() == 1, 0.0, 1.0).reshape(-1, 1)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

def fillna(df):
    """Fill missing Titanic values: Age -> mean, Cabin/Embarked -> 'N', Fare -> 0.

    Mutates df in place and returns it.
    """
    # Assignment form rather than Series.fillna(..., inplace=True): the
    # inplace call on a selected column is deprecated chained assignment
    # in pandas and will stop working in pandas 3.0.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

def drop_features(df):
    """Remove the pure-identifier columns (no predictive value)."""
    df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)
    return df

def format_features(df):
    """Reduce Cabin to its deck letter and integer-encode the categoricals."""
    df['Cabin'] = df['Cabin'].str[:1]
    for name in ['Sex', 'Cabin', 'Embarked']:
        encoder = LabelEncoder()
        encoder.fit(df[name])
        df[name] = encoder.transform(df[name])
    return df

def tranform_features(df):
    """Apply fill -> drop -> encode preprocessing and return the frame.

    NOTE(review): keeps the original 'tranform' spelling; later cells
    call it by this name.
    """
    for step in (fillna, drop_features, format_features):
        df = step(df)
    return df

# Reload raw data, rebuild features, and score the sex-rule baseline.
titanic_df = pd.read_csv('titanic_train.csv')

y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived',axis=1)
X_titanic_df = tranform_features(X_titanic_df)
X_train,X_test,y_train,y_test = train_test_split(X_titanic_df,y_titanic_df,test_size=0.2,random_state=0)

# Even this trivial rule scores fairly high on accuracy, illustrating why
# accuracy alone can be a misleading metric.
myclf = MyDummyClassifier()
myclf.fit(X_train,y_train)
pred = myclf.predict(X_test)
accuracy_score(y_test,pred)

from sklearn.datasets import load_digits

class MyFakeClassifier(BaseEstimator):
    """Degenerate classifier that predicts the negative class for every row."""

    def fit(self, X, y):
        # No training needed.
        pass

    def predict(self, X):
        # Always False, as a boolean (n_samples, 1) column vector.
        return np.full((len(X), 1), False)

digits = load_digits()

digits

# Binary target: 1 if the digit is 7, else 0 -- a heavily imbalanced problem.
y = (digits.target==7).astype(int)

y

X_train,X_test,y_train,y_test=train_test_split(digits.data,y,random_state=11)

y_test.shape

# Class balance of the test split: mostly zeros (non-7s).
pd.Series(y_test).value_counts()

# The always-False classifier still scores high accuracy on this
# imbalanced data -- the motivating example for precision/recall.
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train,y_train)
pred = fakeclf.predict(X_test)
accuracy_score(y_test,pred)

from sklearn.metrics import confusion_matrix # confusion matrix

confusion_matrix(y_test,pred)

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

def get_clf_eval(y_test, pred):
    """Print the confusion matrix plus accuracy / precision / recall."""
    print('오차행렬')
    print(confusion_matrix(y_test, pred))
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print(f'정확도:{accuracy:.4f} 정밀도:{precision:.4f} 재현율:{recall:.4f}')

# Rebuild the Titanic features and evaluate logistic regression with the
# full metric report (confusion matrix / accuracy / precision / recall).
titanic_df = pd.read_csv('titanic_train.csv')

y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived',axis=1)
X_titanic_df = tranform_features(X_titanic_df)
X_train,X_test,y_train,y_test = train_test_split(X_titanic_df,y_titanic_df,test_size=0.2,random_state=0)

from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred=lr_clf.predict(X_test)
get_clf_eval(y_test,pred)

# Compare against the always-negative baseline with the same report.
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train,y_train)
pred = fakeclf.predict(X_test)
get_clf_eval(y_test,pred)

pred

# Per-class probabilities: column 0 = P(died), column 1 = P(survived).
pred_proba = lr_clf.predict_proba(X_test)
pred_proba

pred = lr_clf.predict(X_test)

pred_proba.shape

pred.shape

pred_proba[:3]

pred_proba_result = np.concatenate([pred_proba,pred.reshape(-1,1)],axis=1) # reshape(-1,1): one column, row count inferred automatically

pred_proba_result[:5]

from sklearn.preprocessing import Binarizer

# Binarizer demo: values > threshold become 1, values <= threshold become 0.
X = [[1,-1,2],
    [2,0,0],
    [0,1.1,1.2]]

binarizer = Binarizer(threshold=1.1)
binarizer.fit_transform(X)

# Reproduce predict() by thresholding the positive-class probability at 0.5.
custom_threshold = 0.5
pred_proba_1 = pred_proba[:,1].reshape(-1,1) # keep one column; rows inferred
custom_predict = Binarizer(threshold=custom_threshold).fit_transform(pred_proba_1)

get_clf_eval(y_test,custom_predict)

# Same thresholding repeated (duplicate notebook cell).
custom_threshold = 0.5
pred_proba_1 = pred_proba[:,1].reshape(-1,1) # keep one column; rows inferred
custom_predict = Binarizer(threshold=custom_threshold).fit_transform(pred_proba_1)
get_clf_eval(y_test,custom_predict)

2. 손글씨.ipynb

from sklearn.datasets import fetch_openml

# Download MNIST (70,000 28x28 handwritten-digit images, flattened to 784
# features) from OpenML.
# NOTE(review): slow on first run and requires network access.
mnist = fetch_openml('mnist_784')

type(mnist)

mnist

mnist.data.shape

mnist.target.shape

type(mnist.data)

type(mnist.target)

mnist 데이터 분리하세요 (학습용 90%,검증용 10%)

from sklearn.model_selection import train_test_split
# 90% train / 10% test split (no random_state, so the split varies per run).
X_train,X_test,y_train,y_test = train_test_split(mnist.data,mnist.target,test_size=0.1)

y_train 의 값의 빈도를 출력(value_counts())

type(y_train)

# Frequency of each digit label in the training target.
y_train.value_counts()

RandomForest를 이용하여 학습 예측 평가하시오

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a random forest on the raw pixel values and report test accuracy.
clf = RandomForestClassifier()
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
accuracy_score(y_test,pred)

X_train

import numpy as np
import matplotlib.pyplot as plt

# Pick 10 random test-set row indices to visualize.
n_test = len(X_test)
random_pick = np.random.randint(0,n_test,size=10)
random_pick

X_test.iloc[191]

# Lay out a 2x5 grid of axes and draw each sampled digit as a 28x28 image.
figure=plt.figure(figsize=(12,5))
#figure.set_size_inches(12,5)
axes=[]
for i in range(1,11):
    axes.append(figure.add_subplot(2,5,i))
#axes
tmp_list=[]
for i in range(10):
    tmp = X_test.iloc[random_pick[i]]
    #print(tmp)
    tmp = np.array(tmp)
    tmp = tmp.reshape(28,28)  # flat 784-vector back to a 28x28 image
    tmp_list.append(tmp)

# True labels of the sampled digits, then their images.
print(y_test.iloc[random_pick])
for i in range(10):
    axes[i].matshow(tmp_list[i])

X_test.iloc[random_pick[i]].to_list()

import glob
from PIL import Image

# Predict the digit in each local PNG with the trained random forest.
for image_path in glob.glob('./data/*.png'):
    img = Image.open(image_path).convert('L')  # grayscale
    plt.imshow(img)
    # Bug fix: np.resize does NOT rescale an image -- it repeats or
    # truncates the raw pixel buffer. Resize with PIL so any input image
    # becomes a true 28x28 digit, then flatten to the (1, 784) shape the
    # model was trained on.
    img = img.resize((28, 28))
    # Invert: MNIST digits are white-on-black, scans are usually black-on-white.
    arr = 255 - np.array(img).reshape(1, 784)
    pred = clf.predict(arr)
    print(pred)
    plt.show()

좋은 웹페이지 즐겨찾기