Implementing Machine-Learning Classification Algorithms in Python

Classification algorithms in Python
The wine-quality dataset is used for testing. The task is multi-class and the sample distribution across classes is heavily imbalanced, so testing directly on the raw data gives unsatisfactory results. The data is therefore de-duplicated, cleared of missing values, and then balanced with SMOTE oversampling; testing on the balanced data yields noticeably higher accuracy than the earlier tests.
For example, with the decision tree:
Before SMOTE: [figure: decision-tree results on the imbalanced data]
After SMOTE: [figure: decision-tree results on the SMOTE-balanced data]
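As a minimal sketch of the idea (assuming the imbalanced-learn package is installed and the cleaned wine data is already in a DataFrame; the file name below is only a placeholder), SMOTE synthesizes new minority-class samples until every quality level is as frequent as the majority class:

from collections import Counter
import pandas as pd
from imblearn.over_sampling import SMOTE

df = pd.read_excel("myexcel.xls")          # placeholder path to the cleaned wine data
X, y = df.iloc[:, :11], df["quality"]      # 11 physico-chemical features, quality label

print("before:", Counter(y))               # typically classes 5 and 6 dominate, 3 and 8 are rare
X_res, y_res = SMOTE(random_state=10).fit_resample(X, y)
print("after: ", Counter(y_res))           # every quality level now has the same count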

from collections import Counter
from matplotlib import colors, markers
import numpy as np
import pandas as pd
import operator
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# use the SimHei font so non-ASCII labels in figures render correctly
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']


path = "C:\\Users\\zt\\Desktop\\winequality\\myexcel.xls"
# path = r"C:\\Users\\zt\\Desktop\\winequality\\winequality-red.csv"  # the raw CSV version of the dataset
# exceldata = np.loadtxt(
#     path,
#     dtype=str,
#     delimiter=";",  # fields are separated by semicolons
#     skiprows=1
# )

# print(Counter(exceldata[:,-1]))

exceldata = pd.read_excel(path)
print(exceldata)

print(exceldata[exceldata.duplicated()])
print(exceldata.duplicated().sum())

# drop duplicate rows
exceldata = exceldata.drop_duplicates()


# missing values
print(exceldata.isnull())
print(exceldata.isnull().sum())
# dropna() removes rows with missing values; masking with ~isnull() would keep the frame's shape and drop nothing
exceldata = exceldata.dropna()

print(Counter(exceldata["quality"]))

# SMOTE oversampling

# import the SMOTE class from the imblearn package
from imblearn.over_sampling import SMOTE
# instantiate SMOTE; random_state fixes the seed so the resampling is reproducible


X, y = np.split(exceldata, (11,), axis=1)
smo = SMOTE(random_state=10)

x_smo, y_smo = smo.fit_resample(X.values, y.values.ravel())




print(Counter(y_smo))



columns = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides",
           "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol"]
x_smo = pd.DataFrame(x_smo, columns=columns)
y_smo = pd.DataFrame({"quality": y_smo})
print(x_smo.shape)
print(y_smo.shape)
# recombine the resampled features and labels
exceldata = pd.concat([x_smo,y_smo],axis=1)
print(exceldata)

# re-split into X and y
X,y = np.split(exceldata,(11,),axis=1)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=10,train_size=0.7)
print("     :%d"%(X_train.shape[0]))
print("     :%d"%(X_test.shape[0]))



def func_mlp(X_train, X_test, y_train, y_test):
    print("Neural network (MLP):")
    kk = [i for i in range(200, 500, 50)]  # max_iter values to try
    t_precision = []
    t_recall = []
    t_accuracy = []
    t_f1_score = []
    for n in kk:
        method = MLPClassifier(activation="tanh", solver='lbfgs', alpha=1e-5,
                               hidden_layer_sizes=(5, 2), random_state=1, max_iter=n)
        method.fit(X_train, y_train)
        y_predict = method.predict(X_test)
        # output_dict=True returns a nested dict with per-class and averaged metrics
        t = classification_report(y_test, y_predict, target_names=['3', '4', '5', '6', '7', '8'], output_dict=True)
        print(t)
        t_accuracy.append(t["accuracy"])
        t_precision.append(t["weighted avg"]["precision"])
        t_recall.append(t["weighted avg"]["recall"])
        t_f1_score.append(t["weighted avg"]["f1-score"])
    plt.figure("MLP")
    plt.subplot(2, 2, 1)
    plt.xlabel('max_iter')
    plt.ylabel('accuracy')
    plt.title('accuracy vs. max_iter')
    plt.plot(kk, t_accuracy, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 2)
    plt.xlabel('max_iter')
    plt.ylabel('precision')
    plt.title('precision vs. max_iter')
    plt.plot(kk, t_precision, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 3)
    plt.xlabel('max_iter')
    plt.ylabel('recall')
    plt.title('recall vs. max_iter')
    plt.plot(kk, t_recall, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 4)
    plt.xlabel('max_iter')
    plt.ylabel('f1_score')
    plt.title('f1_score vs. max_iter')
    plt.plot(kk, t_f1_score, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.show()


def func_svc(X_train, X_test, y_train, y_test):
    print("Support vector machine (SVC):")
    kk = ["linear", "poly", "rbf"]  # kernel functions to try
    t_precision = []
    t_recall = []
    t_accuracy = []
    t_f1_score = []
    for n in kk:
        method = SVC(kernel=n, random_state=0)
        method = method.fit(X_train, y_train)
        y_predic = method.predict(X_test)
        t = classification_report(y_test, y_predic, target_names=['3', '4', '5', '6', '7', '8'], output_dict=True)
        print(t)
        t_accuracy.append(t["accuracy"])
        t_precision.append(t["weighted avg"]["precision"])
        t_recall.append(t["weighted avg"]["recall"])
        t_f1_score.append(t["weighted avg"]["f1-score"])
    plt.figure("SVC")
    plt.subplot(2, 2, 1)
    plt.xlabel('kernel')
    plt.ylabel('accuracy')
    plt.title('accuracy vs. kernel')
    plt.plot(kk, t_accuracy, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 2)
    plt.xlabel('kernel')
    plt.ylabel('precision')
    plt.title('precision vs. kernel')
    plt.plot(kk, t_precision, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 3)
    plt.xlabel('kernel')
    plt.ylabel('recall')
    plt.title('recall vs. kernel')
    plt.plot(kk, t_recall, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 4)
    plt.xlabel('kernel')
    plt.ylabel('f1_score')
    plt.title('f1_score vs. kernel')
    plt.plot(kk, t_f1_score, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.show()

def func_classtree(X_train, X_test, y_train, y_test):
    print("Decision tree:")
    kk = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]  # max_depth values to try
    t_precision = []
    t_recall = []
    t_accuracy = []
    t_f1_score = []
    for n in kk:
        method = tree.DecisionTreeClassifier(criterion="gini", max_depth=n)
        method.fit(X_train, y_train)
        predic = method.predict(X_test)
        print("method.score: %f" % method.score(X_test, y_test))

        t = classification_report(y_test, predic, target_names=['3', '4', '5', '6', '7', '8'], output_dict=True)
        print(t)
        t_accuracy.append(t["accuracy"])
        t_precision.append(t["weighted avg"]["precision"])
        t_recall.append(t["weighted avg"]["recall"])
        t_f1_score.append(t["weighted avg"]["f1-score"])
    plt.figure("Decision tree")
    plt.subplot(2, 2, 1)
    plt.xlabel('max_depth')
    plt.ylabel('accuracy')
    plt.title('accuracy vs. max_depth')
    plt.plot(kk, t_accuracy, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 2)
    plt.xlabel('max_depth')
    plt.ylabel('precision')
    plt.title('precision vs. max_depth')
    plt.plot(kk, t_precision, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 3)
    plt.xlabel('max_depth')
    plt.ylabel('recall')
    plt.title('recall vs. max_depth')
    plt.plot(kk, t_recall, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 4)
    plt.xlabel('max_depth')
    plt.ylabel('f1_score')
    plt.title('f1_score vs. max_depth')
    plt.plot(kk, t_f1_score, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.show()

def func_adaboost(X_train, X_test, y_train, y_test):
    print("AdaBoost:")
    kk = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]  # learning_rate values to try
    t_precision = []
    t_recall = []
    t_accuracy = []
    t_f1_score = []
    for n in range(100, 200, 200):  # only n = 100 is used; n_estimators stays fixed
        for k in kk:
            print("n_estimators: %d, learning_rate: %.2f" % (n, k))
            bdt = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2, min_samples_split=20),
                                     algorithm="SAMME",
                                     n_estimators=n, learning_rate=k)
            bdt.fit(X_train, y_train)
            y_pred = bdt.predict(X_test)
            print("train score: %f" % (bdt.score(X_train, y_train)))
            print("test score: %f" % (bdt.score(X_test, y_test)))
            print(bdt.feature_importances_)
            t = classification_report(y_test, y_pred, target_names=['3', '4', '5', '6', '7', '8'], output_dict=True)
            print(t)
            t_accuracy.append(t["accuracy"])
            t_precision.append(t["weighted avg"]["precision"])
            t_recall.append(t["weighted avg"]["recall"])
            t_f1_score.append(t["weighted avg"]["f1-score"])
    plt.figure("AdaBoost (n_estimators = 100)")
    plt.subplot(2, 2, 1)
    plt.xlabel('learning_rate')
    plt.ylabel('accuracy')
    plt.title('accuracy vs. learning_rate')
    plt.plot(kk, t_accuracy, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 2)
    plt.xlabel('learning_rate')
    plt.ylabel('precision')
    plt.title('precision vs. learning_rate')
    plt.plot(kk, t_precision, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 3)
    plt.xlabel('learning_rate')
    plt.ylabel('recall')
    plt.title('recall vs. learning_rate')
    plt.plot(kk, t_recall, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 4)
    plt.xlabel('learning_rate')
    plt.ylabel('f1_score')
    plt.title('f1_score vs. learning_rate')
    plt.plot(kk, t_f1_score, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.show()


# inx:      feature vector to classify
# data_set: training feature matrix
# labels:   training labels, one per row of data_set
# k:        number of nearest neighbours to vote
def classify0(inx, data_set, labels, k):
    """Hand-written k-nearest-neighbour classifier."""
    data_set_size = data_set.shape[0]
    # tile the query point and take the element-wise difference with every training sample
    diff_mat = np.tile(inx, (data_set_size, 1)) - data_set
    sq_diff_mat = diff_mat**2
    sq_distances = sq_diff_mat.sum(axis=1)
    distances = sq_distances**0.5            # Euclidean distance to every training sample
    sorted_dist_indicies = distances.argsort()
    class_count = {}
    # count the labels of the k nearest samples
    for i in range(k):
        vote_label = labels[sorted_dist_indicies[i]]
        # class_count.get(vote_label[0], 0) returns 0 the first time a label is seen
        class_count[vote_label[0]] = class_count.get(vote_label[0], 0) + 1
    # the most frequent label among the k neighbours wins
    sorted_class_count = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]


def func_knn(X_train, X_test, y_train, y_test):
    print("k-nearest neighbours:")
    kk = [i for i in range(3, 30, 5)]  # values of k to try
    t_precision = []
    t_recall = []
    t_accuracy = []
    t_f1_score = []
    for n in kk:
        y_predict = []
        for x in X_test.values:
            a = classify0(x, X_train.values, y_train.values, n)  # vote among the n nearest neighbours
            y_predict.append(a)
        t = classification_report(y_test, y_predict, target_names=['3', '4', '5', '6', '7', '8'], output_dict=True)
        print(t)
        t_accuracy.append(t["accuracy"])
        t_precision.append(t["weighted avg"]["precision"])
        t_recall.append(t["weighted avg"]["recall"])
        t_f1_score.append(t["weighted avg"]["f1-score"])
    plt.figure("kNN")
    plt.subplot(2, 2, 1)
    plt.xlabel('k')
    plt.ylabel('accuracy')
    plt.title('accuracy vs. k')
    plt.plot(kk, t_accuracy, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 2)
    plt.xlabel('k')
    plt.ylabel('precision')
    plt.title('precision vs. k')
    plt.plot(kk, t_precision, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 3)
    plt.xlabel('k')
    plt.ylabel('recall')
    plt.title('recall vs. k')
    plt.plot(kk, t_recall, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 4)
    plt.xlabel('k')
    plt.ylabel('f1_score')
    plt.title('f1_score vs. k')
    plt.plot(kk, t_f1_score, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.show()


def func_randomforest(X_train, X_test, y_train, y_test):
    print("Random forest:")
    t_precision = []
    t_recall = []
    t_accuracy = []
    t_f1_score = []
    kk = [10, 20, 30, 40, 50, 60, 70, 80]  # n_estimators values to try
    for n in kk:
        clf = RandomForestClassifier(n_estimators=n, max_depth=100,
                                     min_samples_split=2, random_state=10, verbose=True)
        clf.fit(X_train, y_train)
        predic = clf.predict(X_test)
        print("feature importances:", clf.feature_importances_)
        print("acc:", clf.score(X_test, y_test))
        t = classification_report(y_test, predic, target_names=['3', '4', '5', '6', '7', '8'], output_dict=True)
        print(t)
        t_accuracy.append(t["accuracy"])
        t_precision.append(t["weighted avg"]["precision"])
        t_recall.append(t["weighted avg"]["recall"])
        t_f1_score.append(t["weighted avg"]["f1-score"])
    plt.figure("Random forest (max_depth = 100)")
    plt.subplot(2, 2, 1)
    plt.xlabel('n_estimators')
    plt.ylabel('accuracy')
    plt.title('accuracy vs. n_estimators')
    plt.plot(kk, t_accuracy, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 2)
    plt.xlabel('n_estimators')
    plt.ylabel('precision')
    plt.title('precision vs. n_estimators')
    plt.plot(kk, t_precision, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 3)
    plt.xlabel('n_estimators')
    plt.ylabel('recall')
    plt.title('recall vs. n_estimators')
    plt.plot(kk, t_recall, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.subplot(2, 2, 4)
    plt.xlabel('n_estimators')
    plt.ylabel('f1_score')
    plt.title('f1_score vs. n_estimators')
    plt.plot(kk, t_f1_score, color="r", marker="o", linestyle="-")
    plt.yticks(np.arange(0, 1, 0.1))

    plt.show()


if __name__ == '__main__':
    # neural network
    # print(func_mlp(X_train, X_test, y_train, y_test))
    # support vector machine
    # print(func_svc(X_train, X_test, y_train, y_test))
    # decision tree
    # print(func_classtree(X_train, X_test, y_train, y_test))
    # adaboost
    # print(func_adaboost(X_train, X_test, y_train, y_test))
    # knn
    print(func_knn(X_train, X_test, y_train, y_test))
    # randomforest
    print(func_randomforest(X_train, X_test, y_train, y_test))
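Note that the KNeighborsClassifier imported at the top of the script is never actually used; func_knn relies on the hand-written classify0 instead. For comparison, a minimal sketch (not part of the original script) of the same k sweep using scikit-learn's built-in estimator could look like this:

def func_knn_sklearn(X_train, X_test, y_train, y_test):
    # hypothetical helper: same evaluation as func_knn, but with sklearn's KNeighborsClassifier
    for k in range(3, 30, 5):
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X_train, y_train.values.ravel())
        y_pred = clf.predict(X_test)
        print(k, classification_report(y_test, y_pred, output_dict=True)["weighted avg"])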
That concludes this article on implementing machine-learning classification algorithms in Python. For more on classification algorithms in Python, please search our earlier articles or keep browsing the related articles below, and thank you for your continued support!
