Python machine learning with sklearn: an introduction to the iris dataset
###########################
# Notes:
# This walkthrough is reworked from the article at
# "http://python.jobbole.com/83563/": the code was retyped, annotated,
# run step by step, and the outputs recorded in '#output:' comments.
# It is written so that even a reader starting Python from zero can
# follow along by executing every line!
# Companion notes on the dataset and environment are at
# "http://www.cnblogs.com/taichu/p/5216659.html".
###########################
###########################
#(0) How to read this walkthrough
# 1. Run the code below step by step (an interactive console works best)
#    and compare your results with the '#output:' comments.
# 2. Adjust file paths and package versions to your own machine;
#    the details matter, so do run everything rather than just reading!
###########################
###########################
#(1) Getting the dataset (environment setup and download)
# Goal: install the required libraries, then download iris.csv locally.
###########################
##################
# On Ubuntu 15.10 the libraries used below can be installed like this:
#sudo apt-get install python            # installs Python (2.7 by default)
#sudo apt-get install python-numpy      # numpy for Python
#sudo apt-get install python-matplotlib
#sudo apt-get install python-networkx
#sudo apt-get install python-sklearn
#python    # start the interpreter to confirm the install works
# Alternatively, install the Anaconda Python IDE, which bundles numpy,
# scipy, sklearn and friends in one download.
# Note: this walkthrough ran on Ubuntu 15.10 inside a VM under WIN10;
# install the Python libs above first!
##################
import urllib2
url = 'http://aima.cs.berkeley.edu/data/iris.csv'
u = urllib2.urlopen(url)
# save the downloaded CSV locally; adjust the path to your machine!
#localfn='/mnt/hgfs/sharedfolder/iris.csv' #for linux
localfn='D:\\Virtual Machines\\sharedfolder\\iris.csv' #for windows
localf = open(localfn, 'wb') # binary mode avoids newline translation on Windows
localf.write(u.read())
localf.close()
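# For reference, on Python 3 the same download would use urllib.request
# instead of urllib2 (an untested sketch; this walkthrough targets 2.7):
#   from urllib.request import urlopen
#   with open(localfn, 'wb') as f:
#       f.write(urlopen(url).read())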
# data examples
#COL1  COL2  COL3  COL4  COL5
#5.1   3.5   1.4   0.2   setosa
#...   ...   ...   ...   ...
#4.7   3.2   1.3   0.2   setosa
#7     3.2   4.7   1.4   versicolor
#...   ...   ...   ...   ...
#6.9   3.1   4.9   1.5   versicolor
#6.3   3.3   6     2.5   virginica
#...   ...   ...   ...   ...
#7.1   3     5.9   2.1   virginica
#############################
#U can get description of 'iris.csv'
#at 'http://aima.cs.berkeley.edu/data/iris.txt'
#Definition of COLs:
#1. sepal length in cm
#2. sepal width in cm
#3. petal length in cm
#4. petal width in cm
#5. class:
# -- Iris Setosa
# -- Iris Versicolour
# -- Iris Virginica
#Missing Attribute Values: None
#################################
from numpy import genfromtxt, zeros
# read the first 4 columns
data = genfromtxt(localfn,delimiter=',',usecols=(0,1,2,3))
# read the fifth column
target = genfromtxt(localfn,delimiter=',',usecols=(4),dtype=str)
print data.shape
# output: (150, 4)
print target.shape
# output: (150,)
#auto build a collection of unique elements
print set(target)
# output: set(['setosa', 'versicolor', 'virginica'])
#print set(data) # wrong usage of set: rows of a 2D array are unhashable
######################
#plot style strings:
#'bo'=blue circle; 'r+'=red plus; 'g*'=green star
#search the web for 'matplotlib plot styles' for the full list
#http://www.360doc.com/content/15/0113/23/16740871_440559122.shtml
#http://zhidao.baidu.com/link?url=6JA9-A-UT3kmslX1Ba5uTY1718Xh-OgebUJVuOs3bdzfnt4jz4XXQdAmvb7R5JYMHyRbBU0MYr-OtXPyKxnxXsPPkm9u5qAciwxIVACR8k7
######################
#figure for 2D data
from pylab import plot, show
plot(data[target=='setosa',0],data[target=='setosa',2],'bo')
plot(data[target=='versicolor',0],data[target=='versicolor',2],'r+')
plot(data[target=='virginica',0],data[target=='virginica',2],'g*')
show()
# Note: in a plain Ubuntu python session the script blocks here until the
# figure window is closed before the run continues; in Anaconda Spyder
# (Python 2.7) the figure is drawn inline in the console instead, which
# is much more convenient!
#figure for all 4D (all 4 features) data: sepal length/width as circles,
#petal length/width as plus signs, one color per class
setosa_sepal_x=ssx=data[target=='setosa',0]
setosa_sepal_y=ssy=data[target=='setosa',1]
setosa_petal_x=spx=data[target=='setosa',2]
setosa_petal_y=spy=data[target=='setosa',3]
versicolor_sepal_x=vsx=data[target=='versicolor',0]
versicolor_sepal_y=vsy=data[target=='versicolor',1]
versicolor_petal_x=vpx=data[target=='versicolor',2]
versicolor_petal_y=vpy=data[target=='versicolor',3]
virginica_sepal_x=vgsx=data[target=='virginica',0]
virginica_sepal_y=vgsy=data[target=='virginica',1]
virginica_petal_x=vgpx=data[target=='virginica',2]
virginica_petal_y=vgpy=data[target=='virginica',3]
plot(ssx,ssy,'bo',spx,spy,'b+')
plot(vsx,vsy,'ro',vpx,vpy,'r+')
plot(vgsx,vgsy,'go',vgpx,vgpy,'g+')
show()
#figure for 1D (a single feature): per-class histograms of sepal length
#pylab plotting reference:
#http://hyry.dip.jp/tech/book/page/scipy/matplotlib_fast_plot.html
from pylab import figure, subplot, hist, xlim, show
xmin = min(data[:,0])
xmax = max(data[:,0])
figure() # open a new figure window before laying out the subplots
subplot(411) # distribution of the setosa class (1st, on the top)
hist(data[target=='setosa',0],color='b',alpha=.7)
xlim(xmin,xmax)
#subplot(rows, cols, index); (4,1,2) may be abbreviated to 412 when all three numbers are below 10
subplot(412) # distribution of the versicolor class (2nd)
hist(data[target=='versicolor',0],color='r',alpha=.7)
xlim(xmin,xmax)
subplot(413) # distribution of the virginica class (3rd)
hist(data[target=='virginica',0],color='g',alpha=.7)
xlim(xmin,xmax)
subplot(414) # global histogram (4th, on the bottom)
hist(data[:,0],color='y',alpha=.7)
xlim(xmin,xmax)
show()
###########################
#(2) Classification
# Train a Gaussian Naive Bayes classifier and evaluate it
# (train / predict / score).
###########################
# map the string target (one column) onto a numeric vector t of zeros
t = zeros(len(target))
#type(t) #show type of t (numpy.ndarray)
#print t #show contents of t
# encode the three classes as 1/2/3
t[target == 'setosa'] = 1
t[target == 'versicolor'] = 2
t[target == 'virginica'] = 3
#print t
# train a Gaussian Naive Bayes classifier on the data
from sklearn.naive_bayes import GaussianNB
classifier = cf = GaussianNB()
cf.fit(data,t) # training on the iris dataset
print cf.predict(data[0:1]) # predict the 1st sample (a 2D slice keeps sklearn happy)
#output:[ 1.]
print t[0]
#output:1.0
# split data (and t in lockstep) into a 60% training and 40% test set
from sklearn import cross_validation
train, test, t_train, t_test = cross_validation.train_test_split(data, t, \
test_size=0.4, random_state=0)
print train.shape
#output:(90, 4)
print test.shape
#output:(60, 4)
print t_train.shape
#output:(90,)
print t_test.shape
#output:(60,)
# train on 60% of the data, test on the held-out 40%: accuracy ~93.3%
cf.fit(train,t_train) # train
print cf.score(test,t_test) # test
#output:0.93333333333333335
cf.score(train,t_train) # even on its own training set the score is not 100%!
#output:0.97777777777777775
# train on all the data and score on all the data: ~96%
cf.fit(data,t)
#output:GaussianNB()
cf.score(data,t)
#output:0.95999999999999996
# train on 100% of the data, then score on the 40% test split: ~95%
cf.fit(data,t)
#output:GaussianNB()
cf.score(test,t_test)
#output:0.94999999999999996
#############################################################
#TODO: feature engineering experiments (left as exercises)
# The 4 features can be recombined before training. Some ideas:
#1. merge the two sepal measurements and the two petal measurements
#   into 2 features: sepal-size, petal-size
#2. merge the two length measurements and the two width measurements
#   into 2 features: whole-length, whole-width
#3. keep the original 4 features;
#4. use all 8 features above at once? Or add ratios such as
#   petal-length/sepal-length as extra features?
# Which combination scores best, and why? (see the sketch after this block)
#TRY: rerun the training and scoring above on each feature set and compare!
#############################################################
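# A minimal sketch of idea 1 above, assuming "merging" means multiplying
# length by width (one arbitrary choice: area-like 'size' features):
from numpy import column_stack
sepal_size = data[:,0] * data[:,1] # hypothetical combined feature
petal_size = data[:,2] * data[:,3] # hypothetical combined feature
data2 = column_stack((sepal_size, petal_size))
train2, test2, t_train2, t_test2 = cross_validation.train_test_split(data2, t, \
    test_size=0.4, random_state=0)
cf2 = GaussianNB()
cf2.fit(train2, t_train2)
print cf2.score(test2, t_test2) # compare against the ~93.3% baseline above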
# confusion matrix (note: the arguments below are (predicted, actual),
# so rows are predicted classes and columns are actual ones)
from sklearn.metrics import confusion_matrix
print confusion_matrix(cf.predict(test),t_test)
#output:[[16 0 0]
#output: [ 0 23 4]
#output: [ 0 0 17]]
#
# An illustrative confusion matrix (not produced by the code above):
#
#               predicted
#              1    2    3
# actual 1    43    5    2
# actual 2     2   45    3
# actual 3     0    1   49
#
# Reading the example:
# each actual class has 50 samples (every row sums to 50);
# class 3: 1 sample was mistaken for class 2;
# class 2: 2 samples were mistaken for class 1 and 3 for class 3;
# class 1: 5 samples were mistaken for class 2 and 2 for class 3.
#
#Precision: of the samples predicted as a class, the fraction that truly belongs to it
#Recall: of the samples truly in a class, the fraction that was found
#F1-Score: the harmonic mean of precision and recall
from sklearn.metrics import classification_report
print classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica'])
#output: precision recall f1-score support
#output: setosa 1.00 1.00 1.00 16
#output:versicolor 1.00 0.85 0.92 27
#output: virginica 0.81 1.00 0.89 17
#output:avg / total 0.95 0.93 0.93 60
##############################################################
# Aside: the four classical means, for reference
# Harmonic mean:   Hn = n/(1/a1+1/a2+...+1/an)
# Geometric mean:  Gn = (a1*a2*...*an)^(1/n)
# Arithmetic mean: An = (a1+a2+...+an)/n
# Quadratic mean:  Qn = sqrt((a1^2+a2^2+...+an^2)/n)
# They always satisfy Hn <= Gn <= An <= Qn.
#
# A classic harmonic-mean example:
# Q: a car drives 4 segments of equal length (1 km each) at 3, 4, 6 and
#    8 km/h; what is its average speed over the whole trip?
# A: total distance over total time:
#    4/(1/3+1/4+1/6+1/8) = 1/[(1/3+1/4+1/6+1/8)/4] ≈ 4.57 km/h
###########################################################
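# A quick numeric check of the harmonic-mean example above:
speeds = [3.0, 4.0, 6.0, 8.0]
hn = len(speeds) / sum(1.0/v for v in speeds)
print hn # ~4.57, matching the hand calculation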
# Scoring a model on the very data it was trained on is overly optimistic,
# and a single train/test split gives a score that depends on the split.
# A more robust evaluation technique is Cross Validation:
# split the data into k folds, hold each fold out once as the test set
# while training on the remaining folds, and combine the k scores.
# This way every sample is used for both training and testing.
#sklearn makes this a one-liner:
from sklearn.cross_validation import cross_val_score
# cross validation with 6 iterations
scores = cross_val_score(classifier, data, t, cv=6)
print scores
#output:[ 0.92592593 1. 0.91666667 0.91666667 0.95833333 1. ]
# With cv=6 the data is split into 6 folds and the model is scored 6
# times, once per held-out fold. Summarize with the mean accuracy:
from numpy import mean
print mean(scores)
#output:0.96
# Experiment: vary cv and watch how the mean score behaves.
# cv must be >= 2, otherwise: 'ValueError: k-fold cross validation requires at least one train / test split by setting n_folds=2 or more, got n_folds=1.'
# The largest usable cv equals the size of the smallest class in the
# ndarray (per class: t holds 50, t_train 27, t_test 16 samples), since
# the stratified folds need at least one sample of every class!
#1. on the full data set: cv from 2 to 50
for i in range(2, 51):
    scores = cross_val_score(classifier, data, t, cv=i)
    print mean(scores) # inside a for block, print is needed to see the values!
#2. on the test split: cv from 2 to 16
for i in range(2, 17): print mean(cross_val_score(classifier, test, t_test, cv=i))
#3. on the train split: cv from 2 to 27
for i in range(2, 28): print mean(cross_val_score(classifier, train, t_train, cv=i))
# To verify those bounds, count how many samples of each class each
# numpy.ndarray actually holds.
# per-class counts of the full target vector t
counts = {}
for item in t: counts[item] = counts.get(item, 0) + 1
# a one-line for loop; the dict maps class label -> sample count
print(counts)
#output:{1.0: 50, 2.0: 50, 3.0: 50}
# per-class counts of t_train
counts = {}
for item in t_train: counts[item] = counts.get(item, 0) + 1
print(counts)
#output:{1.0: 34, 2.0: 27, 3.0: 29}
# per-class counts of t_test
counts = {}
for item in t_test: counts[item] = counts.get(item, 0) + 1
print(counts)
#output:{1.0: 16, 2.0: 23, 3.0: 21}
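# The same counts in one call, assuming numpy >= 1.9 (where unique()
# gained the return_counts argument):
from numpy import unique
vals, cnts = unique(t, return_counts=True)
print dict(zip(vals, cnts))
#output:{1.0: 50, 2.0: 50, 3.0: 50}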
#***********************************
# Observation: as the training share grows from 1/n toward (n-1)/n, the
# mean cross-validation score tends to rise and then level off.
#TODO: study the model (the train/test trade-off) systematically:
#  sweep the cv parameter and the train/data ratio;
#  at which ratio does the model generalize best?
#  A figure with X = train/data ratio in (0,1) and Y = mean score in (0,1)
#  would make this visible; a sketch follows right after this block.
#TODO: with several knobs to tune, is there a pattern or best practice?
#  A toolbox that sweeps the knobs automatically and reports the best
#  configuration would be handy.
#***********************************
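# A rough sketch of the learning-curve idea above: sweep the training
# fraction, average the test accuracy over a few random splits, and plot
# fraction (X) against mean score (Y). The 10% steps and the 5-split
# average are arbitrary choices, not from the original text.
fractions = [i/10.0 for i in range(1, 10)] # train on 10% .. 90%
curve = []
for frac in fractions:
    fold_scores = []
    for seed in range(5): # average over 5 random splits per fraction
        tr, te, ttr, tte = cross_validation.train_test_split(
            data, t, test_size=1.0-frac, random_state=seed)
        fold_scores.append(GaussianNB().fit(tr, ttr).score(te, tte))
    curve.append(mean(fold_scores))
plot(fractions, curve, 'o-')
show()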
###########################
#(3) Clustering
###########################
#k-means clustering: partition n samples into k groups around k centers.
# Pros: simple and fast. Cons: k must be chosen up front and the result
# depends on the initial centers. It is "unsupervised": no labels are used.
#k-means algorithm (a numpy sketch follows this block):
#(1) choose k of the n samples as the initial cluster centers;
#(2) assign every sample to its nearest center (by distance);
#(3) recompute each center as the mean of the samples assigned to it;
#(4) repeat (2)-(3) until the centers stop moving (or an iteration limit is hit).
############################
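# A minimal numpy illustration of steps (1)-(4), written from scratch for
# intuition only (a fixed 10 rounds instead of a convergence test; assumes
# no cluster empties). The sklearn call below is what you'd actually use.
from numpy import argmin, array
from numpy.random import seed, permutation
seed(0)
k = 3
centers = data[permutation(len(data))[:k]] # step (1): k random samples as centers
for _ in range(10): # step (4): iterate a fixed number of rounds
    # step (2): assign each sample to its nearest center
    labels = array([argmin(((p - centers)**2).sum(axis=1)) for p in data])
    # step (3): move each center to the mean of its assigned samples
    centers = array([data[labels == j].mean(axis=0) for j in range(k)])
print centers # three 4D cluster centers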
from sklearn.cluster import KMeans
kms = KMeans(n_clusters=3) # initialize with 3 clusters, since the data has 3 classes
#kmeans = KMeans(k=3, init='random') # wrong in current sklearn: the parameter is n_clusters, not k
kms.fit(data) # actual execution
c = kms.predict(data)
from sklearn.metrics import completeness_score, homogeneity_score
print completeness_score(t,c)
#output:0.764986151449
print homogeneity_score(t,c)
#output:0.751485402199
# Careful! t encodes the true classes as 1,2,3, while KMeans numbers its
# clusters 0,1,2 in arbitrary order, so matching colors between the two
# subplots below is a manual exercise; compare cluster shapes instead.
figure()
subplot(211) # top figure with the real classes
plot(data[t==1,0],data[t==1,2],'bo')
plot(data[t==2,0],data[t==2,2],'ro')
plot(data[t==3,0],data[t==3,2],'go')
subplot(212) # bottom figure with classes assigned automatically
plot(data[c==1,0],data[c==1,2],'bo',alpha=.5)
plot(data[c==2,0],data[c==2,2],'go',alpha=.5)
plot(data[c==0,0],data[c==0,2],'mo',alpha=.5)
show()
# The paired subplots show that k-means recovers the three iris groups
# fairly well even without seeing any labels. Note that kmeans was fit on
# all 4 features while only 2 of them are drawn above;
# the next figure draws all 4 features (circles for sepal, plus signs for
# petal) so the assignment can be inspected on every measurement.
import matplotlib.pyplot as plt
plt.figure()
plt.subplot(211) # top figure with the real classes
plt.plot(data[t==1,0],data[t==1,1],'bo',data[t==1,2],data[t==1,3],'b+')
plt.plot(data[t==2,0],data[t==2,1],'ro',data[t==2,2],data[t==2,3],'r+')
plt.plot(data[t==3,0],data[t==3,1],'go',data[t==3,2],data[t==3,3],'g+')
plt.subplot(212) # bottom figure with classes assigned automatically
plt.plot(data[c==0,0],data[c==0,1],'bo',data[c==0,2],data[c==0,3],'b+',alpha=.7)
plt.plot(data[c==1,0],data[c==1,1],'ro',data[c==1,2],data[c==1,3],'r+',alpha=.7)
plt.plot(data[c==2,0],data[c==2,1],'go',data[c==2,2],data[c==2,3],'g+',alpha=.7)
p=plt
fig=plt.gcf()
fig.show() # plt.show() would work too; it blocks until the window is closed.
###########################
#(4) Regression
###########################
# Regression models the relation between a dependent variable and one or
# more explanatory variables: fit a curve through the observed points so
# that values at unseen inputs can be predicted;
# the better the fit, the more trustworthy the predictions.
##############
#sklearn.linear_model provides LinearRegression, which fits a straight
# line by ordinary least squares. sklearn has more flexible regressors
# too; the linear one is enough for this demo.
##############
# Demo: make 40 noisy samples of y = x^3, fit a line through them,
# and compare the fitted line against the points.
#Step1- generate the 40 sample points
from numpy.random import rand
x = rand(40,1) # explanatory variable
y = x*x*x+rand(40,1)/5 # dependent variable
#Step2- fit a linear regression model to the points
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(x,y)
#Step3- predict y on a grid of x values and overlay the fitted line
from numpy import linspace, matrix
# 40 evenly spaced points between 0 and 1
randx = linspace(0,1,40)
# circles: the 40 original (x,y) samples; dashed red line: the model's
# predictions over randx (matrix(randx).T reshapes it into a column)
plot(x,y,'o',randx,linreg.predict(matrix(randx).T),'--r')
show()
#Step4- measure the fit with the mean squared error (MSE); 0 means a perfect fit
from sklearn.metrics import mean_squared_error
print mean_squared_error(linreg.predict(x),y)
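# The fitted line itself can also be inspected: y ≈ coef_*x + intercept_
print linreg.coef_, linreg.intercept_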
#########################
# The same recipe on real iris data
#########################
# use setosa sepal length as x and sepal width as y (reshaped from (50,)
# to (50,1), otherwise linreg.fit rejects the 1D arrays!)
ssx_blue=data[target=='setosa',0].reshape((50,1)) # setosa sepal length
ssy_blue=data[target=='setosa',1].reshape((50,1)) # setosa sepal width
# fit the line on those 50 points
linreg = LinearRegression()
linreg.fit(ssx_blue,ssy_blue)
# predict on evenly spaced x values and overlay the fitted line
# (setosa sepals lie roughly in X:[4.0-6.0], Y:[2.5-4.5])
randx = linspace(4.0,6.0,50)
plot(ssx_blue,ssy_blue,'o',randx,linreg.predict(matrix(randx).T),'--r')
show()
# MSE of the fit; 0 would be a perfect fit
print mean_squared_error(linreg.predict(ssx_blue),ssy_blue)
###########################
#(5) Correlation
###########################
# Correlation measures how strongly two features vary together, i.e. how
# much knowing one feature tells you about another.
# Here the (Pearson) correlation matrix of the 4 iris features is
# computed and rendered as a heatmap.
# Reading the coefficient:
# 1 means perfectly correlated, 0 uncorrelated, -1 perfectly anti-correlated.
# From the matrix below, the strongest relation is between
# "petal width" and "petal length".
from numpy import corrcoef
corr = corrcoef(data.T) # .T gives the transpose
print corr
#output:[[ 1. -0.10936925 0.87175416 0.81795363]
#output: [-0.10936925 1. -0.4205161 -0.35654409]
#output: [ 0.87175416 -0.4205161 1. 0.9627571 ]
#output: [ 0.81795363 -0.35654409 0.9627571 1. ]]
from pylab import pcolor, colorbar, xticks, yticks
from numpy import arange
pcolor(corr) # draw the 4x4 correlation matrix as a colored grid
colorbar() # add the color scale legend
# label the X and Y axes with the 4 feature names (at tick positions 1..4)
xticks(arange(1,5),['sepal length', 'sepal width', 'petal length', 'petal width'],rotation=-20)
yticks(arange(1,5),['sepal length', 'sepal width', 'petal length', 'petal width'],rotation=-45)
show()
###########################
#(6) Dimensionality reduction (PCA)
# Principal Component Analysis projects the data onto fewer dimensions
###########################
from sklearn.decomposition import PCA
# Fewer features make the data drawable: 3D data can be plotted
# directly, 4D cannot, so the 4 iris features are projected down to
# 2 principal components here.
# Note: each component is a combination of the original features, and
# some information is necessarily lost in the projection.
pca = PCA(n_components=2)
pcad = pca.fit_transform(data)
plot(pcad[target=='setosa',0],pcad[target=='setosa',1],'bo')
plot(pcad[target=='versicolor',0],pcad[target=='versicolor',1],'ro')
plot(pcad[target=='virginica',0],pcad[target=='virginica',1],'go')
show()
# how much of the variance each principal component explains
print pca.explained_variance_ratio_
#output: [ 0.92461621 0.05301557]
pc1, pc2 = pca.explained_variance_ratio_ # unpack the 2 PC ratios
print 1-sum(pca.explained_variance_ratio_)
#output:0.0223682249752
print 1.0-pc1-pc2 # same thing: the fraction of variance lost in the projection
# map the 2D principal-component points back into the original 4D space
data_inv = pca.inverse_transform(pcad)
# the summed signed residual is almost zero (individual values still differ)
print abs(sum(sum(data - data_inv)))
#output:6.66133814775e-15
# Try every component count from 1 to 4 and print the preserved variance;
# 4 components keep 100%, and already 3 keep almost everything:
for i in range(1,5):
    pca = PCA(n_components=i)
    pca.fit(data)
    print sum(pca.explained_variance_ratio_) * 100,'%'
#output:92.4616207174 %
#output:97.7631775025 %
#output:99.481691455 %
#output:100.0 %
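# In newer sklearn versions, n_components may also be a float in (0,1):
# PCA then keeps just enough components to reach that fraction of the
# variance (version-dependent behavior; treat this as a sketch).
pca95 = PCA(n_components=0.95)
pca95.fit(data)
print pca95.n_components_ # 2 components already explain >= 95% here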
Source: http://www.cnblogs.com/taichu/p/5251332.html