Python machine learning: an introduction to sklearn and the iris dataset

###########################
#
# This walkthrough is adapted from the article at "http://python.jobbole.com/83563/",
# reorganized and annotated while I worked through it, with a number of changes
# along the way (among them notes on the libs involved and on setting up the
# python environment itself).
# Declaration: I started with zero python experience, so mistakes are likely;
# corrections and suggestions are very welcome!
# Companion post: "http://www.cnblogs.com/taichu/p/5216659.html" - thanks to the
# original author; please point out anything that is wrong!
###########################

###########################
#(0) Notes
# 1. The code below is meant to be run block by block as you read; the
#    explanations live in the comments, so you can 'copy' whole sections into a
#    python session as they are.
#    When something is unclear, run the code and inspect the output yourself!
###########################

###########################
#(1) Environment setup (installation)
# Two routes: install the packages individually, or use an all-in-one distribution
###########################

##################
# Under ubuntu15.10 the following 6 commands set up the python environment
#sudo apt-get install python   # installs python; version 2.7 by default
#sudo apt-get install python-numpy    # installs the numpy package for python
#sudo apt-get install python-matplotlib
#sudo apt-get install python-networkx
#sudo apt-get install python-sklearn
#python  # run python to enter the interactive interpreter and confirm the version - did everything install correctly?
# Later I switched to the Anaconda Python IDE suite, which bundles SCIPY and the
# other scientific libs; much more convenient!
# Background: I first ran Ubuntu15.10 in a VM under WIN10; Ubuntu ships with
# python preinstalled, so only the libs above need installing!
##################
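# A quick way to confirm the libraries are importable (a minimal check, assuming
# the packages above installed cleanly):
import numpy, matplotlib, sklearn
print numpy.__version__
print matplotlib.__version__
print sklearn.__version__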

import urllib2
url = 'http://aima.cs.berkeley.edu/data/iris.csv'
u = urllib2.urlopen(url)
# The file is saved to a local (VM shared) folder; adjust the path below to your own environment!
#localfn='/mnt/hgfs/sharedfolder/iris.csv' #for linux
localfn='D:\\Virtual Machines\\sharedfolder\\iris.csv' #for windows
localf = open(localfn, 'w')
localf.write(u.read())
localf.close()
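# Alternative (a minimal sketch, no network needed): sklearn bundles the same
# iris data; note that load_iris() returns integer labels 0/1/2, not strings.
#from sklearn.datasets import load_iris
#iris = load_iris()
#data_alt, target_alt = iris.data, iris.target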

# data examples
#COL1   COL2   COL3   COL4   COL5
#5.1    3.5    1.4    0.2    setosa
#…    …    …    …    …
#4.7    3.2    1.3    0.2    setosa
#7    3.2    4.7    1.4    versicolor
#…    …    …    …    …
#6.9    3.1    4.9    1.5    versicolor
#6.3    3.3    6    2.5    virginica
#…    …    …    …    …
#7.1    3    5.9    2.1    virginica

#############################
#You can get a description of 'iris.csv'
#at 'http://aima.cs.berkeley.edu/data/iris.txt'
#Definition of COLs:
#1. sepal length in cm
#2. sepal width in cm
#3. petal length in cm
#4. petal width in cm
#5. class:
#      -- Iris Setosa
#      -- Iris Versicolour
#      -- Iris Virginica
#Missing Attribute Values: None
#################################


from numpy import genfromtxt, zeros
# read the first 4 columns
data = genfromtxt(localfn,delimiter=',',usecols=(0,1,2,3)) 
# read the fifth column
target = genfromtxt(localfn,delimiter=',',usecols=(4),dtype=str)

print data.shape
# output: (150, 4)
print target.shape
# output: (150,)

#auto build a collection of unique elements
print set(target)  
# output: set(['setosa', 'versicolor', 'virginica'])
#print set(data) # wrong usage of set: iterating a 2D array yields rows (arrays), which are unhashable

######################
#plot style shorthand:
#'bo'=blue+circle; 'r+'=red+plus; 'g*'=green+star
#matplotlib follows MATLAB's plot conventions; search 'matlab plot' for details
#http://www.360doc.com/content/15/0113/23/16740871_440559122.shtml
#http://zhidao.baidu.com/link?url=6JA9-A-UT3kmslX1Ba5uTY1718Xh-OgebUJVuOs3bdzfnt4jz4XXQdAmvb7R5JYMHyRbBU0MYr-OtXPyKxnxXsPPkm9u5qAciwxIVACR8k7
######################

#figure for 2D data
from pylab import plot, show
plot(data[target=='setosa',0],data[target=='setosa',2],'bo')
plot(data[target=='versicolor',0],data[target=='versicolor',2],'r+')
plot(data[target=='virginica',0],data[target=='virginica',2],'g*')
show()

#Note: in the plain Ubuntu python interpreter, the figure window blocks the
#session until it is closed; under Anaconda spyder (Python2.7) the figure is
#drawn inline in the console instead.

#figure for all 4 features (4D data): one color per class; circles = sepal (length vs width), pluses = petal (length vs width)
setosa_sepal_x=ssx=data[target=='setosa',0]
setosa_sepal_y=ssy=data[target=='setosa',1]
setosa_petal_x=spx=data[target=='setosa',2]
setosa_petal_y=spy=data[target=='setosa',3]

versicolor_sepal_x=vsx=data[target=='versicolor',0]
versicolor_sepal_y=vsy=data[target=='versicolor',1]
versicolor_petal_x=vpx=data[target=='versicolor',2]
versicolor_petal_y=vpy=data[target=='versicolor',3]

virginica_sepal_x=vgsx=data[target=='virginica',0]
virginica_sepal_y=vgsy=data[target=='virginica',1]
virginica_petal_x=vgpx=data[target=='virginica',2]
virginica_petal_y=vgpy=data[target=='virginica',3]

plot(ssx,ssy,'bo',spx,spy,'b+')
plot(vsx,vsy,'ro',vpx,vpy,'r+')
plot(vgsx,vgsy,'go',vgpx,vgpy,'g+')
show()


#figure for 1D (a single feature): histograms of sepal length, per class and overall
#pylab reference:
#http://hyry.dip.jp/tech/book/page/scipy/matplotlib_fast_plot.html
from pylab import figure, subplot, hist, xlim, show
xmin = min(data[:,0])
xmax = max(data[:,0])
figure() # open a new (empty) figure window
subplot(411) # distribution of the setosa class (1st, on the top)
hist(data[target=='setosa',0],color='b',alpha=.7)
xlim(xmin,xmax)
#subplot(rows, cols, index); (4,1,2) abbreviates to 412 when every number is below 10
subplot(412) # distribution of the versicolor class (2nd)
hist(data[target=='versicolor',0],color='r',alpha=.7)
xlim(xmin,xmax)
subplot(413) # distribution of the virginica class (3rd)
hist(data[target=='virginica',0],color='g',alpha=.7)
xlim(xmin,xmax)
subplot(414) # global histogram (4th, on the bottom)
hist(data[:,0],color='y',alpha=.7)
xlim(xmin,xmax)
show()

###########################
#(2) Classification
# Train a classifier that maps the four measurements to one of the three
# species (here: Gaussian naive Bayes).
###########################

# Build a numeric label vector t, the same length as target, initialized to zeros
t = zeros(len(target))
#type(t) #show type of t (numpy.ndarray)
#print t #show contents of t
# map each class name to a number: setosa=1, versicolor=2, virginica=3
t[target == 'setosa'] = 1
t[target == 'versicolor'] = 2
t[target == 'virginica'] = 3
#print t

# Train a Gaussian naive Bayes classifier on data
from sklearn.naive_bayes import GaussianNB
classifier = cf = GaussianNB()
cf.fit(data,t) # training on the iris dataset
print cf.predict(data[0]) # predict the class of the 1st sample (newer sklearn needs 2D input: data[0].reshape(1,-1))
#output:[ 1.]
print t[0]
#output:1.0
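# As an aside (a small sketch): GaussianNB models each feature within each class
# as a Gaussian; assuming this sklearn version's fitted attributes theta_ (means)
# and sigma_ (variances), we can inspect what it learned:
print cf.theta_  # per-class feature means, shape (3,4)
print cf.sigma_  # per-class feature variances, shape (3,4)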

# Split data into a training and a test set; the labels t are split the same way
from sklearn import cross_validation
train, test, t_train, t_test = cross_validation.train_test_split(data, t, \
test_size=0.4, random_state=0)

print train.shape
#output:(90, 4)
print test.shape
#output:(60, 4)
print t_train.shape
#output:(90,)
print t_test.shape
#output:(60,)

# Train on 60% of the data, test on the held-out 40%: accuracy 93.3%
cf.fit(train,t_train) # train
print cf.score(test,t_test) # test
#output:0.93333333333333335
cf.score(train,t_train) # even on its own training data the score is not 100%!
#output:0.97777777777777775

# Train on the full data set and score on the same data: about 96%
cf.fit(data,t)
#output:GaussianNB()
cf.score(data,t)
#output:0.95999999999999996


# Train on 100% of the data, then score on the 40% test split: about 95%
cf.fit(data,t)
#output:GaussianNB()
cf.score(test,t_test)
#output:0.94999999999999996

#############################################################
#TODO: further exploration (feature engineering on the four measurements)
# The four raw features can be combined into new features. For example:
#1. multiply length by width per flower part, giving 2 area-like features: sepal-size, petal-size
#2. add the two lengths and the two widths, giving 2 overall features: whole-length, whole-wide
#3. keep the original 4 features as they are;
#4. or feed all 8 features in together? Going further: what about ratios between features?
#   Which combination separates the classes best, and why?
# It would be interesting to see whether sepal-size and petal-size alone already
# separate the three classes; note that a score approaching 100% on the training
# data usually hints at overfitting rather than a genuinely better model.
#TRY: experiment with the combinations above and compare the scores yourself!
#############################################################


# Inspect the classifier's performance with a confusion matrix
from sklearn.metrics import confusion_matrix
print confusion_matrix(cf.predict(test),t_test)
#output:[[16  0  0]
#output: [ 0 23  4]
#output: [ 0  0 17]]

# How to read a confusion matrix
# An illustrative example (different numbers from the iris run above):
#
#                 predicted
#                  1    2    3
#   actual | 1    43    5    2
#          | 2     2   45    3
#          | 3     0    1   49
#
# Reading: each of the 3 classes has 50 samples in this example;
#   row 3: 1 sample of class 3 was misclassified as class 2;
#   row 2: 2 samples of class 2 were misclassified as class 1, and 3 as class 3;
#   row 1: 5 samples of class 1 were misclassified as class 2, and 2 as class 3.

# Three standard metrics summarize a classifier's quality:
#Precision: of the samples predicted as a class, the fraction that really belong to it
#Recall (a.k.a. sensitivity): of the samples that really belong to a class, the fraction the classifier found
#F1-Score: the harmonic mean of precision and recall
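# Worked example on the illustrative matrix above, for class 1:
#   precision = 43/(43+2+0) = 43/45 ~ 0.956   (column 1: everything predicted as 1)
#   recall    = 43/(43+5+2) = 43/50 = 0.86    (row 1: everything that really is 1)
#   f1        = 2*0.956*0.86/(0.956+0.86) ~ 0.91
# sklearn computes all three per class: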

from sklearn.metrics import classification_report
print classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica'])
#output:            precision    recall  f1-score   support
#output:    setosa       1.00      1.00      1.00        16
#output:versicolor       1.00      0.85      0.92        27
#output: virginica       0.81      1.00      0.89        17
#output:avg / total      0.95      0.93      0.93        60

##############################################################
# Background: the four classical means
# Harmonic mean:   Hn = n/(1/a1+1/a2+...+1/an)
# Geometric mean:  Gn = (a1*a2*...*an)^(1/n)
# Arithmetic mean: An = (a1+a2+...+an)/n
# Quadratic mean:  Qn = sqrt((a1^2+a2^2+...+an^2)/n)
# They always satisfy Hn <= Gn <= An <= Qn
#
# Example: a vehicle covers 4 equal-length segments at speeds 3, 4, 6 and 8.
# What is its average speed over the whole trip (each segment 1 unit long)?
# Answer: the harmonic mean: 1/[(1/3+1/4+1/6+1/8)/4] = 4/(1/3+1/4+1/6+1/8) ~ 4.57
###########################################################
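# A quick numeric check of the four means (a minimal sketch):
from numpy import array, sqrt
a = array([3.0, 4.0, 6.0, 8.0])
hn = len(a)/sum(1.0/a)       # harmonic mean
gn = a.prod()**(1.0/len(a))  # geometric mean
an = a.mean()                # arithmetic mean
qn = sqrt((a**2).mean())     # quadratic mean
print hn, gn, an, qn         # ~4.57 <= ~4.90 <= 5.25 <= ~5.59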


#
# Evaluating with cross validation
#
# A single train/test split gives a noisy estimate of a classifier's quality,
# so a standard tool is Cross Validation.
# The idea of k-fold CV: split the data into k parts and, in turn, hold one
# part out for testing while training on the remaining k-1 parts.
#
#sklearn provides this as a one-liner:

from sklearn.cross_validation import cross_val_score
# cross validation with 6 iterations 
scores = cross_val_score(classifier, data, t, cv=6)
print scores
#output:[ 0.92592593  1.          0.91666667  0.91666667  0.95833333  1.        ]
# one score per fold; with cv=6 we get 6 of them

# A single number is easier to compare across models; average the fold scores:
from numpy import mean
print mean(scores)
#output:0.96

# Gradually increase the cv parameter and print the mean score each time.
# cv must be >= 2, otherwise: 'ValueError: k-fold cross validation requires at least one train / test split by setting n_folds=2 or more, got n_folds=1.'
# cv also must not exceed the smallest per-class sample count (t: 50; t_train: 27; t_test: 16), or the ndarray indexing fails!
#1. sweep cv from 2 to 50 on the full data set and watch how the mean score behaves
for i in range(2, 51):
  scores = cross_val_score(classifier, data, t, cv=i)
  print mean(scores) # note: the body of a for loop must be indented (two spaces here), or python raises an error!


#2. sweep cv on the test split; its smallest class has 16 samples
for i in range(2, 17): print mean(cross_val_score(classifier, test, t_test, cv=i))


#3. sweep cv on the train split; its smallest class has 27 samples
for i in range(2, 28): print mean(cross_val_score(classifier, train, t_train, cv=i))


#
# Counting how often each value occurs in a numpy.ndarray
# (dict.get(key, 0) returns 0 for a missing key - handy! - and python allows
# the loop body on the same line as the for statement)
counts={}
for item in t: counts[item] = counts.get(item, 0) + 1

print(counts)
#output:{1.0: 50, 2.0: 50, 3.0: 50}

# The same count over the training labels t_train
counts={}
for item in t_train: counts[item] = counts.get(item, 0) + 1

print(counts)
#output:{1.0: 34, 2.0: 27, 3.0: 29}

# And over the test labels t_test
counts={}
for item in t_test: counts[item] = counts.get(item, 0) + 1

print(counts)
#output:{1.0: 16, 2.0: 23, 3.0: 21}
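# The same counts in a single call, assuming numpy >= 1.9 (which added return_counts):
from numpy import unique
vals, cnts = unique(t_test, return_counts=True)
print vals, cnts # the distinct labels alongside their counts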

#
# Going further
#***********************************
# Advanced idea: leave-one-out validation - train on n-1 samples and test on the
# 1 held out, repeated for every sample (see the sketch below);
#TODO: study how model quality changes with the train/test split ratio and with
#      the cv parameter, e.g. as a figure with X = training fraction in (0,1)
#      and Y = the mean cross-validation score in (0,1) -
#      does a model trained on more data always score better?
#TODO: parameter sweeps like these are routine in practice; rather than editing
#      values and re-running by hand each time, it would be worth wrapping them
#      in a small reusable toolbox that varies the parameter automatically and
#      collects the scores.
#***********************************
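# A minimal leave-one-out sketch (assuming this era's sklearn API, where
# sklearn.cross_validation.LeaveOneOut takes the sample count):
from sklearn.cross_validation import LeaveOneOut
loo_scores = cross_val_score(classifier, data, t, cv=LeaveOneOut(len(data)))
print mean(loo_scores) # each fold scores one sample, so the mean is the LOO accuracy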

###########################
#(3) Clustering
###########################
#k-means: given k, partition n observations into k clusters so that each
#observation belongs to the cluster with the nearest mean; a good clustering is
#compact within each cluster and well separated between clusters.
#The algorithm proceeds by iterative refinement (see the sketch below):
#(1) pick k of the n observations as the initial cluster centers (one per cluster);
#(2) assign every remaining observation to the center it is closest to (by distance);
#(3) recompute the center (mean) of each cluster;
#(4) if the centers no longer change (or another stop criterion is met), stop;
#    otherwise go back to step (2).
############################
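# A minimal numpy sketch of the loop described above (illustrative only - no
# empty-cluster handling or convergence test; sklearn's KMeans below is the real tool):
from numpy import array, argmin
from numpy.random import permutation

def kmeans_sketch(X, k, n_iter=10):
    centers = X[permutation(len(X))[:k]] # (1) pick k observations as initial centers
    for _ in range(n_iter):              # (4) repeat a fixed number of rounds
        # (2) assign every observation to its nearest center (squared Euclidean distance)
        labels = array([argmin(((centers - xi)**2).sum(axis=1)) for xi in X])
        # (3) recompute each center as the mean of its assigned observations
        centers = array([X[labels == j].mean(axis=0) for j in range(k)])
    return labels

print kmeans_sketch(data, 3)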


from sklearn.cluster import KMeans
kms = KMeans(n_clusters=3) # initialization: ask for 3 clusters, since we know iris has 3 classes
#kmeans = KMeans(k=3, init='random') # k= is not a valid parameter name here; it must be n_clusters=
kms.fit(data) # actual execution
c = kms.predict(data)

from sklearn.metrics import completeness_score, homogeneity_score
print completeness_score(t,c)
#output:0.764986151449
print homogeneity_score(t,c)
#output:0.751485402199

#Careful! t labels the three classes as 1,2,3, while k-means assigns its own arbitrary ids (here 0,1,2).
#completeness approaches 1 when all members of a given class end up in the same cluster;
#homogeneity approaches 1 when every cluster contains only members of a single class.
figure()
subplot(211) # top figure with the real classes
plot(data[t==1,0],data[t==1,2],'bo')
plot(data[t==2,0],data[t==2,2],'ro')
plot(data[t==3,0],data[t==3,2],'go')
subplot(212) # bottom figure with classes assigned automatically
plot(data[c==1,0],data[c==1,2],'bo',alpha=.5)
plot(data[c==2,0],data[c==2,2],'go',alpha=.5)
plot(data[c==0,0],data[c==0,2],'mo',alpha=.5)
show()

# The top panel shows the true classes, the bottom the k-means clusters;
# apart from the arbitrary cluster ids and colors, k-means recovers the class
# structure quite well.

# Now plot all 4 features: per class, sepal as circles and petal as pluses,
# with the true classes on top and the clusters found by k-means below.
import matplotlib.pyplot as plt
plt.figure()
plt.subplot(211) # top figure with the real classes
plt.plot(data[t==1,0],data[t==1,1],'bo',data[t==1,2],data[t==1,3],'b+')
plt.plot(data[t==2,0],data[t==2,1],'ro',data[t==2,2],data[t==2,3],'r+')
plt.plot(data[t==3,0],data[t==3,1],'go',data[t==3,2],data[t==3,3],'g+')
plt.subplot(212) # bottom figure with classes assigned automatically
plt.plot(data[c==0,0],data[c==0,1],'bo',data[c==0,2],data[c==0,3],'b+',alpha=.7)
plt.plot(data[c==1,0],data[c==1,1],'ro',data[c==1,2],data[c==1,3],'r+',alpha=.7)
plt.plot(data[c==2,0],data[c==2,1],'go',data[c==2,2],data[c==2,3],'g+',alpha=.7)
p=plt
fig=plt.gcf()
fig.show() # p.show() works too; either way the current figure is displayed.


###########################
#(4) Regression
###########################

# Regression builds a model that predicts a continuous value from one or more
# explanatory variables. The simplest case is linear regression: fit a straight
# line through the data points so that the overall error is as small as possible.


##############
#sklearn.linear_model provides the LinearRegression class.
#
#First try it on synthetic data, then on the iris measurements themselves.
#
##############

# Simulate a data set: 40 points that follow a cubic trend plus some noise

#Step1 - generate the 40 random points
from numpy.random import rand
x = rand(40,1) # explanatory variable
y = x*x*x+rand(40,1)/5 # dependent variable

#Step2 - fit the regression model
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(x,y)

#Step3 - predict y over evenly spaced x values to draw the fitted line over the data
from numpy import linspace, matrix
# 40 points evenly spaced between 0 and 1
randx = linspace(0,1,40)
# plot the original points as circles and the predictions over randx as a
# dashed red line, to see how closely the fit tracks the data
plot(x,y,'o',randx,linreg.predict(matrix(randx).T),'--r')
show()

#Step4 - quantify the fit with the mean squared error (MSE); the closer to 0 the better
from sklearn.metrics import mean_squared_error
print mean_squared_error(linreg.predict(x),y)
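# Sanity check (a sketch): the same line can be recovered with numpy.polyfit at degree 1
from numpy import polyfit
slope, intercept = polyfit(x[:,0], y[:,0], 1)
print slope, intercept # should agree with linreg.coef_ and linreg.intercept_
print linreg.coef_, linreg.intercept_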

#########################
# Now apply linear regression to the iris measurements themselves
#########################
# x and y must be reshaped from (50,) to (50,1), as required by linreg.fit!
ssx_blue=data[target=='setosa',0].reshape((50,1)) # setosa sepal length
ssy_blue=data[target=='setosa',1].reshape((50,1)) # setosa sepal width

# fit a straight line relating sepal width to sepal length
linreg = LinearRegression()
linreg.fit(ssx_blue,ssy_blue)

# As before, predict y over evenly spaced x values to draw the fitted line
# (from the earlier scatter plots, setosa sepals span roughly X:[4.0-6.0], Y:[2.5-4.5])
randx = linspace(4.0,6.0,50)
plot(ssx_blue,ssy_blue,'o',randx,linreg.predict(matrix(randx).T),'--r')
show()

# again judge the fit with MSE; the closer to 0 the better
print mean_squared_error(linreg.predict(ssx_blue),ssy_blue)


###########################
#(5) Correlation analysis
###########################

# Correlation analysis asks how strongly two features vary together: a strong
# correlation means knowing one feature tells you a lot about the other.
# This helps to judge which of the 4 iris features carry redundant information:
# strongly correlated features largely duplicate each other, while weakly
# correlated features each contribute independent information to a model.

# The Pearson correlation coefficient measures the linear correlation between two variables.
# 1 means perfectly positively correlated, 0 uncorrelated, -1 perfectly negatively correlated.

# From the matrix printed below, the strongest relationship is between
# petal width and petal length.

from numpy import corrcoef
corr = corrcoef(data.T) # .T gives the transpose
print corr
#output:[[ 1.         -0.10936925  0.87175416  0.81795363]
#output: [-0.10936925  1.         -0.4205161  -0.35654409]
#output: [ 0.87175416 -0.4205161   1.          0.9627571 ]
#output: [ 0.81795363 -0.35654409  0.9627571   1.        ]]
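# Pearson's r for one pair by hand (a sketch): covariance divided by the product
# of the standard deviations (ddof=1 to match numpy's cov normalization)
from numpy import cov, std
sl, pl = data[:,0], data[:,2] # sepal length vs petal length
print cov(sl, pl)[0,1] / (std(sl, ddof=1) * std(pl, ddof=1)) # should match corr[0][2] ~ 0.8718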

from pylab import pcolor, colorbar, xticks, yticks
from numpy import arange
pcolor(corr) # draw the correlation matrix as a colored grid; 4 features give 4x4 cells
colorbar() # add the color scale alongside
# put the feature names on the X and Y axes (tick positions 1,2,3,4)
xticks(arange(1,5),['sepal length',  'sepal width', 'petal length', 'petal width'],rotation=-20)
yticks(arange(1,5),['sepal length',  'sepal width', 'petal length', 'petal width'],rotation=-45)
show()


###########################
#(6) Dimensionality reduction (compression)
# using Principal Component Analysis (PCA)
###########################


from sklearn.decomposition import PCA
# Reduce the number of features (dimensions) while keeping most of the information.
# 3D data can still be plotted (awkwardly) and 2D easily, but 4D data cannot be
# drawn directly, so here the 4 iris features are projected onto 2 principal components.
# Trade-off: fewer dimensions are easier to visualize and work with, at the cost of some information.
pca = PCA(n_components=2)

pcad = pca.fit_transform(data)

plot(pcad[target=='setosa',0],pcad[target=='setosa',1],'bo')
plot(pcad[target=='versicolor',0],pcad[target=='versicolor',1],'ro')
plot(pcad[target=='virginica',0],pcad[target=='virginica',1],'go')
show()

# how much of the variance each principal component explains
print pca.explained_variance_ratio_
#output: [ 0.92461621  0.05301557]
pc1, pc2 = pca.explained_variance_ratio_ # unpack the 2 PCs

# the fraction of information lost in the projection
print 1-sum(pca.explained_variance_ratio_)
#output:0.0223682249752
print 1.0-pc1-pc2 # the same number, computed by hand

# map the reduced data back into the original 4-feature space
data_inv = pca.inverse_transform(pcad)
# compare the reconstruction with the original data
# (note: summing the signed differences lets positive and negative errors cancel)
print abs(sum(sum(data - data_inv)))
#output:6.66133814775e-15

# Finally, sweep the number of PCs from 1 to 4 (the data has only 4 features)
# and print how much variance PCA retains; 4 components keep 100%, and 3 come close:
for i in range(1,5):
    pca = PCA(n_components=i)
    pca.fit(data)
    print sum(pca.explained_variance_ratio_) * 100,'%'

#output:92.4616207174 %
#output:97.7631775025 %
#output:99.481691455 %
#output:100.0 %
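# The same ratios by hand (a sketch): they are the normalized eigenvalues of the
# covariance matrix of the features
from numpy import cov, linalg
eigvals = sorted(linalg.eigvalsh(cov(data.T)), reverse=True)
print [v/sum(eigvals) for v in eigvals] # should match the per-component ratios above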

Source: http://www.cnblogs.com/taichu/p/5251332.html
