Python machine learning with sklearn: an introduction to the iris dataset
###########################
# Notes:
# This walkthrough is reworked from the article at
# "http://python.jobbole.com/83563/": the code was retyped, annotated,
# run step by step, and the outputs recorded in '#output:' comments.
# It is written so that even a reader starting Python from zero can
# follow along by executing every line!
# Companion notes on the dataset and environment are at
# "http://www.cnblogs.com/taichu/p/5216659.html".
###########################
###########################
#(0) How to read this walkthrough
# 1. Run the code below step by step (an interactive console works best)
#    and compare your results with the '#output:' comments.
# 2. Adjust file paths and package versions to your own machine;
#    the details matter, so do run everything rather than just reading!
###########################
###########################
#(1) Getting the dataset (environment setup and download)
# Goal: install the required libraries, then download iris.csv locally.
###########################
##################
# On Ubuntu 15.10 the libraries used below can be installed like this:
#sudo apt-get install python            # installs Python (2.7 by default)
#sudo apt-get install python-numpy      # numpy for Python
#sudo apt-get install python-matplotlib
#sudo apt-get install python-networkx
#sudo apt-get install python-sklearn
#python    # start the interpreter to confirm the install works
# Alternatively, install the Anaconda Python IDE, which bundles numpy,
# scipy, sklearn and friends in one download.
# Note: this walkthrough ran on Ubuntu 15.10 inside a VM under WIN10;
# install the Python libs above first!
##################
import urllib2
url = 'http://aima.cs.berkeley.edu/data/iris.csv'
u = urllib2.urlopen(url)
# save the downloaded CSV locally; adjust the path to your machine!
#localfn='/mnt/hgfs/sharedfolder/iris.csv' #for linux
localfn='D:\\Virtual Machines\\sharedfolder\\iris.csv' #for windows
localf = open(localfn, 'wb') # binary mode avoids newline translation on Windows
localf.write(u.read())
localf.close()
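# For reference, on Python 3 the same download would use urllib.request
# instead of urllib2 (an untested sketch; this walkthrough targets 2.7):
#   from urllib.request import urlopen
#   with open(localfn, 'wb') as f:
#       f.write(urlopen(url).read())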
# data examples
#COL1  COL2  COL3  COL4  COL5
#5.1   3.5   1.4   0.2   setosa
#...   ...   ...   ...   ...
#4.7   3.2   1.3   0.2   setosa
#7     3.2   4.7   1.4   versicolor
#...   ...   ...   ...   ...
#6.9   3.1   4.9   1.5   versicolor
#6.3   3.3   6     2.5   virginica
#...   ...   ...   ...   ...
#7.1   3     5.9   2.1   virginica
#############################
#U can get description of 'iris.csv'
#at 'http://aima.cs.berkeley.edu/data/iris.txt'
#Definition of COLs:
#1. sepal length in cm
#2. sepal width in cm
#3. petal length in cm
#4. petal width in cm
#5. class:
# -- Iris Setosa
# -- Iris Versicolour
# -- Iris Virginica
#Missing Attribute Values: None
#################################
from numpy import genfromtxt, zeros
# read the first 4 columns
data = genfromtxt(localfn,delimiter=',',usecols=(0,1,2,3))
# read the fifth column
target = genfromtxt(localfn,delimiter=',',usecols=(4),dtype=str)
print data.shape
# output: (150, 4)
print target.shape
# output: (150,)
#auto build a collection of unique elements
print set(target)
# output: set(['setosa', 'versicolor', 'virginica'])
#print set(data) # wrong usage of set: rows of a 2D array are unhashable
######################
#plot style strings:
#'bo'=blue circle; 'r+'=red plus; 'g*'=green star
#search the web for 'matplotlib plot styles' for the full list
#http://www.360doc.com/content/15/0113/23/16740871_440559122.shtml
#http://zhidao.baidu.com/link?url=6JA9-A-UT3kmslX1Ba5uTY1718Xh-OgebUJVuOs3bdzfnt4jz4XXQdAmvb7R5JYMHyRbBU0MYr-OtXPyKxnxXsPPkm9u5qAciwxIVACR8k7
######################
#figure for 2D data
from pylab import plot, show
plot(data[target=='setosa',0],data[target=='setosa',2],'bo')
plot(data[target=='versicolor',0],data[target=='versicolor',2],'r+')
plot(data[target=='virginica',0],data[target=='virginica',2],'g*')
show()
# Note: in a plain Ubuntu python session the script blocks here until the
# figure window is closed before the run continues; in Anaconda Spyder
# (Python 2.7) the figure is drawn inline in the console instead, which
# is much more convenient!
#figure for all 4D (all 4 features) data: sepal length/width as circles,
#petal length/width as plus signs, one color per class
setosa_sepal_x=ssx=data[target=='setosa',0]
setosa_sepal_y=ssy=data[target=='setosa',1]
setosa_petal_x=spx=data[target=='setosa',2]
setosa_petal_y=spy=data[target=='setosa',3]
versicolor_sepal_x=vsx=data[target=='versicolor',0]
versicolor_sepal_y=vsy=data[target=='versicolor',1]
versicolor_petal_x=vpx=data[target=='versicolor',2]
versicolor_petal_y=vpy=data[target=='versicolor',3]
virginica_sepal_x=vgsx=data[target=='virginica',0]
virginica_sepal_y=vgsy=data[target=='virginica',1]
virginica_petal_x=vgpx=data[target=='virginica',2]
virginica_petal_y=vgpy=data[target=='virginica',3]
plot(ssx,ssy,'bo',spx,spy,'b+')
plot(vsx,vsy,'ro',vpx,vpy,'r+')
plot(vgsx,vgsy,'go',vgpx,vgpy,'g+')
show()
#figure for 1D (a single feature): per-class histograms of sepal length
#pylab plotting reference:
#http://hyry.dip.jp/tech/book/page/scipy/matplotlib_fast_plot.html
from pylab import figure, subplot, hist, xlim, show
xmin = min(data[:,0])
xmax = max(data[:,0])
figure() # open a new figure window before laying out the subplots
subplot(411) # distribution of the setosa class (1st, on the top)
hist(data[target=='setosa',0],color='b',alpha=.7)
xlim(xmin,xmax)
#subplot(rows, cols, index); (4,1,2) may be abbreviated to 412 when all three numbers are below 10
subplot(412) # distribution of the versicolor class (2nd)
hist(data[target=='versicolor',0],color='r',alpha=.7)
xlim(xmin,xmax)
subplot(413) # distribution of the virginica class (3rd)
hist(data[target=='virginica',0],color='g',alpha=.7)
xlim(xmin,xmax)
subplot(414) # global histogram (4th, on the bottom)
hist(data[:,0],color='y',alpha=.7)
xlim(xmin,xmax)
show()
###########################
#(2) Classification
# Train a Gaussian Naive Bayes classifier and evaluate it
# (train / predict / score).
###########################
# map the string target (one column) onto a numeric vector t of zeros
t = zeros(len(target))
#type(t) #show type of t (numpy.ndarray)
#print t #show contents of t
# encode the three classes as 1/2/3
t[target == 'setosa'] = 1
t[target == 'versicolor'] = 2
t[target == 'virginica'] = 3
#print t
# train a Gaussian Naive Bayes classifier on the data
from sklearn.naive_bayes import GaussianNB
classifier = cf = GaussianNB()
cf.fit(data,t) # training on the iris dataset
print cf.predict(data[0:1]) # predict the 1st sample (a 2D slice keeps sklearn happy)
#output:[ 1.]
print t[0]
#output:1.0
# split data (and t in lockstep) into a 60% training and 40% test set
from sklearn import cross_validation
train, test, t_train, t_test = cross_validation.train_test_split(data, t, \
test_size=0.4, random_state=0)
print train.shape
#output:(90, 4)
print test.shape
#output:(60, 4)
print t_train.shape
#output:(90,)
print t_test.shape
#output:(60,)
# train on 60% of the data, test on the held-out 40%: accuracy ~93.3%
cf.fit(train,t_train) # train
print cf.score(test,t_test) # test
#output:0.93333333333333335
cf.score(train,t_train) # even on its own training set the score is not 100%!
#output:0.97777777777777775
# train on all the data and score on all the data: ~96%
cf.fit(data,t)
#output:GaussianNB()
cf.score(data,t)
#output:0.95999999999999996
# train on 100% of the data, then score on the 40% test split: ~95%
cf.fit(data,t)
#output:GaussianNB()
cf.score(test,t_test)
#output:0.94999999999999996
#############################################################
#TODO: feature engineering experiments (left as exercises)
# The 4 features can be recombined before training. Some ideas:
#1. merge the two sepal measurements and the two petal measurements
#   into 2 features: sepal-size, petal-size
#2. merge the two length measurements and the two width measurements
#   into 2 features: whole-length, whole-width
#3. keep the original 4 features;
#4. use all 8 features above at once? Or add ratios such as
#   petal-length/sepal-length as extra features?
# Which combination scores best, and why? (see the sketch after this block)
#TRY: rerun the training and scoring above on each feature set and compare!
#############################################################
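# A minimal sketch of idea 1 above, assuming "merging" means multiplying
# length by width (one arbitrary choice: area-like 'size' features):
from numpy import column_stack
sepal_size = data[:,0] * data[:,1] # hypothetical combined feature
petal_size = data[:,2] * data[:,3] # hypothetical combined feature
data2 = column_stack((sepal_size, petal_size))
train2, test2, t_train2, t_test2 = cross_validation.train_test_split(data2, t, \
    test_size=0.4, random_state=0)
cf2 = GaussianNB()
cf2.fit(train2, t_train2)
print cf2.score(test2, t_test2) # compare against the ~93.3% baseline above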
# confusion matrix (note: the arguments below are (predicted, actual),
# so rows are predicted classes and columns are actual ones)
from sklearn.metrics import confusion_matrix
print confusion_matrix(cf.predict(test),t_test)
#output:[[16 0 0]
#output: [ 0 23 4]
#output: [ 0 0 17]]
#
# An illustrative confusion matrix (not produced by the code above):
#
#               predicted
#              1    2    3
# actual 1    43    5    2
# actual 2     2   45    3
# actual 3     0    1   49
#
# Reading the example:
# each actual class has 50 samples (every row sums to 50);
# class 3: 1 sample was mistaken for class 2;
# class 2: 2 samples were mistaken for class 1 and 3 for class 3;
# class 1: 5 samples were mistaken for class 2 and 2 for class 3.
#
#Precision: of the samples predicted as a class, the fraction that truly belongs to it
#Recall: of the samples truly in a class, the fraction that was found
#F1-Score: the harmonic mean of precision and recall
from sklearn.metrics import classification_report
print classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica'])
#output: precision recall f1-score support
#output: setosa 1.00 1.00 1.00 16
#output:versicolor 1.00 0.85 0.92 27
#output: virginica 0.81 1.00 0.89 17
#output:avg / total 0.95 0.93 0.93 60
##############################################################
# Aside: the four classical means, for reference
# Harmonic mean:   Hn = n/(1/a1+1/a2+...+1/an)
# Geometric mean:  Gn = (a1*a2*...*an)^(1/n)
# Arithmetic mean: An = (a1+a2+...+an)/n
# Quadratic mean:  Qn = sqrt((a1^2+a2^2+...+an^2)/n)
# They always satisfy Hn <= Gn <= An <= Qn.
#
# A classic harmonic-mean example:
# Q: a car drives 4 segments of equal length (1 km each) at 3, 4, 6 and
#    8 km/h; what is its average speed over the whole trip?
# A: total distance over total time:
#    4/(1/3+1/4+1/6+1/8) = 1/[(1/3+1/4+1/6+1/8)/4] ≈ 4.57 km/h
###########################################################
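# A quick numeric check of the harmonic-mean example above:
speeds = [3.0, 4.0, 6.0, 8.0]
hn = len(speeds) / sum(1.0/v for v in speeds)
print hn # ~4.57, matching the hand calculation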
# Scoring a model on the very data it was trained on is overly optimistic,
# and a single train/test split gives a score that depends on the split.
# A more robust evaluation technique is Cross Validation:
# split the data into k folds, hold each fold out once as the test set
# while training on the remaining folds, and combine the k scores.
# This way every sample is used for both training and testing.
#sklearn makes this a one-liner:
from sklearn.cross_validation import cross_val_score
# cross validation with 6 iterations
scores = cross_val_score(classifier, data, t, cv=6)
print scores
#output:[ 0.92592593 1. 0.91666667 0.91666667 0.95833333 1. ]
# With cv=6 the data is split into 6 folds and the model is scored 6
# times, once per held-out fold. Summarize with the mean accuracy:
from numpy import mean
print mean(scores)
#output:0.96
# Experiment: vary cv and watch how the mean score behaves.
# cv must be >= 2, otherwise: 'ValueError: k-fold cross validation requires at least one train / test split by setting n_folds=2 or more, got n_folds=1.'
# The largest usable cv equals the size of the smallest class in the
# ndarray (per class: t holds 50, t_train 27, t_test 16 samples), since
# the stratified folds need at least one sample of every class!
#1. on the full data set: cv from 2 to 50
for i in range(2, 51):
    scores = cross_val_score(classifier, data, t, cv=i)
    print mean(scores) # inside a for block, print is needed to see the values!
#2. on the test split: cv from 2 to 16
for i in range(2, 17): print mean(cross_val_score(classifier, test, t_test, cv=i))
#3. on the train split: cv from 2 to 27
for i in range(2, 28): print mean(cross_val_score(classifier, train, t_train, cv=i))
# To verify those bounds, count how many samples of each class each
# numpy.ndarray actually holds.
# per-class counts of the full target vector t
counts = {}
for item in t: counts[item] = counts.get(item, 0) + 1
# a one-line for loop; the dict maps class label -> sample count
print(counts)
#output:{1.0: 50, 2.0: 50, 3.0: 50}
# per-class counts of t_train
counts = {}
for item in t_train: counts[item] = counts.get(item, 0) + 1
print(counts)
#output:{1.0: 34, 2.0: 27, 3.0: 29}
# per-class counts of t_test
counts = {}
for item in t_test: counts[item] = counts.get(item, 0) + 1
print(counts)
#output:{1.0: 16, 2.0: 23, 3.0: 21}
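# The same counts in one call, assuming numpy >= 1.9 (where unique()
# gained the return_counts argument):
from numpy import unique
vals, cnts = unique(t, return_counts=True)
print dict(zip(vals, cnts))
#output:{1.0: 50, 2.0: 50, 3.0: 50}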
#***********************************
# Observation: as the training share grows from 1/n toward (n-1)/n, the
# mean cross-validation score tends to rise and then level off.
#TODO: study the model (the train/test trade-off) systematically:
#  sweep the cv parameter and the train/data ratio;
#  at which ratio does the model generalize best?
#  A figure with X = train/data ratio in (0,1) and Y = mean score in (0,1)
#  would make this visible; a sketch follows right after this block.
#TODO: with several knobs to tune, is there a pattern or best practice?
#  A toolbox that sweeps the knobs automatically and reports the best
#  configuration would be handy.
#***********************************
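# A rough sketch of the learning-curve idea above: sweep the training
# fraction, average the test accuracy over a few random splits, and plot
# fraction (X) against mean score (Y). The 10% steps and the 5-split
# average are arbitrary choices, not from the original text.
fractions = [i/10.0 for i in range(1, 10)] # train on 10% .. 90%
curve = []
for frac in fractions:
    fold_scores = []
    for seed in range(5): # average over 5 random splits per fraction
        tr, te, ttr, tte = cross_validation.train_test_split(
            data, t, test_size=1.0-frac, random_state=seed)
        fold_scores.append(GaussianNB().fit(tr, ttr).score(te, tte))
    curve.append(mean(fold_scores))
plot(fractions, curve, 'o-')
show()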
###########################
#(3) Clustering
###########################
#k-means clustering: partition n samples into k groups around k centers.
# Pros: simple and fast. Cons: k must be chosen up front and the result
# depends on the initial centers. It is "unsupervised": no labels are used.
#k-means algorithm (a numpy sketch follows this block):
#(1) choose k of the n samples as the initial cluster centers;
#(2) assign every sample to its nearest center (by distance);
#(3) recompute each center as the mean of the samples assigned to it;
#(4) repeat (2)-(3) until the centers stop moving (or an iteration limit is hit).
############################
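# A minimal numpy illustration of steps (1)-(4), written from scratch for
# intuition only (a fixed 10 rounds instead of a convergence test; assumes
# no cluster empties). The sklearn call below is what you'd actually use.
from numpy import argmin, array
from numpy.random import seed, permutation
seed(0)
k = 3
centers = data[permutation(len(data))[:k]] # step (1): k random samples as centers
for _ in range(10): # step (4): iterate a fixed number of rounds
    # step (2): assign each sample to its nearest center
    labels = array([argmin(((p - centers)**2).sum(axis=1)) for p in data])
    # step (3): move each center to the mean of its assigned samples
    centers = array([data[labels == j].mean(axis=0) for j in range(k)])
print centers # three 4D cluster centers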
from sklearn.cluster import KMeans
kms = KMeans(n_clusters=3) # initialize with 3 clusters, since the data has 3 classes
#kmeans = KMeans(k=3, init='random') # wrong in current sklearn: the parameter is n_clusters, not k
kms.fit(data) # actual execution
c = kms.predict(data)
from sklearn.metrics import completeness_score, homogeneity_score
print completeness_score(t,c)
#output:0.764986151449
print homogeneity_score(t,c)
#output:0.751485402199
# Careful! t encodes the true classes as 1,2,3, while KMeans numbers its
# clusters 0,1,2 in arbitrary order, so matching colors between the two
# subplots below is a manual exercise; compare cluster shapes instead.
figure()
subplot(211) # top figure with the real classes
plot(data[t==1,0],data[t==1,2],'bo')
plot(data[t==2,0],data[t==2,2],'ro')
plot(data[t==3,0],data[t==3,2],'go')
subplot(212) # bottom figure with classes assigned automatically
plot(data[c==1,0],data[c==1,2],'bo',alpha=.5)
plot(data[c==2,0],data[c==2,2],'go',alpha=.5)
plot(data[c==0,0],data[c==0,2],'mo',alpha=.5)
show()
# The paired subplots show that k-means recovers the three iris groups
# fairly well even without seeing any labels. Note that kmeans was fit on
# all 4 features while only 2 of them are drawn above;
# the next figure draws all 4 features (circles for sepal, plus signs for
# petal) so the assignment can be inspected on every measurement.
import matplotlib.pyplot as plt
plt.figure()
plt.subplot(211) # top figure with the real classes
plt.plot(data[t==1,0],data[t==1,1],'bo',data[t==1,2],data[t==1,3],'b+')
plt.plot(data[t==2,0],data[t==2,1],'ro',data[t==2,2],data[t==2,3],'r+')
plt.plot(data[t==3,0],data[t==3,1],'go',data[t==3,2],data[t==3,3],'g+')
plt.subplot(212) # bottom figure with classes assigned automatically
plt.plot(data[c==0,0],data[c==0,1],'bo',data[c==0,2],data[c==0,3],'b+',alpha=.7)
plt.plot(data[c==1,0],data[c==1,1],'ro',data[c==1,2],data[c==1,3],'r+',alpha=.7)
plt.plot(data[c==2,0],data[c==2,1],'go',data[c==2,2],data[c==2,3],'g+',alpha=.7)
p=plt
fig=plt.gcf()
fig.show() # plt.show() would work too; it blocks until the window is closed.
###########################
#(4) Regression
###########################
# Regression models the relation between a dependent variable and one or
# more explanatory variables: fit a curve through the observed points so
# that values at unseen inputs can be predicted;
# the better the fit, the more trustworthy the predictions.
##############
#sklearn.linear_model provides LinearRegression, which fits a straight
# line by ordinary least squares. sklearn has more flexible regressors
# too; the linear one is enough for this demo.
##############
# Demo: make 40 noisy samples of y = x^3, fit a line through them,
# and compare the fitted line against the points.
#Step1- generate the 40 sample points
from numpy.random import rand
x = rand(40,1) # explanatory variable
y = x*x*x+rand(40,1)/5 # dependent variable
#Step2- fit a linear regression model to the points
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(x,y)
#Step3- predict y on a grid of x values and overlay the fitted line
from numpy import linspace, matrix
# 40 evenly spaced points between 0 and 1
randx = linspace(0,1,40)
# circles: the 40 original (x,y) samples; dashed red line: the model's
# predictions over randx (matrix(randx).T reshapes it into a column)
plot(x,y,'o',randx,linreg.predict(matrix(randx).T),'--r')
show()
#Step4- measure the fit with the mean squared error (MSE); 0 means a perfect fit
from sklearn.metrics import mean_squared_error
print mean_squared_error(linreg.predict(x),y)
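# The fitted line itself can also be inspected: y ≈ coef_*x + intercept_
print linreg.coef_, linreg.intercept_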
#########################
# The same recipe on real iris data
#########################
# use setosa sepal length as x and sepal width as y (reshaped from (50,)
# to (50,1), otherwise linreg.fit rejects the 1D arrays!)
ssx_blue=data[target=='setosa',0].reshape((50,1)) # setosa sepal length
ssy_blue=data[target=='setosa',1].reshape((50,1)) # setosa sepal width
# fit the line on those 50 points
linreg = LinearRegression()
linreg.fit(ssx_blue,ssy_blue)
# predict on evenly spaced x values and overlay the fitted line
# (setosa sepals lie roughly in X:[4.0-6.0], Y:[2.5-4.5])
randx = linspace(4.0,6.0,50)
plot(ssx_blue,ssy_blue,'o',randx,linreg.predict(matrix(randx).T),'--r')
show()
# MSE of the fit; 0 would be a perfect fit
print mean_squared_error(linreg.predict(ssx_blue),ssy_blue)
###########################
#(5) Correlation
###########################
# Correlation measures how strongly two features vary together, i.e. how
# much knowing one feature tells you about another.
# Here the (Pearson) correlation matrix of the 4 iris features is
# computed and rendered as a heatmap.
# Reading the coefficient:
# 1 means perfectly correlated, 0 uncorrelated, -1 perfectly anti-correlated.
# From the matrix below, the strongest relation is between
# "petal width" and "petal length".
from numpy import corrcoef
corr = corrcoef(data.T) # .T gives the transpose
print corr
#output:[[ 1. -0.10936925 0.87175416 0.81795363]
#output: [-0.10936925 1. -0.4205161 -0.35654409]
#output: [ 0.87175416 -0.4205161 1. 0.9627571 ]
#output: [ 0.81795363 -0.35654409 0.9627571 1. ]]
from pylab import pcolor, colorbar, xticks, yticks
from numpy import arange
pcolor(corr) # draw the 4x4 correlation matrix as a colored grid
colorbar() # add the color scale legend
# label the X and Y axes with the 4 feature names (at tick positions 1..4)
xticks(arange(1,5),['sepal length', 'sepal width', 'petal length', 'petal width'],rotation=-20)
yticks(arange(1,5),['sepal length', 'sepal width', 'petal length', 'petal width'],rotation=-45)
show()
###########################
#(6) Dimensionality reduction (PCA)
# Principal Component Analysis projects the data onto fewer dimensions
###########################
from sklearn.decomposition import PCA
# Fewer features make the data drawable: 3D data can be plotted
# directly, 4D cannot, so the 4 iris features are projected down to
# 2 principal components here.
# Note: each component is a combination of the original features, and
# some information is necessarily lost in the projection.
pca = PCA(n_components=2)
pcad = pca.fit_transform(data)
plot(pcad[target=='setosa',0],pcad[target=='setosa',1],'bo')
plot(pcad[target=='versicolor',0],pcad[target=='versicolor',1],'ro')
plot(pcad[target=='virginica',0],pcad[target=='virginica',1],'go')
show()
# how much of the variance each principal component explains
print pca.explained_variance_ratio_
#output: [ 0.92461621 0.05301557]
pc1, pc2 = pca.explained_variance_ratio_ # unpack the 2 PC ratios
print 1-sum(pca.explained_variance_ratio_)
#output:0.0223682249752
print 1.0-pc1-pc2 # same thing: the fraction of variance lost in the projection
# map the 2D principal-component points back into the original 4D space
data_inv = pca.inverse_transform(pcad)
# the summed signed residual is almost zero (individual values still differ)
print abs(sum(sum(data - data_inv)))
#output:6.66133814775e-15
# Try every component count from 1 to 4 and print the preserved variance;
# 4 components keep 100%, and already 3 keep almost everything:
for i in range(1,5):
    pca = PCA(n_components=i)
    pca.fit(data)
    print sum(pca.explained_variance_ratio_) * 100,'%'
#output:92.4616207174 %
#output:97.7631775025 %
#output:99.481691455 %
#output:100.0 %
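# In newer sklearn versions, n_components may also be a float in (0,1):
# PCA then keeps just enough components to reach that fraction of the
# variance (version-dependent behavior; treat this as a sketch).
pca95 = PCA(n_components=0.95)
pca95.fit(data)
print pca95.n_components_ # 2 components already explain >= 95% here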
Source: http://www.cnblogs.com/taichu/p/5251332.html