Data analysis – the iris dataset

%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris

data = load_iris()
features = data.data      # 150 x 4 array of measurements
targets = data.target     # class labels 0, 1, 2

# Scatter plots: sepal length vs sepal width ('o') and petal length vs petal width ('+'), one color per class
plt.plot(features[targets==0,0], features[targets==0,1], 'bo', features[targets==0,2], features[targets==0,3], 'b+')
plt.plot(features[targets==1,0], features[targets==1,1], 'go', features[targets==1,2], features[targets==1,3], 'g+')
plt.plot(features[targets==2,0], features[targets==2,1], 'ro', features[targets==2,2], features[targets==2,3], 'r+')

# Histograms of sepal length: one panel per class, plus all samples combined
plt.figure()
xmin = min(features[:,0])
xmax = max(features[:,0])
plt.subplot(411)
plt.hist(features[targets==0,0], color='b', alpha=.7)
plt.xlim(xmin, xmax)
plt.subplot(412)
plt.hist(features[targets==1,0], color='r', alpha=.7)
plt.xlim(xmin, xmax)
plt.subplot(413)
plt.hist(features[targets==2,0], color='y', alpha=.7)
plt.xlim(xmin, xmax)
plt.subplot(414)
plt.hist(features[:,0], color='g', alpha=.7)
plt.xlim(xmin, xmax)

Using a naive Bayes classifier

from sklearn.naive_bayes import GaussianNB
cf = GaussianNB()                 # Gaussian naive Bayes classifier
cf.fit(features, targets)         # train on the full dataset
print(cf.predict(features))       # predicted labels
print(targets)                    # true labels, for comparison

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
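Comparing the two arrays by eye shows a handful of samples from classes 1 and 2 being confused. As a minimal sketch (not part of the original post), scikit-learn's confusion matrix summarizes these errors in one table:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(targets, cf.predict(features)))   # rows: true class, columns: predicted class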

Splitting into training and validation sets

# train_test_split now lives in sklearn.model_selection (formerly sklearn.cross_validation)
from sklearn.model_selection import train_test_split
train, test, t_train, t_test = train_test_split(features, targets, test_size=0.4, random_state=0)
cf.fit(train, t_train)
t_score = cf.score(test, t_test)        # accuracy on the held-out 40%
print(t_score)
train_score = cf.score(train, t_train)  # accuracy on the training 60%
print(train_score)


0.933333333333
0.977777777778
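A single split depends on the chosen random_state. As an extra step not in the original post, cross_val_score averages the accuracy over several folds for a more stable estimate:

from sklearn.model_selection import cross_val_score
import numpy as np
scores = cross_val_score(cf, features, targets, cv=5)   # accuracy on each of 5 folds
print(scores)
print(np.mean(scores))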

Dimensionality reduction with PCA

from sklearn.decomposition import PCA
pca = PCA(n_components=2)                  # keep the first two principal components
pcad = pca.fit_transform(features)         # project the 4-D data onto 2-D
plt.plot(pcad[targets==0,0], pcad[targets==0,1], 'bo')   # class 0 in the reduced space
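As a quick check (an addition, not from the original post), the fitted model's explained_variance_ratio_ attribute shows how much of the total variance the two components retain:

print(pca.explained_variance_ratio_)        # variance fraction per component
print(sum(pca.explained_variance_ratio_))   # total variance kept by the 2-D projection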

Clustering

from sklearn.cluster import KMeans
kms = KMeans(n_clusters=3)            # look for 3 clusters, one per species
kms.fit(features)
c = kms.predict(features)             # cluster assignment for each sample

from sklearn.metrics import completeness_score, homogeneity_score
print(completeness_score(targets, c))
print(homogeneity_score(targets, c))

0.764986151449
0.751485402199
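Completeness and homogeneity compare the clusters against the known labels. As an extra sketch not in the original, the silhouette score evaluates the clustering from the data alone, without using targets:

from sklearn.metrics import silhouette_score
print(silhouette_score(features, c))   # closer to 1 means tighter, better-separated clusters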

Correlation analysis

from numpy import corrcoef
corr = corrcoef(features.T)   # 4x4 correlation matrix between the four features
print(corr)

[[ 1.         -0.10936925  0.87175416  0.81795363]
 [-0.10936925  1.         -0.4205161  -0.35654409]
 [ 0.87175416 -0.4205161   1.          0.9627571 ]
 [ 0.81795363 -0.35654409  0.9627571   1.        ]]
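The matrix shows petal length and petal width are strongly correlated (0.96), while sepal width correlates weakly and negatively with the other features. A small sketch (an addition to the original post) visualizes the matrix as a heat map, using the feature names provided by load_iris:

import numpy as np
plt.pcolor(corr)                     # color-coded 4x4 correlation matrix
plt.colorbar()
ticks = np.arange(0.5, 4.5)          # center the tick labels on the cells
plt.xticks(ticks, data.feature_names, rotation=45)
plt.yticks(ticks, data.feature_names)
plt.show()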