文章目錄
一.Iris數據集的 Fisher線性分類判斷準確率
1.代碼:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the Iris data set; the first row of the CSV holds the column names.
path = r'Iris.csv'
df = pd.read_csv(path, header=0)

# The file's columns are: Id, SepalLengthCm, SepalWidthCm, PetalLengthCm,
# PetalWidthCm, Species.  The four measurements are therefore columns 1..4.
# Slicing 0:4 would use the running Id (sorted by class, so it leaks the
# label) as a feature and silently drop the petal width.
# df.values is an object array because of the Species column, so the
# measurements are cast to float once here.
Iris1 = df.values[0:50, 1:5].astype(float)     # rows 0-49:    first species
Iris2 = df.values[50:100, 1:5].astype(float)   # rows 50-99:   second species
Iris3 = df.values[100:150, 1:5].astype(float)  # rows 100-149: third species

# Class means are estimated from the first 30 rows of each class only: those
# are the rows the scatter matrices are accumulated over, and rows 30+ are
# held out for testing, so they must not influence the model.
m1 = np.mean(Iris1[0:30], axis=0)
m2 = np.mean(Iris2[0:30], axis=0)
m3 = np.mean(Iris3[0:30], axis=0)
def _scatter_matrix(samples, mean):
    """Return the 4x4 sum of (x - mean)^T (x - mean) over the rows of samples."""
    total = np.zeros((4, 4))
    for row in samples:
        # Wrap the deviation in a list so it becomes a 1x4 row vector whose
        # transpose is a genuine 4x1 column vector.
        deviation = np.array([row - mean])
        total = total + np.dot(deviation.T, deviation)
    return total


# Within-class scatter of each class, accumulated over its first 30 rows.
s1 = _scatter_matrix(Iris1[0:30], m1)
s2 = _scatter_matrix(Iris2[0:30], m2)
s3 = _scatter_matrix(Iris3[0:30], m3)

# Pooled within-class scatter for every pair of classes.
sw12 = s1 + s2
sw13 = s1 + s3
sw23 = s2 + s3
def _fisher_pair(sw, mean_a, mean_b):
    """Compute the Fisher discriminant for one pair of classes.

    Parameters:
        sw: pooled within-class scatter matrix of the two classes.
        mean_a, mean_b: mean vectors of the two classes.

    Returns:
        (sw, w, threshold) where sw is the scatter matrix cast to float,
        w = (sw^-1 (mean_a - mean_b))^T is the projection direction as a
        row vector, and
        threshold = -0.5 * (mean_a + mean_b) sw^-1 (mean_a - mean_b).
    """
    sw = np.array(sw, dtype='float')
    # Invert once and reuse it for both the direction and the threshold.
    sw_inv = np.linalg.inv(sw)
    # The mean difference must be 2-D before it can be transposed into a
    # column vector.
    mean_diff = np.array([mean_a - mean_b]).T
    w = np.dot(sw_inv, mean_diff).T
    threshold = -0.5 * np.dot(np.dot(mean_a + mean_b, sw_inv), mean_diff)
    return sw, w, threshold


# Projection direction w and threshold T for each pair of classes.
sw12, w12, T12 = _fisher_pair(sw12, m1, m2)
sw13, w13, T13 = _fisher_pair(sw13, m1, m3)
sw23, w23, T23 = _fisher_pair(sw23, m2, m3)
def _classify(sample):
    """Return the predicted class (1, 2 or 3) of a 1x4 sample, or None.

    Each pairwise discriminant g_ij(x) = w_ij . x + T_ij is positive when the
    sample falls on the side of class i.  None is returned when the three
    pairwise decisions do not agree on a single class.
    """
    g12 = np.dot(w12, sample.T) + T12
    g13 = np.dot(w13, sample.T) + T13
    g23 = np.dot(w23, sample.T) + T23
    if g12 > 0 and g13 > 0:
        return 1
    if g12 < 0 and g23 > 0:
        return 2
    if g13 < 0 and g23 < 0:
        return 3
    return None


# Samples grouped by the class they were assigned to.
newiris1 = []
newiris2 = []
newiris3 = []
assigned = {1: newiris1, 2: newiris2, 3: newiris3}
# Number of correctly classified test samples per true class.
hits = {1: 0, 2: 0, 3: 0}

for true_class, class_samples in ((1, Iris1), (2, Iris2), (3, Iris3)):
    # Rows 30..49 of every class are the held-out test samples.  The upper
    # bound is 50 for all three classes so that 20 samples per class
    # (60 in total) are evaluated; a bound of 49 would skip the last row.
    for i in range(30, 50):
        x = np.array([class_samples[i, :]])
        predicted = _classify(x)
        if predicted is None:
            continue
        assigned[predicted].extend(x)
        if predicted == true_class:
            hits[true_class] += 1

kind1 = hits[1]
kind2 = hits[2]
kind3 = hits[3]
# Row range and scatter style of each species; the third species uses the
# default marker.
species_styles = (
    (slice(0, 50), {'color': 'red', 'marker': 'o', 'label': 'setosa'}),
    (slice(50, 100), {'color': 'blue', 'marker': 'x', 'label': 'versicolor'}),
    (slice(100, 150), {'color': 'green', 'label': 'virginica'}),
)

# Scatter plot of petal length (column 3) against sepal length (column 1).
for rows, style in species_styles:
    plt.scatter(df.values[rows, 3], df.values[rows, 1], **style)
plt.xlabel('petal length')
plt.ylabel('sepal length')
plt.title("花瓣與花萼長度的散點圖")
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese title
plt.rcParams['axes.unicode_minus'] = False
plt.legend(loc='upper left')
plt.show()

# Scatter plot of petal width (column 4) against sepal width (column 2).
for rows, style in species_styles:
    plt.scatter(df.values[rows, 4], df.values[rows, 2], **style)
plt.xlabel('petal width')
plt.ylabel('sepal width')
plt.title("花瓣與花萼寬度的散點圖")
plt.legend(loc='upper left')
plt.show()
# Fraction of correctly classified samples, relative to 60 test samples.
correct = (kind1 + kind2 + kind3) / 60

separator = '-----------------------------------------------------------------------------------------------'

# Within-class scatter matrix of each class.
for caption, matrix in (
    ("樣本類內離散度矩陣S1:", s1),
    ("樣本類內離散度矩陣S2:", s2),
    ("樣本類內離散度矩陣S3:", s3),
):
    print(caption, matrix, '\n')
print(separator)

# Pooled within-class scatter matrix of each pair of classes.
for caption, matrix in (
    ("總體類內離散度矩陣Sw12:", sw12),
    ("總體類內離散度矩陣Sw13:", sw13),
    ("總體類內離散度矩陣Sw23:", sw23),
):
    print(caption, matrix, '\n')
print(separator)

# Overall accuracy as a percentage.
print('判斷出來的綜合正確率:', correct * 100, '%')
2.結果:
二.數據可視化
1.數據顯示
import pandas as pd
df_Iris = pd.read_csv(r'Iris.csv')
separator = '-----------------------------------------------------------------------------------------------'

# Show the first five and the last five rows, each followed by a rule.
for preview in (df_Iris.head(), df_Iris.tail()):
    print(preview)
    print(separator)

# Summary of the frame: columns, non-null counts, dtypes and memory usage.
df_Iris.info()
print(separator)
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa
-----------------------------------------------------------------------------------------------
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm \
145 146 6.7 3.0 5.2 2.3
146 147 6.3 2.5 5.0 1.9
147 148 6.5 3.0 5.2 2.0
148 149 6.2 3.4 5.4 2.3
149 150 5.9 3.0 5.1 1.8
Species
145 Iris-virginica
146 Iris-virginica
147 Iris-virginica
148 Iris-virginica
149 Iris-virginica
-----------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
Id 150 non-null int64
SepalLengthCm 150 non-null float64
SepalWidthCm 150 non-null float64
PetalLengthCm 150 non-null float64
PetalWidthCm 150 non-null float64
Species 150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
-----------------------------------------------------------------------------------------------
2.描述性統計
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns.
df_Iris.describe()
 | Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm |
---|---|---|---|---|---|
count | 150.000000 | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
mean | 75.500000 | 5.843333 | 3.054000 | 3.758667 | 1.198667 |
std | 43.445368 | 0.828066 | 0.433594 | 1.764420 | 0.763161 |
min | 1.000000 | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
25% | 38.250000 | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
50% | 75.500000 | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
75% | 112.750000 | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
max | 150.000000 | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
3.特徵計數
# Count how many rows belong to each species.
df_Iris.Species.value_counts()
Iris-virginica 50
Iris-versicolor 50
Iris-setosa 50
Name: Species, dtype: int64
三.特徵工程
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default theme.
sns.set()
# Drop the "Iris-" prefix from the species labels ('Iris-setosa' -> 'setosa').
# Taking the last piece of the split, rather than index 1, keeps this cell
# safe to re-run: an already-stripped label such as 'setosa' contains no '-'
# and index 1 would raise IndexError.
df_Iris['Species'] = df_Iris.Species.apply(lambda x: x.split('-')[-1])
df_Iris.Species.unique()
array(['setosa', 'versicolor', 'virginica'], dtype=object)
1.繪製花萼的長度與寬度的散點圖
# Sepal length against sepal width, with hue and marker style set by species.
sepal_axes = dict(x='SepalLengthCm', y='SepalWidthCm')
sns.relplot(data=df_Iris, hue='Species', style='Species', **sepal_axes)
plt.title('SepalLengthCm and SepalWidthCm data by Species')
Text(0.5, 1, 'SepalLengthCm and SepalWidthCm data by Species')
2.繪製花瓣的長度與寬度的散點圖
# Petal length against petal width, with hue and marker style set by species.
petal_axes = dict(x='PetalLengthCm', y='PetalWidthCm')
sns.relplot(data=df_Iris, hue='Species', style='Species', **petal_axes)
plt.title('PetalLengthCm and PetalWidthCm data by Species')
Text(0.5, 1, 'PetalLengthCm and PetalWidthCm data by Species')
3.關係圖
# One line plot per measurement, drawn against the sample Id and split by
# species: sepal length, sepal width, petal length, petal width.
for measurement in ("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"):
    sns.relplot(
        data=df_Iris,
        x="Id",
        y=measurement,
        hue="Species",
        style="Species",
        kind="line",
    )
    plt.title(measurement + ' and Id data analysize')
Text(0.5, 1, 'PetalWidthCm and Id data analysize')
4.散點圖與直方圖
# Joint plot of sepal length against sepal width.
sns.jointplot(x='SepalLengthCm', y='SepalWidthCm', data=df_Iris)
# Joint plot of petal length against petal width.
sns.jointplot(x='PetalLengthCm', y='PetalWidthCm', data=df_Iris)
<seaborn.axisgrid.JointGrid at 0x2758dd42ec8>
5.箱線圖
箱線圖能顯示出一組數據的最大值, 最小值, 四分位數以及異常點
# For example, a box plot of the SepalLengthCm column.
sns.boxplot(x='SepalLengthCm', data=df_Iris)
<matplotlib.axes._subplots.AxesSubplot at 0x2758da0d1c8>
import numpy as np

# Reshape the four measurement columns into one long-form table with the
# columns Id / Attribute / Data / Species, one row per (sample, attribute).
attributes = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
n_rows = len(df_Iris)

# The per-attribute frames are built inside the comprehension instead of
# being bound to Iris1..Iris4, so this cell does not overwrite the Iris1..Iris3
# arrays used by the Fisher classifier.  Ids continue across attributes
# (1..n, n+1..2n, ...).  ignore_index=True gives the combined frame a unique
# 0..4n-1 index instead of the source index repeated four times.
Iris = pd.concat(
    [
        pd.DataFrame({
            "Id": np.arange(k * n_rows + 1, (k + 1) * n_rows + 1),
            'Attribute': name,
            'Data': df_Iris[name],
            'Species': df_Iris.Species,
        })
        for k, name in enumerate(attributes)
    ],
    ignore_index=True,
)

# Box plot of the values of each attribute.
sns.boxplot(x='Attribute', y='Data', data=Iris)
<matplotlib.axes._subplots.AxesSubplot at 0x2758ddaed48>
# Same box plot, with each attribute further split by species.
sns.boxplot(x='Attribute', y='Data',hue='Species', data=Iris)
<matplotlib.axes._subplots.AxesSubplot at 0x2758de44448>
6.繪製小提琴圖
# Violin plot of each attribute, split by species.
sns.violinplot(x='Attribute', y='Data', hue='Species', data=Iris )
<matplotlib.axes._subplots.AxesSubplot at 0x2758df9a688>
7.繪製分佈圖
# Pairwise plots of the measurement columns (Id dropped), coloured by species.
sns.pairplot(df_Iris.drop('Id', axis=1), hue='Species')
# Save the figure to disk before showing it.
plt.savefig('pairplot.png')
plt.show()