Python機器學習及實戰-Python基礎綜合實踐

# Load the training and test sets from CSV files in the working directory.
import pandas as pd
df_train = pd.read_csv('breast-cancer-train.csv')
df_test = pd.read_csv('breast-cancer-test.csv')

# Debug aid: uncomment to inspect the loaded frames.
#print(df_train.info())
#print(df_test.info())

# Split the test set by the 'Type' label, keeping only the two feature
# columns that the figures below plot. Rows and columns are selected in a
# single .loc call instead of the chained form .loc[rows][cols], which
# builds an intermediate frame before picking the columns.
feature_columns = ['Clump Thickness', 'Cell Size']
df_test_negative = df_test.loc[df_test['Type'] == 0, feature_columns]  # rows with Type == 0
df_test_positive = df_test.loc[df_test['Type'] == 1, feature_columns]  # rows with Type == 1

# Debug aid: uncomment to inspect the two subsets.
#print(df_test_negative)
#print(df_test_positive)

# Figure 1: scatter plot of the negative test samples only.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(
    df_test_negative['Clump Thickness'],
    df_test_negative['Cell Size'],
    marker='o',
    s=200,
    c='red',
)
ax.set_xlabel('Clump Thickness')
ax.set_ylabel('Cell Size')
plt.show()

# Figure 2: a straight line whose parameters are drawn at random.
import numpy as np
intercept = np.random.random(size=1)  # one random bias term
coef = np.random.random(size=2)       # one random weight per feature
lx = np.arange(12)                    # x coordinates 0..11; the later figures reuse lx

# Solve intercept + coef[0] * x + coef[1] * y = 0 for y at every x in lx.
numerator = -intercept - lx * coef[0]
ly = numerator / coef[1]

plt.plot(lx, ly, color='yellow')
plt.show()

# Figure 3: negative and positive test samples on the same axes.
# Each entry is (samples, marker, marker size, colour); negatives are drawn first.
for samples, marker_shape, marker_size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker_shape,
        s=marker_size,
        c=colour,
    )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()

# Figure 4: linear classifier trained on only the first ten training rows.
from sklearn.linear_model import LogisticRegression
features = ['Clump Thickness', 'Cell Size']
train_head = df_train.head(10)  # the first ten rows of the training set

lr = LogisticRegression()
lr.fit(train_head[features], train_head['Type'])
# Debug aid: uncomment to inspect the ten rows used for fitting.
#print(train_head[features])
#print(train_head['Type'])
accuracy = lr.score(df_test[features], df_test['Type'])
print('Testing accuracy (10 training samples):', accuracy)

# Decision boundary: intercept + coef[0] * x + coef[1] * y = 0, solved for y.
intercept = lr.intercept_
coef = lr.coef_[0]
ly = (-intercept - lx * coef[0]) / coef[1]

plt.plot(lx, ly, color='green')
for samples, marker_shape, colour in (
    (df_test_negative, 'o', 'red'),
    (df_test_positive, 'x', 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker_shape,
        s=200,
        c=colour,
    )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()

# Figure 5: linear classifier trained on the full training set.
features = ['Clump Thickness', 'Cell Size']
train_x, train_y = df_train[features], df_train['Type']
test_x, test_y = df_test[features], df_test['Type']

lr = LogisticRegression()
lr.fit(train_x, train_y)
print('Testing accuracy (all traning samples):', lr.score(test_x, test_y))

# Decision boundary: intercept + coef[0] * x + coef[1] * y = 0, solved for y.
intercept = lr.intercept_
coef = lr.coef_[0]
ly = (-intercept - lx * coef[0]) / coef[1]

plt.plot(lx, ly, color='blue')
plt.scatter(
    df_test_negative['Clump Thickness'],
    df_test_negative['Cell Size'],
    marker='o',
    s=200,
    c='red',
)
plt.scatter(
    df_test_positive['Clump Thickness'],
    df_test_positive['Cell Size'],
    marker='x',
    s=200,
    c='black',
)
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()

運行結果:

Testing accuracy (10 training samples): 0.868571428571
Testing accuracy (all traning samples): 0.937142857143
效果圖:


所用到的訓練數據和測試數據鏈接:http://pan.baidu.com/s/1c30cDS 密碼:h8a0

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章