Python機器學習庫:scikit-learn
1、特性:
簡單高效的數據挖掘和機器學習分析
對所有用戶開放,可依不同需求在多種場景中高度重用
基於Numpy,Scipy和matplotlib
開源,商用級別:獲得BSD許可
2、覆蓋問題領域:
分類(Classification),迴歸(Regression),聚類(Clustering),降維(dimensionality reduction)
模型選擇(model selection),預處理(preprocessing)
3、使用scikit-learn
安裝scikit-learn:pip,easy_install,Windows installer
安裝必需的package:NumPy,SciPy和matplotlib,可使用Anaconda(包含NumPy,SciPy等科學計算常用package)
安裝注意問題:python解釋器版本(2.7 or 3.4?),32-bit or 64-bit系統
安裝Graphviz(http://www.graphviz.org/)並配置環境變量後,可將dot文件轉化為pdf以可視化決策樹,例如:dot -Tpdf allElectronicInformationGainOri.dot -o tree.pdf
決策樹(decision tree)應用:DataSets.csv數據集
from sklearn.feature_extraction import DictVectorizer #feature特徵 extraction提取 dict字典 vectorizer向量化程序
import csv #csv程序
from sklearn import preprocessing #preprocessing預處理
from sklearn import tree #tree樹
from sklearn.externals.six import StringIO #external外部的 string字符串
# Load the training table from CSV.
# Column layout used below: column 0 is skipped (presumably a record id --
# confirm against the data file), the last column is the class label, and
# every column in between becomes a named feature.
# NOTE: the path is kept byte-for-byte; with the raw-string prefix the doubled
# backslashes are literal characters in the path.
# The file is opened in text mode with newline='' (the form the Python 3 csv
# module expects) and inside a `with` block so the handle is always closed.
with open(r'E:\\data\\DataSets.csv', newline='') as allElectronicsData:
    reader = csv.reader(allElectronicsData)
    headers = next(reader)  # first line holds the column names
    print(headers)

    featureList = []  # one {column name: raw value} dict per data row
    labelList = []    # class label of each data row, in the same order
    for row in reader:
        if not row:
            # Blank lines in the file come back as empty rows; skip them
            # instead of failing on the label lookup.
            continue
        labelList.append(row[-1])
        rowDict = dict(zip(headers[1:-1], row[1:-1]))
        featureList.append(rowDict)
print(featureList)
# One-hot encode the categorical feature dicts into a dense numeric matrix.
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print("dummyX:" + str(dummyX))

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one so the script keeps working on older installations.
if hasattr(vec, "get_feature_names_out"):
    print(list(vec.get_feature_names_out()))
else:
    print(vec.get_feature_names())
print("labelList:" + str(labelList))

# Encode the class labels as a binary indicator array for the classifier.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY:" + str(dummyY))
# Fit a decision tree that chooses splits by information gain (entropy).
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("clf:" + str(clf))

# Resolve the encoded column names once. get_feature_names() is gone from
# newer scikit-learn releases, so prefer get_feature_names_out() when present.
# A plain list is passed on because it is accepted by every release.
if hasattr(vec, "get_feature_names_out"):
    featureNames = list(vec.get_feature_names_out())
else:
    featureNames = vec.get_feature_names()

# Write the tree in Graphviz dot format. The return value of export_graphviz
# is not needed when writing to a file, so the open handle `f` is no longer
# rebound inside its own `with` block.
with open("allElectronicInformationGainOri.dot", 'w') as f:
    tree.export_graphviz(clf, feature_names=featureNames, out_file=f)
# Take the first encoded training row as a template for a new sample.
oneRowX = dummyX[0, :]
print("oneRowX:" + str(oneRowX))

# Work on a copy: dummyX[0, :] is a view, so editing it in place would also
# overwrite the first row of the training matrix (and oneRowX itself).
newRowX = oneRowX.copy()
newRowX[0] = 1
newRowX[2] = 0
print("newRowX:" + str(newRowX))

# predict() expects a 2-D array of shape (n_samples, n_features); reshape the
# single sample into one row before passing it in.
predictedY = clf.predict(newRowX.reshape(1, -1))
print("predictedY:" + str(predictedY))