StumbleuponAnalysis -- Binary Classification with Logistic Regression

Binary classification with logistic regression in Spark MLlib

Training the model

Import the required packages

import numpy as np
import pyspark
from matplotlib import pyplot as plt
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics

Demonstrating a regression line

x = np.linspace(0,1000,100)
y_t = x*4 + 5
y_r = y_t + np.random.randint(-1000,1000,100)
plt.plot(x,y_t,ls="-",c="r")
plt.scatter(x,y_r)

[Figure: noisy scatter points around the true line y = 4x + 5 (red)]
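The straight line above is ordinary linear regression; logistic regression passes such a linear score through a sigmoid so the output can be read as a probability for binary classification. A minimal sketch of that mapping (my own illustration, not part of the original notebook):

z = np.linspace(-10,10,200)
sigmoid = 1.0/(1.0 + np.exp(-z))     # logistic function: squashes any score into (0,1)
plt.plot(z,sigmoid,c="b")
plt.axhline(0.5,ls="--",c="gray")    # 0.5 is the usual decision threshold
plt.xlabel("linear score")
plt.ylabel("P(label = 1)")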

Initialize the Spark context

sc = pyspark.SparkContext(master="local[*]",appName="StumbleuponAnalysis")

Prepare the data

def extract_features(fields,categories_dict,end):
    # Look up the category id for this row's category field
    category_id = categories_dict[fields[3]]
    # Initialize the one-hot category feature vector with all zeros
    category_features = np.zeros(len(categories_dict))
    # Set the position of the category id to 1
    category_features[category_id] = 1
    # Build the numeric feature list, treating "?" as 0.0
    numerical_features = [0.0 if f=="?" else float(f) for f in fields[4:end]]
    # Concatenate the two feature sets and return
    return np.concatenate((category_features,numerical_features))
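A minimal sketch of what extract_features returns, using a made-up row and a two-entry category dictionary (these field values are hypothetical, not from the dataset): the category becomes a one-hot vector and "?" in the numeric fields becomes 0.0.

toy_fields = ["url","urlid","boilerplate","business","0.5","?","2.0","1"]
toy_dict = {"business":0,"recreation":1}
print(extract_features(toy_fields,toy_dict,len(toy_fields)-1))
# roughly: [1. 0. 0.5 0. 2.]  -> one-hot [1,0] followed by the numeric fields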

def prepare_data(sc,scale):
    # Read the file
    raw_lines_and_header = sc.textFile("file:/home/zh123/.jupyter/workspace/stumbleupon/train.tsv")
    # Grab the header line
    header_line = raw_lines_and_header.first()
    # Drop the header from the data
    raw_non_header_data = raw_lines_and_header.filter(lambda l:l!=header_line)
    # Remove quotation marks
    raw_non_quot_lines = raw_non_header_data.map(lambda l:l.replace("\"",""))
    # Split each line into fields on "\t"
    raw_data = raw_non_quot_lines.map(lambda l:l.split("\t"))
    print("Data length:",raw_data.count())
    # Category dictionary mapping each text category to a numeric id
    categories_dict = raw_data.map(lambda field:field[3]).distinct().zipWithIndex().collectAsMap()
    # Labels
    label_rdd = raw_data.map(lambda fields:float(fields[-1]))
    # Features
    feature_rdd = raw_data.map(lambda fields:extract_features(fields,categories_dict,len(fields)-1))
    #============================vvvv Standardize the feature data vvvv============================================
    # Fit a standard scaler
    std_scaler = StandardScaler(withMean=True,withStd=True).fit(feature_rdd)
    # Transform the raw features into standardized features
    scaler_features = std_scaler.transform(feature_rdd)
    # Zip into (label, features) pairs
    label_point = label_rdd.zip(scaler_features)
    # Build LabeledPoints
    label_point_rdd = label_point.map(lambda r:LabeledPoint(r[0],r[1]))
    # Return the (training, validation, test) splits in the given proportions, plus the category dictionary
    return label_point_rdd.randomSplit(scale),categories_dict
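Note that randomSplit normalizes its weights, so scale=[8,1,1] gives roughly an 80/10/10 split; the exact sizes vary from run to run because rows are assigned at random. A quick sketch with a throwaway RDD:

parts = sc.parallelize(range(1000)).randomSplit([8,1,1])
print([p.count() for p in parts])    # roughly [800, 100, 100]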

Model evaluation

Define a function that evaluates a model's AUC

def evaluate_model(model,validation_data):
    # Run the model over the validation features to produce predictions
    predict = model.predict(validation_data.map(lambda p:p.features)).map(lambda x:float(x))
    # Zip predictions with labels into (prediction, label) pairs
    predict_and_label = predict.zip(validation_data.map(lambda p:p.label))
    # Initialize the binary classification metrics
    metrics = BinaryClassificationMetrics(predict_and_label)
    # Return the AUC, the area under the ROC curve
    return metrics.areaUnderROC
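BinaryClassificationMetrics expects an RDD of (score, label) pairs. A minimal sketch with hand-written scores (purely illustrative) shows the AUC it reports; note that model.predict above yields hard 0/1 labels rather than probabilities, so the AUC computed there is based on those hard predictions.

toy_scores = sc.parallelize([(0.9,1.0),(0.7,1.0),(0.6,0.0),(0.4,0.0),(0.2,0.0)])
print(BinaryClassificationMetrics(toy_scores).areaUnderROC)   # 1.0: every positive is ranked above every negative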

Define a combined train-and-evaluate function

import time
def train_evaluate_model(train_data,validation_data,numIterations,stepSize,miniBatchFraction):
    # Record the training start time
    start_time = time.time()
    # Train the model
    model = LogisticRegressionWithSGD.train(train_data,numIterations,stepSize,miniBatchFraction)
    # Training duration
    duration = time.time() - start_time
    # Compute the model's AUC
    AUC = evaluate_model(model,validation_data)
    return (model,AUC,duration,numIterations,stepSize,miniBatchFraction)
# (train_data,validation_data,test_data),categories_dict = prepare_data(sc,scale=[8,1,1])
# train_evaluate_model(train_data,test_data,1,100,0.7)

Define a function that evaluates hyperparameters

import pandas as pd
def evaluate_parameter(train_data,validation_data,numIterationsList,stepSizeList,miniBatchFractionList):
    # Collected evaluation results
    metrics = []
    # Column labels
    columns = ["Model","AUC","Duration","numIterations","stepSize","miniBatchFraction"]
    for numIterations in numIterationsList:
        for stepSize in stepSizeList:
            for miniBatchFraction in miniBatchFractionList:
                # Record the result in the results list
                metrics.append(train_evaluate_model(train_data,validation_data,numIterations,stepSize,miniBatchFraction))
    # Whichever parameter list has more than one value is the variable under study; use it as the row index
    if(len(numIterationsList) > 1):
        return pd.DataFrame(metrics,index=numIterationsList,columns=columns)
    elif(len(stepSizeList) > 1):
        return pd.DataFrame(metrics,index=stepSizeList,columns=columns)
    elif(len(miniBatchFractionList)>1):
        return pd.DataFrame(metrics,index=miniBatchFractionList,columns=columns)
    else:
        # Fall back to a numeric index
        return pd.DataFrame(metrics,index=[0],columns=columns)

Get the training, validation, and test data

((train_data,validation_data,test_data),categories_dict) = prepare_data(sc,scale=[8,1,1])
# Persist all splits in memory to speed up the repeated model training below
train_data.persist()
validation_data.persist()
test_data.persist()
Data length: 7395

PythonRDD[4739] at RDD at PythonRDD.scala:52
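Caching the three splits keeps them in executor memory across the many training runs that follow. Once the experiments are done they can be released, for example:

# train_data.unpersist(); validation_data.unpersist(); test_data.unpersist()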

Evaluating the effect of the numIterations parameter

Train the models and get the evaluation table

evaluate_table = evaluate_parameter(train_data,validation_data,[i for i in range(1,50,5)],[10],[1])
evaluate_table
Model AUC Duration numIterations stepSize miniBatchFraction
1 (weights=[0.6677226910837364,-0.69951944405741... 0.664205 0.542155 1 10 1
6 (weights=[0.28810190368216665,-0.3890579409906... 0.603375 0.149749 6 10 1
11 (weights=[0.2982103093226861,-0.30009276222335... 0.637453 0.186136 11 10 1
16 (weights=[0.2590246366263148,-0.27478234116180... 0.690569 0.213902 16 10 1
21 (weights=[0.25133027462275814,-0.2542369719546... 0.696628 0.267709 21 10 1
26 (weights=[0.24840617513903634,-0.2527605271207... 0.697719 0.317076 26 10 1
31 (weights=[0.2480626698782132,-0.25281749529624... 0.693588 0.355656 31 10 1
36 (weights=[0.24788753296317756,-0.2530393653347... 0.693588 0.488446 36 10 1
41 (weights=[0.24788753296317756,-0.2530393653347... 0.693588 0.362525 41 10 1
46 (weights=[0.24788753296317756,-0.2530393653347... 0.693588 0.378403 46 10 1

Plot the results from the evaluation table

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(evaluate_table.index,evaluate_table["AUC"],color="c",tick_label=evaluate_table.index,label="AUC",width=4)
ax.set_ylim(0.6,0.7)
ax2 = ax.twinx()
ax2.plot(evaluate_table.index,evaluate_table["Duration"],c="r",label="Duration",marker="o")
ax.grid()
fig.legend(loc=1, bbox_to_anchor=(1,1), bbox_transform=ax.transAxes)

[Figure: AUC (bars) and training duration (line) vs. numIterations]

Evaluating the effect of the stepSize parameter

Train the models and get the evaluation table

evaluate_table = evaluate_parameter(train_data,validation_data,[26],[i for i in range(10,200,15)],[1])
evaluate_table
Model AUC Duration numIterations stepSize miniBatchFraction
10 (weights=[0.24840617513903634,-0.2527605271207... 0.697719 0.306683 26 10 1
25 (weights=[0.40103746760777653,-0.4924966686183... 0.591412 0.305612 26 25 1
40 (weights=[0.5409425093445586,-0.77344879343874... 0.564893 0.311465 26 40 1
55 (weights=[0.6844234097438462,-1.09699570420703... 0.559457 0.418840 26 55 1
70 (weights=[0.8379207450635585,-1.43000712772985... 0.557723 0.299107 26 70 1
85 (weights=[1.0323510305921046,-1.76105166506314... 0.571635 0.288278 26 85 1
100 (weights=[1.313234120315815,-2.091223074965485... 0.590554 0.304034 26 100 1
115 (weights=[1.5106494358271485,-2.37554034126727... 0.590554 0.288630 26 115 1
130 (weights=[1.6808460801490464,-2.64560901166279... 0.586638 0.323949 26 130 1
145 (weights=[1.846760000240688,-2.914826089181457... 0.585547 0.307586 26 145 1
160 (weights=[2.0073226982616266,-3.18046915476317... 0.581202 0.305315 26 160 1
175 (weights=[2.1580796544605683,-3.43464112632351... 0.570992 0.295500 26 175 1
190 (weights=[2.295776697917227,-3.674935300385708... 0.565770 0.337451 26 190 1

Plot the results from the evaluation table

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(evaluate_table.index,evaluate_table["AUC"],color="c",tick_label=evaluate_table.index,label="AUC",width=6)
ax.set_ylim(0.6,0.7)
ax2 = ax.twinx()
ax2.plot(evaluate_table.index,evaluate_table["Duration"],c="r",label="Duration",marker="o")
fig.legend(loc=1, bbox_to_anchor=(1,1), bbox_transform=ax.transAxes)

[Figure: AUC (bars) and training duration (line) vs. stepSize]

Evaluating the effect of the miniBatchFraction parameter

Train the models and get the evaluation table

evaluate_table = evaluate_parameter(train_data,validation_data,[26],[10],np.linspace(0.1,1,5))
evaluate_table
Model AUC Duration numIterations stepSize miniBatchFraction
0.100 (weights=[0.22432239986157868,-0.2165393087222... 0.682073 0.293671 26 10 0.100
0.325 (weights=[0.25329319340814027,-0.2708727029103... 0.702727 0.273905 26 10 0.325
0.550 (weights=[0.24474754141432709,-0.2484500877818... 0.693803 0.276777 26 10 0.550
0.775 (weights=[0.25171480871609914,-0.2515106513891... 0.702064 0.292244 26 10 0.775
1.000 (weights=[0.24840617513903634,-0.2527605271207... 0.697719 0.280513 26 10 1.000

Plot the results from the evaluation table

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(evaluate_table.index,evaluate_table["AUC"],color="c",tick_label=evaluate_table.index,label="AUC",width=0.1)
ax.set_ylim(0.6,0.75)
ax2 = ax.twinx()
ax2.plot(evaluate_table.index,evaluate_table["Duration"],c="r",label="Duration",marker="o")
fig.legend(loc=1, bbox_to_anchor=(1,1), bbox_transform=ax.transAxes)

[Figure: AUC (bars) and training duration (line) vs. miniBatchFraction]

Testing the model

Load the test set

def loadTestData(sc):
    raw_lines_and_header = sc.textFile("file:/home/zh123/.jupyter/workspace/stumbleupon/test.tsv")
    header_line = raw_lines_and_header.first()
    raw_non_header_data = raw_lines_and_header.filter(lambda l:l!=header_line)
    raw_non_quot_lines = raw_non_header_data.map(lambda l:l.replace("\"",""))
    raw_data = raw_non_quot_lines.map(lambda l:l.split("\t"))
    print("Data length:",raw_data.count())

    # Prepared the same way as the training set, except the category dictionary built earlier is reused
    # and the label column is replaced by the site URL
    web_url_rdd = raw_data.map(lambda fields:fields[0])

    feature_rdd = raw_data.map(lambda fields:extract_features(fields,categories_dict,len(fields)))
    std_scaler = StandardScaler(withMean=True,withStd=True).fit(feature_rdd)
    scaler_features = std_scaler.transform(feature_rdd)
    test_point_rdd = web_url_rdd.zip(scaler_features)

    return test_point_rdd
test_file_data = loadTestData(sc)
test_file_data.first()
Data length: 3171

('http://www.lynnskitchenadventures.com/2009/04/homemade-enchilada-sauce.html',
 DenseVector([-0.355, -0.2496, -0.7015, -0.3917, -0.1041, -0.2274, -0.21, -0.059, -0.1056, 0.0, 0.0, 2.3909, -0.2594, -0.1983, 0.1364, -0.021, -0.3888, 0.3429, -0.4867, -0.3604, -0.3208, 0.342, 0.0, 0.2093, -0.1513, -0.1, -0.0436, 0.7933, 0.7491, -0.7269, -0.2042, -0.0052, -0.2303, -0.5689, 0.406, -0.2558]))
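One caveat: loadTestData fits a fresh StandardScaler on the test features, whereas strictly the scaler fitted on the training features should be reused so both sets are transformed on the same scale. A sketch of the idea, assuming prepare_data were changed to also return its std_scaler (the version above does not):

# scaler_features = train_std_scaler.transform(feature_rdd)   # reuse the training scaler instead of refitting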

Select the final model

model = evaluate_table[evaluate_table.AUC == evaluate_table.AUC.max()].Model.values[0]
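An equivalent, arguably clearer way to pull the best-AUC row from the same evaluate_table with pandas:

best_row = evaluate_table.loc[evaluate_table["AUC"].idxmax()]
model = best_row["Model"]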

Use the model to make predictions

# Randomly sample 10 rows from the test data
for f in test_file_data.randomSplit([10,3171-10])[0].collect():
    # Print the site URL and the predicted result
    print(f[0],bool(model.predict(f[1])))
http://www.youbeauty.com/body-fitness/dressing-for-your-body-type?page=2 False
http://www.couponingncooking.com/2012/03/super-easy-whole-chicken-in-crock-pot.html True
http://www.rsvlts.com/2012/08/04/inside-the-london-olympics-week-one-62-high-quality-photos/ False
http://backtoherroots.com/2011/08/04/90-second-nutella-chocolate-cake/ True
http://cathlincooks.blogspot.com/ True
http://www.cheapcooking.com/articles/healthy-school-lunch-ideas.htm True
http://www.ted.com/index.php/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen.html False
http://www.break.com/index/hot-girls-risky-business-fail.html True
http://www.salon.com/2010/04/03/toasted_peeps_brulee_recipe/ True
http://www.joepastry.com/category/pastry/charlotte/ True
http://www.behance.net/leon_farrant/frame/2878481 True
http://www.wimp.com/pageturner/ False
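To keep every test prediction rather than a random sample of 10, a minimal sketch (the output path is hypothetical) maps the whole RDD and writes it out:

# predictions = test_file_data.map(lambda f:(f[0],int(model.predict(f[1]))))
# predictions.saveAsTextFile("file:/home/zh123/.jupyter/workspace/stumbleupon/predictions")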