StumbleuponAnalysis -- Binary Classification with Logistic Regression

Binary classification with logistic regression in Spark MLlib

Training the model

Import the required packages

import numpy as np
import pyspark
from matplotlib import pyplot as plt
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics

Demonstrating a regression line

x = np.linspace(0,1000,100)
y_t = x*4 + 5
y_r = y_t + np.random.randint(-1000,1000,100)
plt.plot(x,y_t,ls="-",c="r")
plt.scatter(x,y_r)

[Figure: noisy scatter points around the true line y = 4x + 5 (red)]
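The straight line above is ordinary linear regression; logistic regression passes such a linear score through a sigmoid so the output can be read as a probability for binary classification. A minimal sketch of that mapping (my own illustration, not part of the original notebook):

z = np.linspace(-10,10,200)
sigmoid = 1.0/(1.0 + np.exp(-z))     # logistic function: squashes any score into (0,1)
plt.plot(z,sigmoid,c="b")
plt.axhline(0.5,ls="--",c="gray")    # 0.5 is the usual decision threshold
plt.xlabel("linear score")
plt.ylabel("P(label = 1)")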

Initialize the Spark context

sc = pyspark.SparkContext(master="local[*]",appName="StumbleuponAnalysis")

Prepare the data

def extract_features(fields,categories_dict,end):
    # Look up the category id for this row's category field
    category_id = categories_dict[fields[3]]
    # Initialize the one-hot category feature vector with all zeros
    category_features = np.zeros(len(categories_dict))
    # Set the position of the category id to 1
    category_features[category_id] = 1
    # Build the numeric feature list, treating "?" as 0.0
    numerical_features = [0.0 if f=="?" else float(f) for f in fields[4:end]]
    # Concatenate the two feature sets and return
    return np.concatenate((category_features,numerical_features))
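A minimal sketch of what extract_features returns, using a made-up row and a two-entry category dictionary (these field values are hypothetical, not from the dataset): the category becomes a one-hot vector and "?" in the numeric fields becomes 0.0.

toy_fields = ["url","urlid","boilerplate","business","0.5","?","2.0","1"]
toy_dict = {"business":0,"recreation":1}
print(extract_features(toy_fields,toy_dict,len(toy_fields)-1))
# roughly: [1. 0. 0.5 0. 2.]  -> one-hot [1,0] followed by the numeric fields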

def prepare_data(sc,scale):
    # Read the file
    raw_lines_and_header = sc.textFile("file:/home/zh123/.jupyter/workspace/stumbleupon/train.tsv")
    # Grab the header line
    header_line = raw_lines_and_header.first()
    # Drop the header from the data
    raw_non_header_data = raw_lines_and_header.filter(lambda l:l!=header_line)
    # Remove quotation marks
    raw_non_quot_lines = raw_non_header_data.map(lambda l:l.replace("\"",""))
    # Split each line into fields on "\t"
    raw_data = raw_non_quot_lines.map(lambda l:l.split("\t"))
    print("Data length:",raw_data.count())
    # Category dictionary mapping each text category to a numeric id
    categories_dict = raw_data.map(lambda field:field[3]).distinct().zipWithIndex().collectAsMap()
    # Labels
    label_rdd = raw_data.map(lambda fields:float(fields[-1]))
    # Features
    feature_rdd = raw_data.map(lambda fields:extract_features(fields,categories_dict,len(fields)-1))
    #============================vvvv Standardize the feature data vvvv============================================
    # Fit a standard scaler
    std_scaler = StandardScaler(withMean=True,withStd=True).fit(feature_rdd)
    # Transform the raw features into standardized features
    scaler_features = std_scaler.transform(feature_rdd)
    # Zip into (label, features) pairs
    label_point = label_rdd.zip(scaler_features)
    # Build LabeledPoints
    label_point_rdd = label_point.map(lambda r:LabeledPoint(r[0],r[1]))
    # Return the (training, validation, test) splits in the given proportions, plus the category dictionary
    return label_point_rdd.randomSplit(scale),categories_dict
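Note that randomSplit normalizes its weights, so scale=[8,1,1] gives roughly an 80/10/10 split; the exact sizes vary from run to run because rows are assigned at random. A quick sketch with a throwaway RDD:

parts = sc.parallelize(range(1000)).randomSplit([8,1,1])
print([p.count() for p in parts])    # roughly [800, 100, 100]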

Model evaluation

Define a function that evaluates a model's AUC

def evaluate_model(model,validation_data):
    # Run the model over the validation features to produce predictions
    predict = model.predict(validation_data.map(lambda p:p.features)).map(lambda x:float(x))
    # Zip predictions with labels into (prediction, label) pairs
    predict_and_label = predict.zip(validation_data.map(lambda p:p.label))
    # Initialize the binary classification metrics
    metrics = BinaryClassificationMetrics(predict_and_label)
    # Return the AUC, the area under the ROC curve
    return metrics.areaUnderROC
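BinaryClassificationMetrics expects an RDD of (score, label) pairs. A minimal sketch with hand-written scores (purely illustrative) shows the AUC it reports; note that model.predict above yields hard 0/1 labels rather than probabilities, so the AUC computed there is based on those hard predictions.

toy_scores = sc.parallelize([(0.9,1.0),(0.7,1.0),(0.6,0.0),(0.4,0.0),(0.2,0.0)])
print(BinaryClassificationMetrics(toy_scores).areaUnderROC)   # 1.0: every positive is ranked above every negative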

Define a combined train-and-evaluate function

import time
def train_evaluate_model(train_data,validation_data,numIterations,stepSize,miniBatchFraction):
    # Record the training start time
    start_time = time.time()
    # Train the model
    model = LogisticRegressionWithSGD.train(train_data,numIterations,stepSize,miniBatchFraction)
    # Training duration
    duration = time.time() - start_time
    # Compute the model's AUC
    AUC = evaluate_model(model,validation_data)
    return (model,AUC,duration,numIterations,stepSize,miniBatchFraction)
# (train_data,validation_data,test_data),categories_dict = prepare_data(sc,scale=[8,1,1])
# train_evaluate_model(train_data,test_data,1,100,0.7)

Define a function that evaluates hyperparameters

import pandas as pd
def evaluate_parameter(train_data,validation_data,numIterationsList,stepSizeList,miniBatchFractionList):
    # Collected evaluation results
    metrics = []
    # Column labels
    columns = ["Model","AUC","Duration","numIterations","stepSize","miniBatchFraction"]
    for numIterations in numIterationsList:
        for stepSize in stepSizeList:
            for miniBatchFraction in miniBatchFractionList:
                # Record the result in the results list
                metrics.append(train_evaluate_model(train_data,validation_data,numIterations,stepSize,miniBatchFraction))
    # Whichever parameter list has more than one value is the variable under study; use it as the row index
    if(len(numIterationsList) > 1):
        return pd.DataFrame(metrics,index=numIterationsList,columns=columns)
    elif(len(stepSizeList) > 1):
        return pd.DataFrame(metrics,index=stepSizeList,columns=columns)
    elif(len(miniBatchFractionList)>1):
        return pd.DataFrame(metrics,index=miniBatchFractionList,columns=columns)
    else:
        # Fall back to a numeric index
        return pd.DataFrame(metrics,index=[0],columns=columns)

Get the training, validation, and test data

((train_data,validation_data,test_data),categories_dict) = prepare_data(sc,scale=[8,1,1])
# Persist all splits in memory to speed up the repeated model training below
train_data.persist()
validation_data.persist()
test_data.persist()
Data length: 7395

PythonRDD[4739] at RDD at PythonRDD.scala:52
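Caching the three splits keeps them in executor memory across the many training runs that follow. Once the experiments are done they can be released, for example:

# train_data.unpersist(); validation_data.unpersist(); test_data.unpersist()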

Evaluating the effect of the numIterations parameter

Train the models and get the evaluation table

evaluate_table = evaluate_parameter(train_data,validation_data,[i for i in range(1,50,5)],[10],[1])
evaluate_table
Model AUC Duration numIterations stepSize miniBatchFraction
1 (weights=[0.6677226910837364,-0.69951944405741... 0.664205 0.542155 1 10 1
6 (weights=[0.28810190368216665,-0.3890579409906... 0.603375 0.149749 6 10 1
11 (weights=[0.2982103093226861,-0.30009276222335... 0.637453 0.186136 11 10 1
16 (weights=[0.2590246366263148,-0.27478234116180... 0.690569 0.213902 16 10 1
21 (weights=[0.25133027462275814,-0.2542369719546... 0.696628 0.267709 21 10 1
26 (weights=[0.24840617513903634,-0.2527605271207... 0.697719 0.317076 26 10 1
31 (weights=[0.2480626698782132,-0.25281749529624... 0.693588 0.355656 31 10 1
36 (weights=[0.24788753296317756,-0.2530393653347... 0.693588 0.488446 36 10 1
41 (weights=[0.24788753296317756,-0.2530393653347... 0.693588 0.362525 41 10 1
46 (weights=[0.24788753296317756,-0.2530393653347... 0.693588 0.378403 46 10 1

Plot the results from the evaluation table

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(evaluate_table.index,evaluate_table["AUC"],color="c",tick_label=evaluate_table.index,label="AUC",width=4)
ax.set_ylim(0.6,0.7)
ax2 = ax.twinx()
ax2.plot(evaluate_table.index,evaluate_table["Duration"],c="r",label="Duration",marker="o")
ax.grid()
fig.legend(loc=1, bbox_to_anchor=(1,1), bbox_transform=ax.transAxes)

[Figure: AUC (bars) and training duration (line) vs. numIterations]

Evaluating the effect of the stepSize parameter

Train the models and get the evaluation table

evaluate_table = evaluate_parameter(train_data,validation_data,[26],[i for i in range(10,200,15)],[1])
evaluate_table
Model AUC Duration numIterations stepSize miniBatchFraction
10 (weights=[0.24840617513903634,-0.2527605271207... 0.697719 0.306683 26 10 1
25 (weights=[0.40103746760777653,-0.4924966686183... 0.591412 0.305612 26 25 1
40 (weights=[0.5409425093445586,-0.77344879343874... 0.564893 0.311465 26 40 1
55 (weights=[0.6844234097438462,-1.09699570420703... 0.559457 0.418840 26 55 1
70 (weights=[0.8379207450635585,-1.43000712772985... 0.557723 0.299107 26 70 1
85 (weights=[1.0323510305921046,-1.76105166506314... 0.571635 0.288278 26 85 1
100 (weights=[1.313234120315815,-2.091223074965485... 0.590554 0.304034 26 100 1
115 (weights=[1.5106494358271485,-2.37554034126727... 0.590554 0.288630 26 115 1
130 (weights=[1.6808460801490464,-2.64560901166279... 0.586638 0.323949 26 130 1
145 (weights=[1.846760000240688,-2.914826089181457... 0.585547 0.307586 26 145 1
160 (weights=[2.0073226982616266,-3.18046915476317... 0.581202 0.305315 26 160 1
175 (weights=[2.1580796544605683,-3.43464112632351... 0.570992 0.295500 26 175 1
190 (weights=[2.295776697917227,-3.674935300385708... 0.565770 0.337451 26 190 1

Plot the results from the evaluation table

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(evaluate_table.index,evaluate_table["AUC"],color="c",tick_label=evaluate_table.index,label="AUC",width=6)
ax.set_ylim(0.6,0.7)
ax2 = ax.twinx()
ax2.plot(evaluate_table.index,evaluate_table["Duration"],c="r",label="Duration",marker="o")
fig.legend(loc=1, bbox_to_anchor=(1,1), bbox_transform=ax.transAxes)

[Figure: AUC (bars) and training duration (line) vs. stepSize]

Evaluating the effect of the miniBatchFraction parameter

Train the models and get the evaluation table

evaluate_table = evaluate_parameter(train_data,validation_data,[26],[10],np.linspace(0.1,1,5))
evaluate_table
Model AUC Duration numIterations stepSize miniBatchFraction
0.100 (weights=[0.22432239986157868,-0.2165393087222... 0.682073 0.293671 26 10 0.100
0.325 (weights=[0.25329319340814027,-0.2708727029103... 0.702727 0.273905 26 10 0.325
0.550 (weights=[0.24474754141432709,-0.2484500877818... 0.693803 0.276777 26 10 0.550
0.775 (weights=[0.25171480871609914,-0.2515106513891... 0.702064 0.292244 26 10 0.775
1.000 (weights=[0.24840617513903634,-0.2527605271207... 0.697719 0.280513 26 10 1.000

Plot the results from the evaluation table

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(evaluate_table.index,evaluate_table["AUC"],color="c",tick_label=evaluate_table.index,label="AUC",width=0.1)
ax.set_ylim(0.6,0.75)
ax2 = ax.twinx()
ax2.plot(evaluate_table.index,evaluate_table["Duration"],c="r",label="Duration",marker="o")
fig.legend(loc=1, bbox_to_anchor=(1,1), bbox_transform=ax.transAxes)

[Figure: AUC (bars) and training duration (line) vs. miniBatchFraction]

Testing the model

Load the test set

def loadTestData(sc):
    raw_lines_and_header = sc.textFile("file:/home/zh123/.jupyter/workspace/stumbleupon/test.tsv")
    header_line = raw_lines_and_header.first()
    raw_non_header_data = raw_lines_and_header.filter(lambda l:l!=header_line)
    raw_non_quot_lines = raw_non_header_data.map(lambda l:l.replace("\"",""))
    raw_data = raw_non_quot_lines.map(lambda l:l.split("\t"))
    print("Data length:",raw_data.count())

    # Prepared the same way as the training set, except the category dictionary built earlier is reused
    # and the label column is replaced by the site URL
    web_url_rdd = raw_data.map(lambda fields:fields[0])

    feature_rdd = raw_data.map(lambda fields:extract_features(fields,categories_dict,len(fields)))
    std_scaler = StandardScaler(withMean=True,withStd=True).fit(feature_rdd)
    scaler_features = std_scaler.transform(feature_rdd)
    test_point_rdd = web_url_rdd.zip(scaler_features)

    return test_point_rdd
test_file_data = loadTestData(sc)
test_file_data.first()
Data length: 3171

('http://www.lynnskitchenadventures.com/2009/04/homemade-enchilada-sauce.html',
 DenseVector([-0.355, -0.2496, -0.7015, -0.3917, -0.1041, -0.2274, -0.21, -0.059, -0.1056, 0.0, 0.0, 2.3909, -0.2594, -0.1983, 0.1364, -0.021, -0.3888, 0.3429, -0.4867, -0.3604, -0.3208, 0.342, 0.0, 0.2093, -0.1513, -0.1, -0.0436, 0.7933, 0.7491, -0.7269, -0.2042, -0.0052, -0.2303, -0.5689, 0.406, -0.2558]))
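One caveat: loadTestData fits a fresh StandardScaler on the test features, whereas strictly the scaler fitted on the training features should be reused so both sets are transformed on the same scale. A sketch of the idea, assuming prepare_data were changed to also return its std_scaler (the version above does not):

# scaler_features = train_std_scaler.transform(feature_rdd)   # reuse the training scaler instead of refitting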

Select the final model

model = evaluate_table[evaluate_table.AUC == evaluate_table.AUC.max()].Model.values[0]
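An equivalent, arguably clearer way to pull the best-AUC row from the same evaluate_table with pandas:

best_row = evaluate_table.loc[evaluate_table["AUC"].idxmax()]
model = best_row["Model"]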

Use the model to make predictions

# Randomly sample 10 rows from the test data
for f in test_file_data.randomSplit([10,3171-10])[0].collect():
    # Print the site URL and the predicted result
    print(f[0],bool(model.predict(f[1])))
http://www.youbeauty.com/body-fitness/dressing-for-your-body-type?page=2 False
http://www.couponingncooking.com/2012/03/super-easy-whole-chicken-in-crock-pot.html True
http://www.rsvlts.com/2012/08/04/inside-the-london-olympics-week-one-62-high-quality-photos/ False
http://backtoherroots.com/2011/08/04/90-second-nutella-chocolate-cake/ True
http://cathlincooks.blogspot.com/ True
http://www.cheapcooking.com/articles/healthy-school-lunch-ideas.htm True
http://www.ted.com/index.php/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen.html False
http://www.break.com/index/hot-girls-risky-business-fail.html True
http://www.salon.com/2010/04/03/toasted_peeps_brulee_recipe/ True
http://www.joepastry.com/category/pastry/charlotte/ True
http://www.behance.net/leon_farrant/frame/2878481 True
http://www.wimp.com/pageturner/ False
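To keep every test prediction rather than a random sample of 10, a minimal sketch (the output path is hypothetical) maps the whole RDD and writes it out:

# predictions = test_file_data.map(lambda f:(f[0],int(model.predict(f[1]))))
# predictions.saveAsTextFile("file:/home/zh123/.jupyter/workspace/stumbleupon/predictions")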