Spark MLlib 下的邏輯迴歸二元分類
訓練模型
導入必要的包
import numpy as np
import pyspark
from matplotlib import pyplot as plt
from pyspark. mllib. classification import LogisticRegressionWithSGD
from pyspark. mllib. feature import StandardScaler
from pyspark. mllib. regression import LabeledPoint
from pyspark. mllib. evaluation import BinaryClassificationMetrics
演示迴歸曲線
# Regression-curve demo: draw the exact line y = 4x + 5 in red and scatter
# the same line perturbed by uniform integer noise drawn from [-1000, 1000).
x = np.linspace(0, 1000, num=100)
y_t = 4 * x + 5
y_r = y_t + np.random.randint(-1000, 1000, size=100)
plt.plot(x, y_t, linestyle="-", color="r")
plt.scatter(x, y_r)
初始化spark的上下文對象
# Spark context shared by every cell below: local master using all cores
# ("local[*]"), application name "StumbleuponAnalysis".
sc = pyspark.SparkContext("local[*]", "StumbleuponAnalysis")
準備數據
def extract_features(fields, categories_dict, end):
    """Build the feature vector for one parsed TSV row.

    Args:
        fields: sequence of column values for the row; ``fields[3]`` is the
            category name and ``fields[4:end]`` are the numeric columns.
        categories_dict: mapping from category name to its one-hot position.
        end: exclusive end index of the numeric columns in ``fields``.

    Returns:
        A 1-D numpy array: the one-hot encoded category (length
        ``len(categories_dict)``) followed by the numeric columns as floats.

    Raises:
        KeyError: if ``fields[3]`` is not a key of ``categories_dict``.
    """
    one_hot = np.zeros(len(categories_dict))
    one_hot[categories_dict[fields[3]]] = 1
    # "?" marks a missing value in the data set; it is mapped to 0.0.
    numeric = [float(raw) if raw != "?" else 0.0 for raw in fields[4:end]]
    return np.concatenate((one_hot, numeric))
def parpare_data(sc, scale):
    """Load the training TSV and return standardized LabeledPoint splits.

    The file is read, its header line and all double quotes are removed, and
    each line is split on tabs. Column 3 is one-hot encoded as the category,
    the last column is the label, and the columns in between (from index 4)
    are the numeric features.

    Args:
        sc: SparkContext used to read the file.
        scale: list of weights passed to ``randomSplit``.

    Returns:
        A tuple ``(splits, categories_dict)`` where ``splits`` is the list of
        RDDs produced by ``randomSplit(scale)`` and ``categories_dict`` maps
        each category name to its one-hot index.

    NOTE(review): the scaler is fitted on all rows before the split, so the
    validation/test portions contribute to the scaling statistics.
    """
    lines = sc.textFile("file:/home/zh123/.jupyter/workspace/stumbleupon/train.tsv")
    header = lines.first()

    # Drop every line equal to the header, strip quotes, then tokenize.
    rows = (
        lines.filter(lambda line: line != header)
        .map(lambda line: line.replace("\"", ""))
        .map(lambda line: line.split("\t"))
    )
    print("數據長度:", rows.count())

    # Category name -> index, collected to the driver as a plain dict.
    categories_dict = rows.map(lambda row: row[3]).distinct().zipWithIndex().collectAsMap()

    labels = rows.map(lambda row: float(row[-1]))
    # The last column is the label, hence the numeric slice stops before it.
    features = rows.map(lambda row: extract_features(row, categories_dict, len(row) - 1))

    scaler_model = StandardScaler(withMean=True, withStd=True).fit(features)
    scaled_features = scaler_model.transform(features)

    labeled_points = labels.zip(scaled_features).map(lambda pair: LabeledPoint(pair[0], pair[1]))
    return labeled_points.randomSplit(scale), categories_dict
模型評估
定義評估模型AUC值的函數
def evaluate_model(model, validation_data):
    """Return the area under the ROC curve of ``model`` on ``validation_data``.

    Args:
        model: object whose ``predict`` accepts an RDD of feature vectors.
        validation_data: RDD of items exposing ``features`` and ``label``.

    Returns:
        The ``areaUnderROC`` value reported by ``BinaryClassificationMetrics``.

    NOTE(review): presumably ``predict`` yields thresholded 0/1 classes here,
    so the AUC would be based on hard predictions rather than raw scores;
    confirm against the model's threshold setting.
    """
    features = validation_data.map(lambda point: point.features)
    labels = validation_data.map(lambda point: point.label)
    predictions = model.predict(features).map(float)
    # BinaryClassificationMetrics is given (prediction, label) pairs.
    return BinaryClassificationMetrics(predictions.zip(labels)).areaUnderROC
定義綜合模型評估函數
import time
def train_evaluate_model(train_data, validation_data, numIterations, stepSize, miniBatchFraction):
    """Train one logistic-regression model and measure its AUC and training time.

    Args:
        train_data: RDD used to train the model.
        validation_data: RDD passed to ``evaluate_model`` for scoring.
        numIterations: number of SGD iterations.
        stepSize: SGD step size.
        miniBatchFraction: fraction of the data used per iteration.

    Returns:
        Tuple ``(model, AUC, duration, numIterations, stepSize,
        miniBatchFraction)``; ``duration`` is the wall-clock training time in
        seconds (evaluation time is not included).
    """
    started_at = time.time()
    model = LogisticRegressionWithSGD.train(
        train_data,
        iterations=numIterations,
        step=stepSize,
        miniBatchFraction=miniBatchFraction,
    )
    elapsed = time.time() - started_at

    auc = evaluate_model(model, validation_data)
    return model, auc, elapsed, numIterations, stepSize, miniBatchFraction
定義評估參數的函數
import pandas as pd
def evaluate_parameter(train_data, validation_data, numIterationsList, stepSizeList, miniBatchFractionList):
    """Train and score one model per parameter combination and tabulate the results.

    Every combination of the three parameter lists is trained with
    ``train_evaluate_model`` (numIterations outermost, miniBatchFraction
    innermost).

    Args:
        train_data: RDD used for training.
        validation_data: RDD used for scoring.
        numIterationsList: candidate iteration counts.
        stepSizeList: candidate step sizes.
        miniBatchFractionList: candidate mini-batch fractions.

    Returns:
        A pandas DataFrame with columns ``Model, AUC, Duration, numIterations,
        stepSize, miniBatchFraction`` and one row per combination. The index is:

        * the varying list itself when exactly one list has more than one value,
        * ``[0]`` when every list has exactly one value,
        * a MultiIndex of the varying parameter values when several lists have
          more than one value (previously the index length did not match the
          number of rows in this case and DataFrame construction failed),
        * empty when any list is empty (no model is trained).
    """
    columns = ["Model", "AUC", "Duration", "numIterations", "stepSize", "miniBatchFraction"]
    param_lists = (numIterationsList, stepSizeList, miniBatchFractionList)

    # Full Cartesian grid, in the same nesting order as the original loops.
    grid = [
        (numIterations, stepSize, miniBatchFraction)
        for numIterations in numIterationsList
        for stepSize in stepSizeList
        for miniBatchFraction in miniBatchFractionList
    ]
    if len(grid) == 0:
        return pd.DataFrame(columns=columns)

    metrics = [train_evaluate_model(train_data, validation_data, *combo) for combo in grid]

    # Positions (0=numIterations, 1=stepSize, 2=miniBatchFraction) of the
    # parameters that actually vary across the grid.
    varying = [pos for pos, values in enumerate(param_lists) if len(values) > 1]
    if len(varying) == 0:
        index = [0]
    elif len(varying) == 1:
        index = param_lists[varying[0]]
    else:
        # One index entry per row, built from the varying parameters only.
        index = pd.MultiIndex.from_tuples(
            [tuple(combo[pos] for pos in varying) for combo in grid]
        )
    return pd.DataFrame(metrics, index=index, columns=columns)
獲取訓練數據,驗證數據,測試數據
# Build the data with split weights 8:1:1 (train : validation : test) and
# keep the category -> index mapping for later feature extraction.
split_rdds, categories_dict = parpare_data(sc, scale=[8, 1, 1])
train_data, validation_data, test_data = split_rdds

# The splits are reused by every parameter sweep below, so persist them.
train_data.persist()
validation_data.persist()
test_data.persist()
數據長度: 7395
PythonRDD[4739] at RDD at PythonRDD.scala:52
評估 numIterations參數影響
訓練模型並獲取評估參數表
# Sweep numIterations over 1, 6, ..., 46 with stepSize=10 and
# miniBatchFraction=1 held fixed.
evaluate_table = evaluate_parameter(
    train_data,
    validation_data,
    numIterationsList=list(range(1, 50, 5)),
    stepSizeList=[10],
    miniBatchFractionList=[1],
)
evaluate_table
Model
AUC
Duration
numIterations
stepSize
miniBatchFraction
1
(weights=[0.6677226910837364,-0.69951944405741...
0.664205
0.542155
1
10
1
6
(weights=[0.28810190368216665,-0.3890579409906...
0.603375
0.149749
6
10
1
11
(weights=[0.2982103093226861,-0.30009276222335...
0.637453
0.186136
11
10
1
16
(weights=[0.2590246366263148,-0.27478234116180...
0.690569
0.213902
16
10
1
21
(weights=[0.25133027462275814,-0.2542369719546...
0.696628
0.267709
21
10
1
26
(weights=[0.24840617513903634,-0.2527605271207...
0.697719
0.317076
26
10
1
31
(weights=[0.2480626698782132,-0.25281749529624...
0.693588
0.355656
31
10
1
36
(weights=[0.24788753296317756,-0.2530393653347...
0.693588
0.488446
36
10
1
41
(weights=[0.24788753296317756,-0.2530393653347...
0.693588
0.362525
41
10
1
46
(weights=[0.24788753296317756,-0.2530393653347...
0.693588
0.378403
46
10
1
根據評估參數表繪製圖像
# AUC as bars on the left axis, training duration as a line on a twin
# right axis, both against the swept parameter (the table index).
fig, ax = plt.subplots()
ax.bar(
    evaluate_table.index,
    evaluate_table["AUC"],
    width=4,
    color="c",
    tick_label=evaluate_table.index,
    label="AUC",
)
ax.set_ylim(0.6, 0.7)
ax.grid()

ax2 = ax.twinx()
ax2.plot(
    evaluate_table.index,
    evaluate_table["Duration"],
    color="r",
    marker="o",
    label="Duration",
)

# Anchor the combined legend to the upper-right corner of the main axes.
fig.legend(loc="upper right", bbox_to_anchor=(1, 1), bbox_transform=ax.transAxes)
評估 stepSize 參數的影響
訓練模型並獲取評估參數表
# Sweep stepSize over 10, 25, ..., 190 with numIterations=26 and
# miniBatchFraction=1 held fixed.
evaluate_table = evaluate_parameter(
    train_data,
    validation_data,
    numIterationsList=[26],
    stepSizeList=list(range(10, 200, 15)),
    miniBatchFractionList=[1],
)
evaluate_table
Model
AUC
Duration
numIterations
stepSize
miniBatchFraction
10
(weights=[0.24840617513903634,-0.2527605271207...
0.697719
0.306683
26
10
1
25
(weights=[0.40103746760777653,-0.4924966686183...
0.591412
0.305612
26
25
1
40
(weights=[0.5409425093445586,-0.77344879343874...
0.564893
0.311465
26
40
1
55
(weights=[0.6844234097438462,-1.09699570420703...
0.559457
0.418840
26
55
1
70
(weights=[0.8379207450635585,-1.43000712772985...
0.557723
0.299107
26
70
1
85
(weights=[1.0323510305921046,-1.76105166506314...
0.571635
0.288278
26
85
1
100
(weights=[1.313234120315815,-2.091223074965485...
0.590554
0.304034
26
100
1
115
(weights=[1.5106494358271485,-2.37554034126727...
0.590554
0.288630
26
115
1
130
(weights=[1.6808460801490464,-2.64560901166279...
0.586638
0.323949
26
130
1
145
(weights=[1.846760000240688,-2.914826089181457...
0.585547
0.307586
26
145
1
160
(weights=[2.0073226982616266,-3.18046915476317...
0.581202
0.305315
26
160
1
175
(weights=[2.1580796544605683,-3.43464112632351...
0.570992
0.295500
26
175
1
190
(weights=[2.295776697917227,-3.674935300385708...
0.565770
0.337451
26
190
1
根據評估參數表繪製圖像
# AUC as bars on the left axis, training duration as a line on a twin
# right axis, both against the swept parameter (the table index).
fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(
    evaluate_table.index,
    evaluate_table["AUC"],
    color="c",
    tick_label=evaluate_table.index,
    label="AUC",
    width=6,
)
# A fixed (0.6, 0.7) window hides every bar whose AUC is below 0.6, which is
# most of this sweep. Keep that window as the minimum and widen it to the
# data with a small margin.
ax.set_ylim(
    min(0.6, evaluate_table["AUC"].min() - 0.01),
    max(0.7, evaluate_table["AUC"].max() + 0.01),
)
ax2 = ax.twinx()
ax2.plot(evaluate_table.index, evaluate_table["Duration"], c="r", label="Duration", marker="o")
# Anchor the combined legend to the upper-right corner of the main axes.
fig.legend(loc=1, bbox_to_anchor=(1, 1), bbox_transform=ax.transAxes)
評估miniBatchFraction 參數影響
訓練模型並獲取評估參數表
# Sweep miniBatchFraction over five evenly spaced values in [0.1, 1] with
# numIterations=26 and stepSize=10 held fixed.
evaluate_table = evaluate_parameter(
    train_data,
    validation_data,
    numIterationsList=[26],
    stepSizeList=[10],
    miniBatchFractionList=np.linspace(0.1, 1, num=5),
)
evaluate_table
Model
AUC
Duration
numIterations
stepSize
miniBatchFraction
0.100
(weights=[0.22432239986157868,-0.2165393087222...
0.682073
0.293671
26
10
0.100
0.325
(weights=[0.25329319340814027,-0.2708727029103...
0.702727
0.273905
26
10
0.325
0.550
(weights=[0.24474754141432709,-0.2484500877818...
0.693803
0.276777
26
10
0.550
0.775
(weights=[0.25171480871609914,-0.2515106513891...
0.702064
0.292244
26
10
0.775
1.000
(weights=[0.24840617513903634,-0.2527605271207...
0.697719
0.280513
26
10
1.000
根據評估參數表繪製圖像
# AUC as bars on the left axis, training duration as a line on a twin
# right axis, both against the swept parameter (the table index).
fig, ax = plt.subplots()
ax.bar(
    evaluate_table.index,
    evaluate_table["AUC"],
    width=0.1,
    color="c",
    tick_label=evaluate_table.index,
    label="AUC",
)
ax.set_ylim(0.6, 0.75)

ax2 = ax.twinx()
ax2.plot(
    evaluate_table.index,
    evaluate_table["Duration"],
    color="r",
    marker="o",
    label="Duration",
)

# Anchor the combined legend to the upper-right corner of the main axes.
fig.legend(loc="upper right", bbox_to_anchor=(1, 1), bbox_transform=ax.transAxes)
測試模型
導入測試集
def loadTestData(sc, scaler=None):
    """Load the test TSV and pair each URL with its standardized feature vector.

    The file is read, its header line and all double quotes are removed, and
    each line is split on tabs. Column 0 is the page URL; the feature vector
    is built by ``extract_features`` from column 3 (category) and every
    column from index 4 to the end of the row (no trailing column is
    excluded here, unlike the training loader).

    Uses the module-level ``categories_dict`` for the one-hot encoding, so it
    must be defined before this function is called.

    Args:
        sc: SparkContext used to read the file.
        scaler: optional, already fitted scaler model exposing
            ``transform(rdd)``. Pass the scaler fitted on the training
            features so that test rows are standardized with the same
            statistics the model was trained on. When omitted, a new
            ``StandardScaler`` is fitted on the test features themselves
            (the original behaviour).

    Returns:
        RDD of ``(url, scaled_feature_vector)`` pairs.
    """
    raw_lines_and_header = sc.textFile("file:/home/zh123/.jupyter/workspace/stumbleupon/test.tsv")
    header_line = raw_lines_and_header.first()
    raw_non_header_data = raw_lines_and_header.filter(lambda l: l != header_line)
    raw_non_quot_lines = raw_non_header_data.map(lambda l: l.replace("\"", ""))
    raw_data = raw_non_quot_lines.map(lambda l: l.split("\t"))
    print("數據長度:", raw_data.count())

    web_url_rdd = raw_data.map(lambda fields: fields[0])
    feature_rdd = raw_data.map(lambda fields: extract_features(fields, categories_dict, len(fields)))

    if scaler is None:
        # Fallback: standardize with statistics computed from the test set.
        scaler = StandardScaler(withMean=True, withStd=True).fit(feature_rdd)
    scaler_features = scaler.transform(feature_rdd)

    return web_url_rdd.zip(scaler_features)
# Load the unlabeled test set as (url, scaled features) pairs and show the
# first pair as a sanity check.
test_file_data = loadTestData(sc)
test_file_data.first()
數據長度: 3171
('http://www.lynnskitchenadventures.com/2009/04/homemade-enchilada-sauce.html',
DenseVector([-0.355, -0.2496, -0.7015, -0.3917, -0.1041, -0.2274, -0.21, -0.059, -0.1056, 0.0, 0.0, 2.3909, -0.2594, -0.1983, 0.1364, -0.021, -0.3888, 0.3429, -0.4867, -0.3604, -0.3208, 0.342, 0.0, 0.2093, -0.1513, -0.1, -0.0436, 0.7933, 0.7491, -0.7269, -0.2042, -0.0052, -0.2303, -0.5689, 0.406, -0.2558]))
加載最終的模型
# Pick the model with the highest AUC from the most recent evaluation table
# (the first such row if several share the maximum).
model = (
    evaluate_table.loc[evaluate_table["AUC"] == evaluate_table["AUC"].max(), "Model"]
    .values[0]
)
使用模型進行預測
# Random split with weights 10 : 3161, so the first part holds only a small
# random sample of the test rows; print each sampled URL with the model's
# prediction as a boolean.
sampled_rows = test_file_data.randomSplit([10, 3171 - 10])[0]
for url, features in sampled_rows.collect():
    print(url, bool(model.predict(features)))
http://www.youbeauty.com/body-fitness/dressing-for-your-body-type?page=2 False
http://www.couponingncooking.com/2012/03/super-easy-whole-chicken-in-crock-pot.html True
http://www.rsvlts.com/2012/08/04/inside-the-london-olympics-week-one-62-high-quality-photos/ False
http://backtoherroots.com/2011/08/04/90-second-nutella-chocolate-cake/ True
http://cathlincooks.blogspot.com/ True
http://www.cheapcooking.com/articles/healthy-school-lunch-ideas.htm True
http://www.ted.com/index.php/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen.html False
http://www.break.com/index/hot-girls-risky-business-fail.html True
http://www.salon.com/2010/04/03/toasted_peeps_brulee_recipe/ True
http://www.joepastry.com/category/pastry/charlotte/ True
http://www.behance.net/leon_farrant/frame/2878481 True
http://www.wimp.com/pageturner/ False