This walkthrough builds a complete classification model, using logistic regression as the running example.
Dataset download: http://www.kaggle.com/c/stumbleupon
The dataset poses a binary classification problem: will a web page recommended to users be ephemeral or remain popular for a long time ("evergreen")? A target value of 1 means evergreen and 0 means ephemeral.
First, strip the header row from the data and redirect the output to a file named train_noheader.tsv:

    sed 1d train.tsv > train_noheader.tsv
Start the spark-shell:

    spark-shell --driver-memory 4g
Read the training data into an RDD and inspect the first record:

    val rawData = sc.textFile("train_noheader.tsv")
    val records = rawData.map(line => line.split("\t"))
    records.first
Data preprocessing:

    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.linalg.Vectors

    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", "")) // strip the extra quote characters
      val label = trimmed(r.size - 1).toInt       // convert the label to an integer
      // replace "?" (which marks missing values) with 0.0
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features)) // store the label and dense feature vector
    }
Cache the data and count the samples:

    data.cache
    val numData = data.count
    // numData: Long = 7395 (consistent with the 3806 correct predictions at 51.467% accuracy below)
Train the logistic regression classification model:

    import org.apache.spark.mllib.classification.LogisticRegressionWithSGD

    // numIterations is not defined in the original snippet; 10 is assumed here,
    // matching the "10 iterations" row of the tuning results later on
    val numIterations = 10
    val lrModel = LogisticRegressionWithSGD.train(data, numIterations)
Use the classification model:

    val dataPoint = data.first
    val prediction = lrModel.predict(dataPoint.features)
    // prediction: Double = 1.0 -- predicted evergreen
    val trueLabel = dataPoint.label
    // trueLabel: Double = 0.0 -- actually ephemeral, so this prediction is wrong
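The model can also score an entire RDD of feature vectors at once; a minimal sketch (the take(5) inspection is illustrative, not part of the original):

    // bulk prediction over all training examples
    val predictions = lrModel.predict(data.map(lp => lp.features))
    predictions.take(5).foreach(println)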
Evaluate model performance
Prediction accuracy (the number of correctly classified training samples divided by the total number of samples):
    val lrTotalCorrect = data.map { point =>
      if (lrModel.predict(point.features) == point.label) 1 else 0
    }.sum
    // lrTotalCorrect: Double = 3806.0
    val lrAccuracy = lrTotalCorrect / numData
    // lrAccuracy: Double = 0.5146720757268425
    // about 51.5% accuracy -- barely better than random guessing
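To see why roughly 51.5% is unimpressive, compare it with the trivial baseline of always predicting one class; a quick check along these lines (a sketch, not from the original):

    // fraction of positive (evergreen) examples = accuracy of a model that always predicts 1
    val positiveFraction = data.filter(_.label == 1.0).count.toDouble / numData
    println(positiveFraction)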
Model evaluation metrics: the areas under the precision-recall (PR) curve and the ROC curve
    import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

    // compute the metrics; svmModel is a support vector machine trained earlier
    // (not shown in this excerpt), e.g. SVMWithSGD.train(data, numIterations)
    val metrics = Seq(lrModel, svmModel).map { model =>
      val scoreAndLabels = data.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    } // computes the metrics for both the logistic regression and SVM models
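Beyond the scalar areas, BinaryClassificationMetrics can also return the underlying curve points, which is useful for plotting; a brief sketch (the lrScoreAndLabels and lrMetrics names are illustrative):

    val lrScoreAndLabels = data.map(p => (lrModel.predict(p.features), p.label))
    val lrMetrics = new BinaryClassificationMetrics(lrScoreAndLabels)
    lrMetrics.pr().take(5).foreach(println)  // (recall, precision) points
    lrMetrics.roc().take(5).foreach(println) // (false positive rate, true positive rate) points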
For the logistic regression model, the area under PR is about 75% and the area under ROC is about 50%; an AUC of 50% is no better than random guessing, so the model performs poorly.
Improving the model and tuning parameters
Inspect summary statistics of the features:
    import org.apache.spark.mllib.linalg.distributed.RowMatrix

    val vectors = data.map(lp => lp.features)
    val matrix = new RowMatrix(vectors)
    // column-wise summary statistics of the feature matrix
    val matrixSummary = matrix.computeColumnSummaryStatistics()
    println(matrixSummary.mean)
    println(matrixSummary.min)
    println(matrixSummary.max)
    println(matrixSummary.variance)
    println(matrixSummary.numNonzeros)
Feature standardization (the statistics show the features span very different scales, so transform each to zero mean and unit variance):

    import org.apache.spark.mllib.feature.StandardScaler

    // withMean and withStd set to true: subtract the column mean, divide by the standard deviation
    val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors)
    val scaledData = data.map(lp => LabeledPoint(lp.label, scaler.transform(lp.features)))
    println(scaledData.first.features) // inspect the standardized data
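Per feature, the scaler simply computes (x - mean) / stddev, which can be verified by hand against the summary statistics computed earlier; a small sketch (not from the original):

    val raw0 = data.first.features(0)
    val mean0 = matrixSummary.mean(0)
    val std0 = math.sqrt(matrixSummary.variance(0))
    println((raw0 - mean0) / std0)        // should match the scaler's output for this entry
    println(scaledData.first.features(0))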
Retrain the model:

    val lrModelScaled = LogisticRegressionWithSGD.train(scaledData, numIterations)
    val lrTotalCorrectScaled = scaledData.map { point =>
      if (lrModelScaled.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracyScaled = lrTotalCorrectScaled / numData
    // lrAccuracyScaled: Double = 0.6204192021636241
    val lrPredictionsVsTrue = scaledData.map { point =>
      (lrModelScaled.predict(point.features), point.label)
    }
    val lrMetricsScaled = new BinaryClassificationMetrics(lrPredictionsVsTrue)
    val lrPr = lrMetricsScaled.areaUnderPR
    val lrRoc = lrMetricsScaled.areaUnderROC
    println(f"${lrModelScaled.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaled * 100}%2.4f%%\nArea under PR: ${lrPr * 100.0}%2.4f%%\nArea under ROC: ${lrRoc * 100.0}%2.4f%%")
    /*
    LogisticRegressionModel
    Accuracy: 62.0419%
    Area under PR: 72.7254%
    Area under ROC: 61.9663%
    */
    // simply standardizing the features already raises accuracy from 51.5% to 62.0%
Consider additional features: the contents of the category and boilerplate columns have not been used so far.
Add the category column: assign each category an index and encode it with 1-of-k (one-hot) encoding.
    // map each category name to an index
    val categories = records.map(r => r(3)).distinct.collect.zipWithIndex.toMap
    // categories: scala.collection.immutable.Map[String,Int] = Map("weather" -> 0, "sports" -> 6,
    //   "unknown" -> 4, "computer_internet" -> 12, "?" -> 11, "culture_politics" -> 3, "religion" -> 8,
    //   "recreation" -> 2, "arts_entertainment" -> 9, "health" -> 5, "law_crime" -> 10, "gaming" -> 13,
    //   "business" -> 1, "science_technology" -> 7)
    val numCategories = categories.size
    // numCategories: Int = 14
    val dataCategories = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val categoryIdx = categories(r(3))
      val categoryFeatures = Array.ofDim[Double](numCategories)
      categoryFeatures(categoryIdx) = 1.0 // 1-of-k: set only the slot for this record's category
      val otherFeatures = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      val features = categoryFeatures ++ otherFeatures
      LabeledPoint(label, Vectors.dense(features))
    }
    println(dataCategories.first)
    // LabeledPoint(0.0, [0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,
    //   0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,
    //   0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])
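A quick sanity check on the encoding (illustrative, not from the original): among the first numCategories entries exactly one slot should be 1.0, and for this first record it is slot 1, matching "business" in the category map above:

    val catSlots = dataCategories.first.features.toArray.take(numCategories)
    println(catSlots.sum)          // expected: 1.0
    println(catSlots.indexOf(1.0)) // expected: 1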
Standardize:

    val scalerCats = new StandardScaler(withMean = true, withStd = true).fit(dataCategories.map(lp => lp.features))
    val scaledDataCats = dataCategories.map(lp => LabeledPoint(lp.label, scalerCats.transform(lp.features)))
    println(scaledDataCats.first.features)
    /*
    [-0.023261105535492967,2.720728254208072,-0.4464200056407091,-0.2205258360869135,-0.028492999745483565,
    -0.2709979963915644,-0.23272692307249684,-0.20165301179556835,-0.09914890962355712,-0.381812077600508,
    -0.06487656833429316,-0.6807513271391559,-0.2041811690290381,-0.10189368073492189,1.1376439023494747,
    -0.08193556218743517,1.0251347662842047,-0.0558631837375738,-0.4688883677664047,-0.35430044806743044,
    -0.3175351615705111,0.3384496941616097,0.0,0.8288021759842215,-0.14726792180045598,0.22963544844991393,
    -0.14162589530918376,0.7902364255801262,0.7171932152231301,-0.29799680188379124,-0.20346153667348232,
    -0.03296720969318916,-0.0487811294839849,0.9400696843533806,-0.10869789547344721,-0.2788172632659348]
    */
Retrain the model once more:

    val lrModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats, numIterations)
    val lrTotalCorrectScaledCats = scaledDataCats.map { point =>
      if (lrModelScaledCats.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracyScaledCats = lrTotalCorrectScaledCats / numData
    val lrPredictionsVsTrueCats = scaledDataCats.map { point =>
      (lrModelScaledCats.predict(point.features), point.label)
    }
    val lrMetricsScaledCats = new BinaryClassificationMetrics(lrPredictionsVsTrueCats)
    val lrPrCats = lrMetricsScaledCats.areaUnderPR
    val lrRocCats = lrMetricsScaledCats.areaUnderROC
    println(f"${lrModelScaledCats.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaledCats * 100}%2.4f%%\nArea under PR: ${lrPrCats * 100.0}%2.4f%%\nArea under ROC: ${lrRocCats * 100.0}%2.4f%%")
    /*
    LogisticRegressionModel
    Accuracy: 66.5720%
    Area under PR: 75.7964%
    Area under ROC: 66.5483%
    */
    // adding the category feature improves accuracy again, from 62.0% to 66.6%
Tuning model parameters:

    import org.apache.spark.rdd.RDD
    import org.apache.spark.mllib.optimization.Updater
    import org.apache.spark.mllib.optimization.SimpleUpdater
    import org.apache.spark.mllib.optimization.L1Updater
    import org.apache.spark.mllib.optimization.SquaredL2Updater
    import org.apache.spark.mllib.classification.ClassificationModel

    // helper: train a logistic regression model on the given input with the given parameters
    def trainWithParams(input: RDD[LabeledPoint], regParam: Double, numIterations: Int,
        updater: Updater, stepSize: Double) = {
      val lr = new LogisticRegressionWithSGD
      lr.optimizer.setNumIterations(numIterations).setUpdater(updater)
        .setRegParam(regParam).setStepSize(stepSize)
      lr.run(input)
    }

    // helper: compute the AUC for the given data and classification model
    def createMetrics(label: String, data: RDD[LabeledPoint], model: ClassificationModel) = {
      val scoreAndLabels = data.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (label, metrics.areaUnderROC)
    }
Cache the data:

    scaledDataCats.cache
Vary the number of iterations:

    val iterResults = Seq(1, 5, 10, 50).map { param =>
      val model = trainWithParams(scaledDataCats, 0.0, param, new SimpleUpdater, 1.0)
      createMetrics(s"$param iterations", scaledDataCats, model)
    }
    iterResults.foreach { case (param, auc) =>
      println(f"$param, AUC = ${auc * 100}%2.2f%%")
    }
    /*
    1 iterations, AUC = 64.97%
    5 iterations, AUC = 66.62%
    10 iterations, AUC = 66.55%
    50 iterations, AUC = 66.81%
    */
    // once the iteration count is large enough, adding more has little effect
Vary the step size:

    val stepResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainWithParams(scaledDataCats, 0.0, numIterations, new SimpleUpdater, param)
      createMetrics(s"$param step size", scaledDataCats, model)
    }
    stepResults.foreach { case (param, auc) =>
      println(f"$param, AUC = ${auc * 100}%2.2f%%")
    }
    /*
    0.001 step size, AUC = 64.95%
    0.01 step size, AUC = 65.00%
    0.1 step size, AUC = 65.52%
    1.0 step size, AUC = 66.55%
    10.0 step size, AUC = 61.92%
    */
    // too large a step size actually hurts accuracy
Regularization: vary the regularization parameter (SquaredL2Updater applies L2 regularization):

    val regResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainWithParams(scaledDataCats, param, numIterations, new SquaredL2Updater, 1.0)
      createMetrics(s"$param L2 regularization parameter", scaledDataCats, model)
    }
    regResults.foreach { case (param, auc) =>
      println(f"$param, AUC = ${auc * 100}%2.2f%%")
    }
    /*
    0.001 L2 regularization parameter, AUC = 66.55%
    0.01 L2 regularization parameter, AUC = 66.55%
    0.1 L2 regularization parameter, AUC = 66.63%
    1.0 L2 regularization parameter, AUC = 66.04%
    10.0 L2 regularization parameter, AUC = 35.33%
    */
    // over-regularizing (10.0) collapses performance
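The imports above also bring in L1Updater, though the text only exercises L2. The analogous L1 sweep would look like the following sketch (the AUC values it prints are not reported in the original):

    val regResultsL1 = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainWithParams(scaledDataCats, param, numIterations, new L1Updater, 1.0)
      createMetrics(s"$param L1 regularization parameter", scaledDataCats, model)
    }
    regResultsL1.foreach { case (param, auc) =>
      println(f"$param, AUC = ${auc * 100}%2.2f%%")
    }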
Cross-validation: split the data into train and test sets.

    // 60/40 train/test split with a fixed random seed
    val trainTestSplit = scaledDataCats.randomSplit(Array(0.6, 0.4), 123)
    val train = trainTestSplit(0)
    val test = trainTestSplit(1)
Tune the regularization parameter, evaluating on the test set:

    val regResultsTest = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
      val model = trainWithParams(train, param, numIterations, new SquaredL2Updater, 1.0)
      createMetrics(s"$param L2 regularization parameter", test, model)
    }
    regResultsTest.foreach { case (param, auc) =>
      println(f"$param, AUC = ${auc * 100}%2.6f%%")
    }
    /*
    0.0 L2 regularization parameter, AUC = 66.480874%
    0.001 L2 regularization parameter, AUC = 66.480874%
    0.0025 L2 regularization parameter, AUC = 66.515027%
    0.005 L2 regularization parameter, AUC = 66.515027%
    0.01 L2 regularization parameter, AUC = 66.549180%
    */
For comparison, compute the same metrics on the training set:

    val regResultsTrain = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
      val model = trainWithParams(train, param, numIterations, new SquaredL2Updater, 1.0)
      createMetrics(s"$param L2 regularization parameter", train, model)
    }
    regResultsTrain.foreach { case (param, auc) =>
      println(f"$param, AUC = ${auc * 100}%2.6f%%")
    }
    /*
    0.0 L2 regularization parameter, AUC = 66.260311%
    0.001 L2 regularization parameter, AUC = 66.260311%
    0.0025 L2 regularization parameter, AUC = 66.260311%
    0.005 L2 regularization parameter, AUC = 66.238294%
    0.01 L2 regularization parameter, AUC = 66.238294%
    */
Smaller regularization parameters score slightly better on the training set, but small values are also the ones most prone to overfitting.
In cross-validation, one generally selects the parameter that performs best on the test set, and then uses that setting to make predictions on new data.
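As a final step, one might fix the best-performing parameter from the test-set sweep (0.01 here) and apply the resulting model to held-out data; a minimal sketch of that follow-up (not part of the original):

    val bestRegParam = 0.01 // best test-set AUC in the sweep above
    val finalModel = trainWithParams(train, bestRegParam, numIterations, new SquaredL2Updater, 1.0)
    // score the held-out test examples with the chosen model
    finalModel.predict(test.map(_.features)).take(5).foreach(println)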