本文記錄了用pyspark構建一個簡單的模型的過程。
1. 讀取數據集
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark import SparkContext, SparkConf
# Local-mode Spark configuration for this walkthrough.
conf = SparkConf().setAppName("Spark_mllearn_example").setMaster("local")
sc = SparkContext(conf=conf)
# Build the session with the same master and app name as `conf`. The previous
# .config("", "") call registered an option with an empty key and an empty
# value, which configures nothing, so it has been dropped.
spark = SparkSession.builder.master("local").appName("Spark_mllearn_example").getOrCreate()
# Directory holding the example CSV (machine-specific path; adjust as needed).
dpath = '/Users/huoshirui/Desktop/Spark/'
# header=True takes the column names from the first row of the file. No schema
# inference is requested here; the columns are cast explicitly further down.
df = spark.read.csv(dpath + 'spark_mllearn_test.csv', header=True)
數據集如下圖:
2. 將數據集轉換成模型可以使用的 features/label 形式
# Cast columns c2..c16 to double. Driving the casts from one list replaces
# fifteen hand-copied withColumn lines and keeps the feature list below in sync.
numeric_cols = ['c{}'.format(i) for i in range(2, 17)]
for col_name in numeric_cols:
    df = df.withColumn(col_name, df[col_name].cast('double'))
# c4 is the target column, so every cast column except c4 is used as a feature.
input_col = [col_name for col_name in numeric_cols if col_name != 'c4']
# Pack the feature columns into a single vector column named "features".
vecAssembler = VectorAssembler(inputCols=input_col, outputCol="features")
# Index the values of c4 into the "label" column.
stringIndexer = StringIndexer(inputCol="c4", outputCol="label")
# Fit both stages on the full DataFrame, then apply them to it.
pipeline = Pipeline(stages=[vecAssembler, stringIndexer])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
以下爲轉換後的數據集的結果:
3. 劃分訓練集和測試集
這裏將數據中的70%作爲訓練集,剩下的30%作爲測試集
# Split the prepared dataset 70% / 30% into training and test sets, seeded with 123.
split_weights = [0.7, 0.3]
trainingData, testData = dataset.randomSplit(split_weights, seed=123)
# Report the size of each split.
training_count = trainingData.count()
print("Training Dataset Count: " + str(training_count))
test_count = testData.count()
print("Test Dataset Count: " + str(test_count))
4. 模型訓練
- 首先使用LogisticRegression模型訓練
# Model training: configure the logistic-regression estimator, then fit it
# on the training split.
lr_settings = dict(maxIter=20, regParam=0.3, elasticNetParam=0)
lr = LogisticRegression(**lr_settings)
lrModel = lr.fit(trainingData)
用訓練好的模型在測試集上預測
# Model prediction: score the test split with the fitted model.
prediction = lrModel.transform(testData)
# ROC score. The evaluator's default settings are written out explicitly here;
# this is equivalent to BinaryClassificationEvaluator() with no arguments.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator.evaluate(prediction)
使用十折交叉驗證
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create the parameter grid searched during cross-validation.
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.1, 0.3, 0.5])  # regularization parameter
        .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])  # Elastic Net parameter (Ridge = 0)
        .build())
# A single evaluator is used both for model selection inside the
# CrossValidator and for the final test-set score below.
evaluator = BinaryClassificationEvaluator()
# Create the 10-fold CrossValidator. The seed is fixed so the run is
# repeatable, in line with the seeded train/test split.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=10,
                    seed=123)
cvModel = cv.fit(trainingData)
predictions = cvModel.transform(testData)
# Evaluate the best model on the test split.
evaluator.evaluate(predictions)
- 使用隨機森林訓練
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Build the random-forest estimator (seeded).
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30, labelCol="label", seed=123)
# Parameter grid for 10-fold cross-validation.
grid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [1, 3, 5])
        .addGrid(rf.maxDepth, [3, 5, 7, 10])
        .addGrid(rf.maxBins, [20, 30, 40])
        .build())
evaluator = BinaryClassificationEvaluator()
# The forest itself is seeded; seed the CrossValidator as well so the whole
# search is repeatable.
cv = CrossValidator(estimator=rf,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=10,
                    seed=123)
cvModel_rf = cv.fit(trainingData)
# Model prediction and ROC score on the test split.
predictions = cvModel_rf.transform(testData)
evaluator.evaluate(predictions)