pyspark構建簡單模型(RandomForest&LogisticRegression)

本文記錄了用pyspark構建一個簡單的模型的過程。

1. 讀取數據集

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark import SparkContext, SparkConf

# One shared configuration: local master, single application name.
conf = SparkConf().setAppName("Spark_mllearn_example").setMaster("local")
sc = SparkContext(conf=conf)
# Build the session from the same conf instead of repeating master/appName.
# The former placeholder `.config("", "")` only registered an empty key and
# has been dropped.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Directory that holds the example CSV; the first row supplies column names.
# No schema inference is requested here; the numeric columns are cast
# explicitly in the next step.
dpath = '/Users/huoshirui/Desktop/Spark/'
df = spark.read.csv(dpath + 'spark_mllearn_test.csv', header=True)

數據集如下圖:
這裏寫圖片描述

2. 將數據集轉換成可以用於模型使用的features/label的形式

# Columns that are converted to double before entering the ML stages.
numeric_cols = ['c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9',
                'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16']
for col_name in numeric_cols:
    # withColumn with an existing name replaces that column in place.
    df = df.withColumn(col_name, df[col_name].cast('double'))

# Every converted column except c4 is a feature; c4 is indexed into "label".
input_col = [col_name for col_name in numeric_cols if col_name != 'c4']

vecAssembler = VectorAssembler(outputCol="features", inputCols=input_col)
stringIndexer = StringIndexer(outputCol="label", inputCol="c4")

# Assemble the feature vector first, then index the label column.
pipeline = Pipeline(stages=[vecAssembler, stringIndexer])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

以下爲轉換後的數據集的結果:
這裏寫圖片描述

3.劃分訓練集和測試集

這裏將數據中的70%作爲訓練集,剩下的30%作爲測試集

# 70/30 split of the prepared dataset; 123 is the seed handed to randomSplit.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=123)

# Report the size of each split, training first.
for title, part in (("Training Dataset Count: ", trainingData),
                    ("Test Dataset Count: ", testData)):
    print(title + str(part.count()))

4.模型訓練

  1. 首先使用LogisticRegression模型訓練
# Model training: logistic regression with at most 20 iterations,
# regParam 0.3 and elasticNetParam 0.
lr = LogisticRegression(
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)
# Fit on the training split only; the test split is kept for evaluation.
lrModel = lr.fit(trainingData)

用訓練好的模型在測試集上預測

# Model prediction: score the held-out split with the fitted model.
prediction = lrModel.transform(testData)

# ROC score: the metric is named explicitly, and the value is kept and
# printed instead of being computed and discarded.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
lr_auc = evaluator.evaluate(prediction)
print("LogisticRegression test areaUnderROC: " + str(lr_auc))

使用十折交叉驗證

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation: 3 regParam x 3 elasticNetParam values.
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.1, 0.3, 0.5])  # regularization parameter
        .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])  # Elastic Net Parameter (Ridge = 0)
        .build())

# A single evaluator is used both for model selection inside the
# cross-validation and for the final score on the test split.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

# Create 10-fold CrossValidator. The seed pins the fold assignment so that
# reruns compare the same folds (123 matches the seed used for the split).
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=10,
                    seed=123)
cvModel = cv.fit(trainingData)

# Evaluate best model on the held-out split and report the result.
predictions = cvModel.transform(testData)
cv_lr_auc = evaluator.evaluate(predictions)
print("LogisticRegression (10-fold CV) test areaUnderROC: " + str(cv_lr_auc))
  2. 使用隨機森林訓練
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Build the model: random forest with a fixed seed.
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30, labelCol="label", seed=123)

# Grid for the 10-fold cross-validation:
# 3 numTrees x 4 maxDepth x 3 maxBins = 36 candidate settings.
grid = (ParamGridBuilder().addGrid(rf.numTrees, [1, 3, 5])
                          .addGrid(rf.maxDepth, [3, 5, 7, 10])
                          .addGrid(rf.maxBins, [20, 30, 40])
                          .build())
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

# The seed pins the fold assignment so that reruns compare the same folds
# (123 matches the seed already used for the forest itself).
cv = CrossValidator(estimator=rf,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=10,
                    seed=123)
cvModel_rf = cv.fit(trainingData)

# Model prediction / ROC: score the held-out split with the best model and
# report the value instead of discarding it.
predictions = cvModel_rf.transform(testData)
rf_auc = evaluator.evaluate(predictions)
print("RandomForest (10-fold CV) test areaUnderROC: " + str(rf_auc))
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章