1. 讀取數據集

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Spark_mllearn_example").setMaster("local")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.master("local").appName("Spark_mllearn_example").config("", "").getOrCreate()

dpath = '/Users/huoshirui/Desktop/Spark/'
df = spark.read.csv(dpath + 'spark_mllearn_test.csv', header=True)

數據集如下圖：

2. 將數據集轉換成可以用於模型使用的features/label的形式

df = df.withColumn('c2', df['c2'].cast('double'))\
       .withColumn('c3', df['c3'].cast('double'))\
       .withColumn('c4', df['c4'].cast('double'))\
       .withColumn('c5', df['c5'].cast('double'))\
       .withColumn('c6', df['c6'].cast('double'))\
       .withColumn('c7', df['c7'].cast('double'))\
       .withColumn('c8', df['c8'].cast('double'))\
       .withColumn('c9', df['c9'].cast('double'))\
       .withColumn('c10', df['c10'].cast('double'))\
       .withColumn('c11', df['c11'].cast('double'))\
       .withColumn('c12', df['c12'].cast('double'))\
       .withColumn('c13', df['c13'].cast('double'))\
       .withColumn('c14', df['c14'].cast('double'))\
       .withColumn('c15', df['c15'].cast('double'))\
       .withColumn('c16', df['c16'].cast('double'))

input_col = ['c2', 'c3', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16']
vecAssembler = VectorAssembler(inputCols=input_col, outputCol="features")
stringIndexer = StringIndexer(inputCol="c4", outputCol="label")
pipeline = Pipeline(stages=[vecAssembler, stringIndexer])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

以下爲轉換後的數據集的結果：

3.劃分訓練集和測試集

這裏將數據中的70%作爲訓練集，剩下的30%作爲測試集

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], 123)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

4.模型訓練

首先使用LogisticRegression模型訓練

# 模型訓練
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

用訓練好的模型在測試集上預測

# 模型預測
prediction = lrModel.transform(testData)

# ROC score
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(prediction)

使用十折交叉驗證

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
grid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())
evaluator = BinaryClassificationEvaluator()
# Create 10-fold CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=10)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
#evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

使用使用隨機森林訓練

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# 構建模型
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30, labelCol="label", seed=123)
# 十折交叉驗證
grid = (ParamGridBuilder().addGrid(rf.numTrees, [1, 3, 5])
                          .addGrid(rf.maxDepth, [3, 5, 7, 10])
                          .addGrid(rf.maxBins, [20, 30, 40])
                          .build())
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=rf,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=10)
cvModel_rf = cv.fit(trainingData)

# 模型預測 ROC
predictions = cvModel_rf.transform(testData)
evaluator.evaluate(predictions)

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

pyspark構建簡單模型(RandomForest&LogisticRegression)

1. 讀取數據集

2. 將數據集轉換成可以用於模型使用的features/label的形式

3.劃分訓練集和測試集

4.模型訓練

資源管理與調度系統YARN(YARN基本架構及原理)

python進程（一）

RNN

tensorflow卷積層&池化層

keras中的卷積層&池化層

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結