一、數據預處理
1、加載數據
# 導入包
import os
import time
from pyspark.sql import SparkSession
# Instantiate the SparkSession; the Spark program runs in local mode
# (master "local[4]").
spark = (
    SparkSession.builder
    .appName("PySpark_ML_Pipeline")
    .master("local[4]")
    .getOrCreate()
)
# print() with a single argument behaves identically on Python 2 and 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(spark)
print(spark.sparkContext)
'''
<pyspark.sql.session.SparkSession object at 0x00000000066CB5C0>
<SparkContext master=local[4] appName=PySpark_ML_Pipeline>
'''
2、SparkSession讀取CSV格式文件
help(spark.read.csv)
# Read the tab-separated data set; the first row is treated as the header
# and column types are inferred from the data.
raw_df = spark.read.csv('./datas/train.tsv', header='true', sep='\t',
                        inferSchema='true')
# Show the number of records (print() works on both Python 2 and 3)
print(raw_df.count())
==>7395
raw_df.printSchema()
# There are too many columns to display, so preview only a few of them.
preview_columns = ['url', 'alchemy_category', 'alchemy_category_score',
                   'label']
raw_df.select(*preview_columns).show(10)
3、清洗數據
# Helper that converts the '?' placeholder to '0'
def replace_question_func(x):
    """Return ``'0'`` when *x* is the ``'?'`` placeholder, else *x* unchanged.

    The result stays a string; callers cast it to a numeric type afterwards.
    """
    return '0' if x == '?' else x
# Register the Python function as a Spark UDF
from pyspark.sql.functions import udf
replace_question = udf(replace_question_func)
# col() turns a column name (string) into a Column of the DataFrame
from pyspark.sql.functions import col
# Apply the UDF to every column from index 4 onwards, cast the result to
# double and keep the original column name.
numeric_columns = [
    replace_question(col(name)).cast('double').alias(name)
    for name in raw_df.columns[4:]
]
df = raw_df.select(['url', 'alchemy_category'] + numeric_columns)
df.printSchema()
df.select('url', 'alchemy_category', 'alchemy_category_score',
          'label').show(10)
# Split the data set into a training set and a test set (weights 0.7 / 0.3)
train_df, test_df = df.randomSplit([0.7, 0.3])
# Cache both splits and print their sizes (print() works on Python 2 and 3)
print(train_df.cache().count())
print(test_df.cache().count())
"""
5216
2179
"""
4、特徵處理
1、alchemy_category
類別特徵數據轉換
第一特徵轉換器、StringIndexer
將文字的類別特徵 轉換 數字
第二特徵轉換器、OneHotEncoder
將數值的 類別特徵字段 轉換爲 多個字段的Vector
2、特徵的組合
第三特徵轉換器、VectorAssembler
將多個特徵整合到一起
4.1、StringIndexer
網址:http://spark.apache.org/docs/2.2.0/ml-features.html#stringindexer
# 導入模塊
from pyspark.ml.feature import StringIndexer
help(StringIndexer)
# Create the StringIndexer instance.
# Parameters:
#   inputCol  -> name of the column to convert
#   outputCol -> name of the converted column
categoryIndexer = StringIndexer(inputCol='alchemy_category',
                                outputCol='alchemy_category_index')
# print() works on both Python 2 and 3
print(type(categoryIndexer))
"""
==><class 'pyspark.ml.feature.StringIndexer'>
"""
調用StringIndexer類中的 fit 方法,獲取到轉換器Transformer
# fit() returns the fitted transformer (the indexer is fitted on the full df)
categoryTransformer = categoryIndexer.fit(df)
print(type(categoryTransformer))
# Use the categoryTransformer to transform all of train_df
df1 = categoryTransformer.transform(train_df)
df1.select('alchemy_category', 'alchemy_category_index').show(10)
"""
+------------------+----------------------+
| alchemy_category|alchemy_category_index|
+------------------+----------------------+
| ?| 0.0|
|arts_entertainment| 2.0|
| ?| 0.0|
| business| 3.0|
|arts_entertainment| 2.0|
| ?| 0.0|
| ?| 0.0|
| recreation| 1.0|
| business| 3.0|
|arts_entertainment| 2.0|
+------------------+----------------------+
only showing top 10 rows
"""
df1.printSchema() #查看結構數據
4.2、OneHotEncoder
OneHotEncoder可以將一個數值的類別特徵字段轉換爲多個字段的Vector向量
from pyspark.ml.feature import OneHotEncoder
# Create the OneHotEncoder instance
encoder = OneHotEncoder(inputCol='alchemy_category_index',
                        outputCol='alchemy_category_index_vector')
# print() works on both Python 2 and 3
print(type(encoder))
"""
<class 'pyspark.ml.feature.OneHotEncoder'>
"""
# Encode the indexed category column and compare the three representations
df2 = encoder.transform(df1)
df2.printSchema()
df2.select('alchemy_category', 'alchemy_category_index',
           'alchemy_category_index_vector').show(10)
4.3、VectorAssembler
特徵的組合
第三特徵轉換器、VectorAssembler,將多個特徵整合到一起
from pyspark.ml.feature import VectorAssembler
# Feature columns: the one-hot category vector plus the columns from index 4
# up to, but excluding, the last column of raw_df.
assembler_inputs = ['alchemy_category_index_vector'] + raw_df.columns[4:-1]
# print() works on both Python 2 and 3
print(assembler_inputs)
"""
['alchemy_category_index_vector', 'alchemy_category_score',
'avglinksize', 'commonlinkratio_1', 'commonlinkratio_2',
'commonlinkratio_3', 'commonlinkratio_4', 'compression_ratio',
'embed_ratio', 'framebased', 'frameTagRatio', 'hasDomainLink',
'linkwordscore', 'news_front_page', 'non_markup_alphanum_characters',
'numberOfLinks', 'numwords_in_url', 'parametrizedLinkRatio',
'spelling_errors_ratio']
"""
# Create the VectorAssembler instance: inputCols names the columns to merge,
# outputCol names the resulting vector column.
assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol='features',
)
df3 = assembler.transform(df2)
df3.printSchema()
"""
+--------------------+-----+
| features|label|
+--------------------+-----+
|(35,[0,14,15,16,1...| 1.0|
|(35,[2,13,14,15,1...| 1.0|
|(35,[0,14,15,19,2...| 0.0|
|(35,[3,13,14,15,1...| 1.0|
|(35,[2,13,14,15,1...| 0.0|
+--------------------+-----+
only showing top 5 rows
"""
df3.select('features').take(1)
"""
[Row(features=SparseVector(35,
{0: 1.0, 14: 2.1446, 15: 0.7969, 16: 0.3945, 17: 0.332,
18: 0.3203, 19: 0.5022, 22: 0.028, 24: 0.1898, 25: 0.2354,
26: 1.0, 27: 1.0, 28: 17.0, 30: 10588.0, 31: 256.0, 32:
5.0, 33: 0.3828, 34: 0.1368}))]
"""
二、建模
分類決策樹DecisionTreeClassifier
from pyspark.ml.classification import DecisionTreeClassifier
# Use the decision-tree classification algorithm
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='label',
                             impurity='gini', maxDepth=5, maxBins=32)
# Fit the algorithm on the training data
dtc_model = dtc.fit(df3)
# Predict with the fitted model
df4 = dtc_model.transform(df3)
# The chained .show() has to stay inside one expression: a line that starts
# with ".show" after the select(...) call has been closed is a syntax error.
(
    df4.select('label', 'prediction', 'rawPrediction', 'probability')
    .show(20, truncate=False)
)
label | prediction | rawPrediction | probability |
---|---|---|---|
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
0.0 | 0.0 | [38.0,1.0] | [0.9743589743589743,0.02564102564102564] |
1.0 | 1.0 | [27.0,177.0] | [0.1323529411764706,0.8676470588235294] |
0.0 | 0.0 | [95.0,28.0] | [0.7723577235772358,0.22764227642276422] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 0.0 | [144.0,95.0] | [0.602510460251046,0.39748953974895396] |
0.0 | 0.0 | [363.0,146.0] | [0.7131630648330058,0.2868369351669941] |
0.0 | 0.0 | [86.0,23.0] | [0.7889908256880734,0.21100917431192662] |
0.0 | 0.0 | [144.0,95.0] | [0.602510460251046,0.39748953974895396] |
0.0 | 0.0 | [144.0,95.0] | [0.602510460251046,0.39748953974895396] |
0.0 | 0.0 | [43.0,1.0] | [0.9772727272727273,0.022727272727272728] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 1.0 | [27.0,177.0] | [0.1323529411764706,0.8676470588235294] |
1.0 | 1.0 | [129.0,417.0] | [0.23626373626373626,0.7637362637362637] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
0.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
only showing top 20 rows
三、評估(ROC曲線)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Create the evaluator, passing the label and raw-prediction column names
evaluator = BinaryClassificationEvaluator(labelCol='label',
                                          rawPredictionCol='rawPrediction')
# Compute the metric (metricName="areaUnderROC")
auc = evaluator.evaluate(df4)
# print() works on both Python 2 and 3
print(auc)
"""
0.6087142511
"""
總結上述開發流程:
1、從原始數據 提取特徵數據
2、特徵數據應用到算法,得到模型
3、使用模型預測數據
4、評估模型
Pipeline:
相當於一個“算法” -> 模型學習器
包含兩部分內容;
-a. Estimator 模型學習器
fit()
-b. Transformer 轉換器
transform()
pipeline = Pipeline(stages=[.....])
model = pipeline.fit(.....)
model.transform(.....)
四、打包(ML Pipeline)
Step 1. 創建流程中 轉換器和 模型學習器
# 1. 導入全部需要 模塊
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
# a. StringIndexer
string_indexer = StringIndexer(
    inputCol='alchemy_category',
    outputCol='alchemy_category_index',
)
# b. OneHotEncoder
one_hot_encoder = OneHotEncoder(
    inputCol='alchemy_category_index',
    outputCol='alchemy_category_index_vector',
)
# c. VectorAssembler
assembler_inputs = ['alchemy_category_index_vector'] + raw_df.columns[4:-1]
vector_assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol='features',
)
# d. DecisionTreeClassifier (the estimator / model learner)
dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='label',
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)
Step 2. 創建Pipeline實例對象
# Stages are listed in the order in which the data is processed
ordered_stages = [string_indexer, one_hot_encoder, vector_assembler, dt]
pipeline = Pipeline(stages=ordered_stages)
pipeline.getStages()
"""
[StringIndexer_43e8b50676a58dad4d05,
OneHotEncoder_4bf2a31a6b4b12aebd78,
VectorAssembler_4429bf16ed1cc6c14207,
DecisionTreeClassifier_451682088ef8fcaa79ae]
"""
step3. Pipeline 數據處理與訓練模型
# Call fit() on the training data to learn the pipeline model.
# NOTE(review): "pipleline" is a misspelling of "pipeline"; the name is kept
# because later statements in this file refer to it.
pipleline_model = pipeline.fit(train_df)
type(pipleline_model) #pyspark.ml.pipeline.PipelineModel
pipleline_model.stages[3]
Step 4. PipelineModel模型預測
predict_df = pipleline_model.transform(test_df)
step5、PipelineModel模型保存與加載
# 保存 模型
pipleline_model.save('./datas/dtc-model')
step6、調用
# Load the saved model
from pyspark.ml.pipeline import PipelineModel
load_pipeline_model = PipelineModel.load('./datas/dtc-model')
load_pipeline_model.stages[3]
# Predict on the test set with the reloaded model
loaded_predictions = load_pipeline_model.transform(test_df)
loaded_predictions.select(
    'label', 'prediction', 'rawPrediction', 'probability'
).show(20, truncate=False)
label | prediction | rawPrediction | probability |
---|---|---|---|
0.0 | 0.0 | [361.0,300.0] | [0.546142208774584,0.45385779122541603] |
1.0 | 0.0 | [144.0,95.0] | [0.602510460251046,0.39748953974895396] |
0.0 | 1.0 | [0.0,8.0] | [0.0,1.0] |
1.0 | 1.0 | [129.0,417.0] | [0.23626373626373626,0.7637362637362637] |
0.0 | 0.0 | [363.0,146.0] | [0.7131630648330058,0.2868369351669941] |
0.0 | 0.0 | [363.0,146.0] | [0.7131630648330058,0.2868369351669941] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 1.0 | [129.0,417.0] | [0.23626373626373626,0.7637362637362637] |
1.0 | 1.0 | [27.0,177.0] | [0.1323529411764706,0.8676470588235294] |
1.0 | 1.0 | [27.0,177.0] | [0.1323529411764706,0.8676470588235294] |
1.0 | 1.0 | [27.0,177.0] | [0.1323529411764706,0.8676470588235294] |
1.0 | 1.0 | [27.0,177.0] | [0.1323529411764706,0.8676470588235294] |
1.0 | 1.0 | [27.0,177.0] | [0.1323529411764706,0.8676470588235294] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
0.0 | 0.0 | [363.0,146.0] | [0.7131630648330058,0.2868369351669941] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 1.0 | [909.0,1104.0] | [0.45156482861400893,0.5484351713859911] |
1.0 | 0.0 | [361.0,300.0] | [0.546142208774584,0.45385779122541603] |
0.0 | 0.0 | [86.0,23.0] | [0.7889908256880734,0.21100917431192662] |
only showing top 20 rows
五、驗證選擇最優模型
5.1、創建 TrainValidationSplit 實例對象
(訓練檢驗分離選擇最優)
導入模塊
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
構建一個 決策樹分類算法 網格參數
"""
調整三個參數:
-1. 不純度度量
-2. 最多深度
-3. 最大分支數
"""
# Build the grid of decision-tree hyper-parameter combinations to try
# (2 impurity measures x 3 depths x 3 bin counts).
param_grid = (
    ParamGridBuilder()
    .addGrid(dt.impurity, ['gini', 'entropy'])
    .addGrid(dt.maxDepth, [5, 10, 20])
    .addGrid(dt.maxBins, [8, 16, 32])
    .build()
)
print(type(param_grid))
# The loop body must be indented; print() works on both Python 2 and 3.
for param in param_grid:
    print(param)
針對二分類創建模型評估器
# Evaluator for binary classification, reading the 'label' and
# 'rawPrediction' columns.
binary_class_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
)
創建 TrainValidationSplit 實例對象
"""
__init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75, seed=None)
參數解釋:
estimator:
模型學習器,針對哪個算法進行調整超參數,這裏是DT
estimatorParamMaps:
算法調整的參數組合
evaluator:
評估模型的評估器,比如二分類的話,使用auc面積
trainRatio:
訓練集與驗證集 所佔的比例,此處的值表示的是 訓練集比例
"""
# Tune dt over param_grid with a single train/validation split;
# trainRatio=0.8 is the fraction of the data used for training.
train_validataion_split = TrainValidationSplit(
    estimator=dt,
    estimatorParamMaps=param_grid,
    evaluator=binary_class_evaluator,
    trainRatio=0.8,
)
type(train_validataion_split)
# pyspark.ml.tuning.TrainValidationSplit
建立新的Pipeline實例對象
# Use train_validataion_split in place of the original dt stage
tvs_stages = [string_indexer, one_hot_encoder, vector_assembler,
              train_validataion_split]
tvs_pipeline = Pipeline(stages=tvs_stages)
# Run data processing and model training (searching for the best model)
tvs_pipeline_model = tvs_pipeline.fit(train_df)
# The fourth stage is the fitted TrainValidationSplit model
best_model = tvs_pipeline_model.stages[3].bestModel
"""
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_\
451682088ef8fcaa79ae) of depth 20 with 1851 nodes
"""
評估最佳模型
# Score the test set with the tuned pipeline and evaluate the predictions
predictions_df = tvs_pipeline_model.transform(test_df)
model_auc = binary_class_evaluator.evaluate(predictions_df)
# print() works on both Python 2 and 3
print(model_auc)
0.649609702764
5.2、Cross-Validation交叉驗證
"""
__init__(self, estimator=None, estimatorParamMaps=None, \
evaluator=None, numFolds=3, seed=None)
假設 K-Fold的CrossValidation交叉驗證 K = 3,將數據分爲3個部分:
1、A + B作爲訓練,C作爲驗證
2、B + C作爲訓練,A作爲驗證
3、A + C作爲訓練,B作爲驗證
"""
# 導入模塊
from pyspark.ml.tuning import CrossValidator
# Build the CrossValidator instance and set its parameters (3 folds)
cross_validator = CrossValidator(
    estimator=dt,
    estimatorParamMaps=param_grid,
    evaluator=binary_class_evaluator,
    numFolds=3,
)
# Create the Pipeline with the cross-validator as its final stage
cv_stages = [string_indexer, one_hot_encoder, vector_assembler,
             cross_validator]
cv_pipeline = Pipeline(stages=cv_stages)
使用 cv_pipeline 進行訓練與驗證(交叉)
cv_pipeline_model = cv_pipeline.fit(train_df)
查看最佳模型
best_model = cv_pipeline_model.stages[3].bestModel
"""
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_ \
451682088ef8fcaa79ae) of depth 10 with 527 nodes
"""
使用測試集評估最佳模型
# Score the test set with the cross-validated pipeline and evaluate it
cv_predictions = cv_pipeline_model.transform(test_df)
cv_model_auc = binary_class_evaluator.evaluate(cv_predictions)
# print() works on both Python 2 and 3
print(cv_model_auc)
六、提升:隨機森林(RF算法)
# 導入隨機森林分類算法模塊
from pyspark.ml.classification import RandomForestClassifier
# Create the RandomForestClassifier instance
rfc = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    numTrees=10,
    featureSubsetStrategy="auto",
    maxDepth=5,
    maxBins=32,
    impurity="gini",
)
# Create the Pipeline instance, reusing the same feature stages
rfc_pipeline = Pipeline(stages=[string_indexer, one_hot_encoder,
                                vector_assembler, rfc])
# Train the model on the training data
rfc_pipeline_model = rfc_pipeline.fit(train_df)
# Predict on the test set and evaluate
rfc_predictions = rfc_pipeline_model.transform(test_df)
rfc_model_auc = binary_class_evaluator.evaluate(rfc_predictions)
# print() works on both Python 2 and 3
print(rfc_model_auc)
"""
0.716242043615
"""