Coursera Big Data Series Course Notes 2

Working with the Cloudera Hadoop virtual machine

Week 18: Spark data preparation for ML (data cleaning for machine learning)


pyspark # launch the web-based notebook
Downloads/big-data-4/handling-missing-values.ipynb

[1] from pyspark.sql import SQLContext ## import SQLContext
sqlContext=SQLContext(sc)     ## create an SQLContext
df=sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv',format='com.databricks.spark.csv',header='true',inferSchema='true') ## read the weather data into a DataFrame
[2] df.describe().toPandas().transpose() ## show summary statistics for the DataFrame
[3] df.describe('air_temp_9am').show()  ## show summary statistics for one column
[4] df.count() ## show the row count
[5] removeALLDF=df.na.drop() ## drop rows that have missing values in any column
[6] removeALLDF.describe('air_temp_9am').show()
[7] removeALLDF.count()
[8] from pyspark.sql.functions import avg
imputeDF=df
[9] for x in imputeDF.columns:
        meanValue=removeALLDF.agg(avg(x)).first()[0]  ## compute the mean of each column
        print(x,meanValue)
        imputeDF=imputeDF.na.fill(meanValue,[x])  ## replace missing values in each column with that column's mean
[10] df.describe('air_temp_9am').show()
imputeDF.describe('air_temp_9am').show()
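As a quick check that the imputation worked (my addition, not part of the course notebook), the remaining nulls in every column can be counted; a minimal sketch:

from pyspark.sql.functions import col, sum as sql_sum

## count null values in each column of imputeDF; every count should now be 0
imputeDF.select([sql_sum(col(c).isNull().cast("integer")).alias(c) for c in imputeDF.columns]).show()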

Week 19: Spark classification algorithms (decision tree classification)


First set the number of processors in the virtual machine image to 2.
pyspark # launch the web-based notebook
Downloads/big-data-4/classification.ipynb

[1] from pyspark.sql import SQLContext ## import the required packages
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
[2] sqlContext = SQLContext(sc) ## create an SQLContext object
df = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv', format='com.databricks.spark.csv', header='true',inferSchema='true')
df.columns
[3] featureColumns = ['air_pressure_9am','air_temp_9am','avg_wind_direction_9am','avg_wind_speed_9am','max_wind_direction_9am','max_wind_speed_9am','rain_accumulation_9am','rain_duration_9am']  ## define the list of feature column names
[4] df=df.drop("number") ## drop the number column
[5] df=df.na.drop() ## drop rows containing NA values
[6] df.count(),len(df.columns) ## count rows and columns
[7] binarizer =Binarizer(threshold=24.99999,inputCol="relative_humidity_3pm",outputCol="label") 
binarizedDF=binarizer.transform(df) ## create a Binarizer and add a binary label column to df: humidity above the threshold becomes 1.0, otherwise 0.0
[8] binarizedDF.select("relative_humidity_3pm","label").show(4) ## show the selected columns
[9] assembler=VectorAssembler(inputCols=featureColumns,outputCol="features") 
assembled=assembler.transform(binarizedDF) ## create a VectorAssembler that combines the feature columns into a single features vector column
[10] (trainingData,testData)=assembled.randomSplit([0.8,0.2],seed=13234) ## randomly split into 80% training and 20% test data
[11] trainingData.count(),testData.count() ## count the training and test rows
[12] dt=DecisionTreeClassifier(labelCol="label",featuresCol="features",maxDepth=5,minInstancesPerNode=20,impurity="gini") ## create a DecisionTreeClassifier
[13] pipeline=Pipeline(stages=[dt]) ## create a Pipeline with the decision tree as its only stage, then fit it on the training data
model=pipeline.fit(trainingData)
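To inspect the fitted tree itself (my addition, not in the course notebook; toDebugString is exposed on the Python side only in newer Spark releases, so treat this as a hedged example):

treeModel = model.stages[0]     ## the DecisionTreeClassificationModel inside the fitted pipeline
print(treeModel.toDebugString)  ## textual description of the learned splits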
[14] predictions=model.transform(testData) ## make predictions on the test data
[15] predictions.select("prediction","label").show(10) 
[16] predictions.select("prediction","label").write.save(path="file:///home/cloudera/Downloads/big-data-4/predictions.csv",  format="com.databricks.spark.csv",header="true") ## save the predictions as CSV

Week 20: Spark evaluation of the decision tree model


pyspark # launch the web-based notebook
Downloads/big-data-4/model-evaluation.ipynb

[1] from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
[2] sqlContext = SQLContext(sc) ## create an SQLContext and load the saved predictions into a DataFrame
predictions = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/predictions.csv', format='com.databricks.spark.csv', header='true',inferSchema='true')
[3] evaluator = MulticlassClassificationEvaluator(
    labelCol="label",predictionCol="prediction",metricName="precision")  ## create an evaluator; the "precision" metric here gives the overall accuracy
[4] accuracy=evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy)) ## print the accuracy
[5] predictions.rdd.take(2) ## look at the first two rows of the underlying RDD
[6] predictions.rdd.map(tuple).take(2) ## convert each row to a (prediction, label) tuple
[7] metrics = MulticlassMetrics(predictions.rdd.map(tuple)) ## build MulticlassMetrics from the (prediction, label) tuples
[8] metrics.confusionMatrix().toArray().transpose() ## get the confusion matrix as a NumPy array (transposed)
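As a sanity check (my addition, not part of the course notebook), the accuracy can be recomputed from the confusion matrix, since correct predictions sit on its diagonal:

import numpy as np

cm = metrics.confusionMatrix().toArray()
print(np.trace(cm) / cm.sum())  ## diagonal (correct) divided by total; should match the evaluator's accuracy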

Week 21: Spark cluster analysis (K-means clustering)


pyspark # launch the web-based notebook
Downloads/big-data-4/clustering.ipynb

[1] from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from notebooks import utils  ## utils.py is provided with the course materials for helper functions such as plotting
%matplotlib inline
[2] sqlContext = SQLContext(sc) ## create an SQLContext and load the minute weather data into a DataFrame
df = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/minute_weather.csv', format='com.databricks.spark.csv',header='true',inferSchema='true')
[3] df.count()
[4] filteredDF =df.filter((df.rowID % 10)==0) ## keep one row in ten to work with
filteredDF.count()
[5] filteredDF.describe().toPandas().transpose()  ## show summary statistics
[6] filteredDF.filter(filteredDF.rain_accumulation == 0.0).count()  ## count rows where rain_accumulation is 0
[7] filteredDF.filter(filteredDF.rain_duration == 0.0).count()
[8] workingDF=filteredDF.drop("rain_accumulation").drop("rain_duration").drop("hpwren_timestamp") ## drop the mostly-zero columns and the timestamp column
[9] before =workingDF.count()  ## drop rows containing NA values and count how many were removed
workingDF=workingDF.na.drop()
after= workingDF.count()
before - after
[10] workingDF.columns  ## list the DataFrame column names
[11] featuresUsed = ['air_pressure','air_temp','avg_wind_direction','avg_wind_speed','max_wind_direction','max_wind_speed','relative_humidity']
assembler=VectorAssembler(inputCols=featuresUsed,outputCol="features_unscaled")
assembled = assembler.transform(workingDF)  ## combine the selected feature columns into a single vector column
[12] scaler = StandardScaler(inputCol="features_unscaled",outputCol="features",withStd=True,withMean=True)  ## create a StandardScaler that standardizes each feature to zero mean and unit variance
scalerModel=scaler.fit(assembled)  ## fit the scaler to the data
scaledData =scalerModel.transform(assembled)  ## apply the scaler to produce the scaled features
[13] scaledData=scaledData.select("features","rowID")  ## keep only the features and rowID columns
elbowset = scaledData.filter((scaledData.rowID % 3) == 0).select("features")  ## take one row in three for the elbow analysis
elbowset.persist()  ## cache in memory for fast reuse
[14] clusters = range(2,31) 
wsseList = utils.elbow(elbowset,clusters)  ## run K-means on elbowset for each k in clusters (k = 2 to 30) and record the WSSE
# Training for cluster size 2 
# ......................WSSE = 114993.13181214455 
# Training for cluster size 3 
# ......................WSSE = 104181.0978581738 
# Training for cluster size 4 
# ......................WSSE = 94577.27151288437 
# Training for cluster size 5 
# ......................WSSE = 87993.46098415818 
# Training for cluster size 6 
# ......................WSSE = 85084.23922296544 
# Training for cluster size 7 
# ......................WSSE = 81664.9602448752
[15] utils.elbow_plot(wsseList,clusters)  ## plot WSSE against k to find the elbow; here the elbow is around k = 12

(figure: elbow_plot)
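utils.elbow is a helper shipped with the course materials and its source is not shown here. A minimal sketch of what such a helper might look like, assuming a Spark version where KMeansModel.computeCost is available:

from pyspark.ml.clustering import KMeans

def elbow_sketch(df, k_values):
    ## train K-means for each candidate k and collect the within-set sum of squared errors (WSSE)
    wsse_list = []
    for k in k_values:
        model = KMeans(k=k, seed=1).fit(df)   ## df must contain a "features" vector column
        wsse_list.append(model.computeCost(df))
    return wsse_list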

[16] scaledDataFeat =scaledData.select("features")  ## keep only the features column
scaledDataFeat.persist()  ## cache in memory for fast reuse
[17] kmeans=KMeans(k=12,seed=1)  ## create a KMeans estimator with k = 12
model=kmeans.fit(scaledDataFeat)  ## fit the model to the data
transformed =model.transform(scaledDataFeat)  ## assign each row to a cluster
[18] centers =model.clusterCenters()  ## get the final 12 cluster centers
centers  ## display them
[19] P=utils.pd_centers(featuresUsed,centers)  ## build a Pandas DataFrame holding the feature names and cluster centers
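utils.pd_centers is another course helper; a plausible minimal version simply wraps the center vectors in a Pandas DataFrame (the "prediction" column name is an assumption):

import pandas as pd

def pd_centers_sketch(feature_names, centers):
    ## one row per cluster center, one column per feature, plus the cluster id
    centersDF = pd.DataFrame(centers, columns=feature_names)
    centersDF["prediction"] = range(len(centers))
    return centersDF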
[20] utils.parallel_plot(P[P["relative_humidity"] < -0.5],P) ## parallel-coordinates plot of the centers whose relative_humidity is below -0.5

(figure: parallel_plot1)

[21] utils.parallel_plot(P[P["air_temp"] < -0.5],P)
[22] utils.parallel_plot(P[(P["relative_humidity"] > -0.5) & (P["air_temp"] < -0.5)],P)
[23] utils.parallel_plot(P.iloc[[2]],P)

(figure: parallel_plot2)
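utils.parallel_plot presumably draws a parallel-coordinates chart of the selected centers; a rough sketch using matplotlib, assuming a pandas version where parallel_coordinates lives in pandas.plotting (function and column names are assumptions):

import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

def parallel_plot_sketch(data, full_df):
    ## draw one line per selected cluster center, coloured by its cluster id,
    ## with y-axis limits taken from the full set of centers so plots are comparable
    plt.figure(figsize=(15, 8))
    parallel_coordinates(data, "prediction", colormap="Set1")
    values = full_df.drop("prediction", axis=1)
    plt.ylim(values.min().min(), values.max().max())
    plt.show()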
