hadoop Cloudera virtual machine 操作
week18 spark data preparation for ML機器學習數據清洗
pyspark #啓動網頁窗口
Downloads/big-data-4/handling-missing-values.ipynb
[1] from pyspark.sql import SQLContext ##加載SQLContext
sqlContext=SQLContext(sc) ##創建一個SQLContext
df=sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv',format='com.databricks.spark.csv',header='true',inferSchema='true') ##讀取天氣數據到數據框中
[2] df.describe().toPandas().transpose() ##顯示數據框描述性統計信息
[3] df.describe('air_temp_9am').show() ##顯示數據框某列描述性統計信息
[4] df.count() ##顯示行數
[5] removeALLDF=df.na.drop() ##去除字段缺失數據行
[6] removeALLDF.describe('air_temp_9am').show()
[7] removeALLDF.count()
[8] from pyspark.sql.functions import avg
imputeDF=df
[9] for x in imputeDF.columns:
    meanValue=removeALLDF.agg(avg(x)).first()[0] ##對每列計算平均值
    print(x,meanValue)
    imputeDF=imputeDF.na.fill(meanValue,[x]) ##用每列的平均值替代缺失值
[10] df.describe('air_temp_9am').show()
imputeDF.describe('air_temp_9am').show()
week19 spark Classification Algorithms決策樹分類算法
先設置虛擬鏡像處理器數量爲2
pyspark #啓動網頁窗口
Downloads/big-data-4/classification.ipynb
[1] from pyspark.sql import SQLContext ##導入需要的包
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
[2] sqlContext = SQLContext(sc) ##創建一個SQLContext對象
df = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv', format='com.databricks.spark.csv', header='true',inferSchema='true')
df.columns
[3] featureColumns = ['air_pressure_9am','air_temp_9am','avg_wind_direction_9am','avg_wind_speed_9am','max_wind_direction_9am','max_wind_speed_9am','rain_accumulation_9am','rain_duration_9am'] ##創建條件列頭數組
[4] df=df.drop("number") ##去掉number列
[5] df=df.na.drop() ##去掉含NA值的行
[6] df.count(),len(df.columns) ##計算行數和列數
[7] binarizer =Binarizer(threshold=24.99999,inputCol="relative_humidity_3pm",outputCol="label")
binarizedDF=binarizer.transform(df) ##創建一個二元列函數,並對df創建一個二元列
[8] binarizedDF.select("relative_humidity_3pm","label").show(4) ##顯示指定列
[9] assembler=VectorAssembler(inputCols=featureColumns,outputCol="features")
assembled=assembler.transform(binarizedDF) ##創建一個VectorAssembler,將featureColumns指定的各列合併爲一個向量列features,作爲模型的輸入特徵
[10] (trainingData,testData)=assembled.randomSplit([0.8,0.2],seed=13234) ##按一定條件分爲訓練數據和測試數據
[11] trainingData.count(),testData.count() ##分別計算訓練和測試數據量
[12] dt=DecisionTreeClassifier(labelCol="label",featuresCol="features",maxDepth=5,minInstancesPerNode=20,impurity="gini") ##創建一個決策樹類
[13] pipeline=Pipeline(stages=[dt]) ##創建一個Pipeline(流水線,此處只包含決策樹一個階段),並通過訓練數據擬合得到模型
model=pipeline.fit(trainingData)
[14] predictions=model.transform(testData) ##得到預測結果
[15] predictions.select("prediction","label").show(10)
[16] predictions.select("prediction","label").write.save(path="file:///home/cloudera/Downloads/big-data-4/predictions.csv", format="com.databricks.spark.csv",header="true") ##保存結果
week20 spark Evaluation of decision tree決策樹模型評估
pyspark #啓動網頁窗口
Downloads/big-data-4/model-evaluation.ipynb
[1] from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
[2] sqlContext = SQLContext(sc) ##創建一個SQLContext對象dataframe
predictions = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/predictions.csv', format='com.databricks.spark.csv', header='true',inferSchema='true')
[3] evaluator = MulticlassClassificationEvaluator(
labelCol="label",predictionCol="prediction",metricName="precision") ##創建一個多分類評估器(evaluator),評估指標爲precision
[4] accuracy=evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy)) ##獲取評估結果:準確率
[5] predictions.rdd.take(2) ##獲取評估矩陣的rdd值
[6] predictions.rdd.map(tuple).take(2) ##把rdd值轉換爲元組
[7] metrics = MulticlassMetrics(predictions.rdd.map(tuple)) ##將rdd元組轉換爲Spark Matrix矩陣
[8] metrics.confusionMatrix().toArray().transpose() ##將矩陣再轉換爲Python Numpy array
week21 spark Cluster Analysis聚類算法K-means聚類
pyspark #啓動網頁窗口
Downloads/big-data-4/clustering.ipynb
[1] from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from notebooks import utils ##教程中提供的一個utils.py包用於一些特定功能,如顯示圖示
%matplotlib inline
[2] sqlContext = SQLContext(sc) ##創建一個SQLContext對象dataframe
df = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/minute_weather.csv', format='com.databricks.spark.csv',header='true',inferSchema='true')
[3] df.count()
[4] filteredDF =df.filter((df.rowID % 10)==0) ##取十分之一的數據來操作
filteredDF.count()
[5] filteredDF.describe().toPandas().transpose() ##顯示數據的基本統計信息
[6] filteredDF.filter(filteredDF.rain_accumulation == 0.0).count() ##計算rain_accumulation爲0的行數
[7] filteredDF.filter(filteredDF.rain_duration == 0.0).count()
[8] workingDF=filteredDF.drop("rain_accumulation").drop("rain_duration").drop("hpwren_timestamp") ##去掉某幾列數
[9] before =workingDF.count() ##去含NA的行
workingDF=workingDF.na.drop()
after= workingDF.count()
before - after
[10] workingDF.columns ##顯示dataframe各列名稱
[11] featuresUsed = ['air_pressure','air_temp','avg_wind_direction','avg_wind_speed','max_wind_direction','max_wind_speed','relative_humidity']
assembler=VectorAssembler(inputCols=featuresUsed,outputCol="features_unscaled")
assembled = assembler.transform(workingDF) ##創建一列,合併指定各列的信息,即該列每個值都是一個數組
[12] scaler = StandardScaler(inputCol="features_unscaled",outputCol="features",withStd=True,withMean=True) ##創建一個標準化模型,設置參數
scalerModel=scaler.fit(assembled) ##使模型匹配數據對象
scaledData =scalerModel.transform(assembled) ##將數據對象通過模型進行轉換
[13] scaledData=scaledData.select("features","rowID") ##選擇某兩列數據
elbowset = scaledData.filter((scaledData.rowID % 3) == 0).select("features") ##選擇三分之一的數據
elbowset.persist() ##將數據保存在內存中,便於快速調用
[14] clusters = range(2,31)
wsseList = utils.elbow(elbowset,clusters) ##計算elbowset的K-MEANS結果,其中K值由clusters提供,此處range(2,31)分別提供了2到30的K值
# Training for cluster size 2
# ......................WSSE = 114993.13181214455
# Training for cluster size 3
# ......................WSSE = 104181.0978581738
# Training for cluster size 4
# ......................WSSE = 94577.27151288437
# Training for cluster size 5
# ......................WSSE = 87993.46098415818
# Training for cluster size 6
# ......................WSSE = 85084.23922296544
# Training for cluster size 7
# ......................WSSE = 81664.9602448752
[15] utils.elbow_plot(wsseList,clusters) ##顯示clusters的各k值下WSSE誤差水平的有序散點圖,用於判斷拐點(此處選取k=12)
[16] scaledDataFeat =scaledData.select("features") ##選擇features列
scaledDataFeat.persist() ##將數據保存在內存中,便於快速調用
[17] kmeans=KMeans(k=12,seed=1) ##創建一個KMeans模型,設定參數
model=kmeans.fit(scaledDataFeat) ##使模型匹配數據
transformed =model.transform(scaledDataFeat) ##將數據通過模型,生成結果
[18] centers =model.clusterCenters() ##獲取最終的12箇中心點
centers ##顯示
[19] P=utils.pd_centers(featuresUsed,centers) ##通過matplotlib,創建一個Pandas DataFrame,保存特徵名稱和中心點。
[20] utils.parallel_plot(P[P["relative_humidity"] < -0.5],P) ##顯示一個點圖,顯示relative_humidity小於-0.5的各中心點特徵分佈
[21] utils.parallel_plot(P[P["air_temp"] < -0.5],P)
[22] utils.parallel_plot(P[(P["relative_humidity"] > -0.5) & (P["air_temp"] < -0.5)],P)
[23] utils.parallel_plot(P.iloc[[2]],P)