Coursera Big Data Series Course Notes 2

Hadoop operations on the Cloudera virtual machine

week18 Spark data preparation for ML (cleaning data for machine learning)


pyspark #launch the notebook in the browser
Downloads/big-data-4/handling-missing-values.ipynb

[1] from pyspark.sql import SQLContext ##load SQLContext
sqlContext=SQLContext(sc)     ##create an SQLContext
df=sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv',format='com.databricks.spark.csv',header='true',inferSchema='true') ##read the weather data into a DataFrame
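The com.databricks.spark.csv format matches the Spark 1.x bundled with the Cloudera VM; on Spark 2.0+ CSV support is built in, so an equivalent read (an adaptation, not from the course notebook) would be:

df = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv',
                          format='csv', header='true', inferSchema='true')  ##built-in csv data source on Spark 2.0+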
[2] df.describe().toPandas().transpose() ##show summary statistics for the DataFrame
[3] df.describe('air_temp_9am').show()  ##show summary statistics for one column
[4] df.count() ##show the number of rows
[5] removeALLDF=df.na.drop() ##drop rows with missing values
[6] removeALLDF.describe('air_temp_9am').show()
[7] removeALLDF.count()
[8] from pyspark.sql.functions import avg
imputeDF=df
[9] for x in imputeDF.columns:
        meanValue=removeALLDF.agg(avg(x)).first()[0]  ##compute the mean of each column
        print(x,meanValue)  
        imputeDF=imputeDF.na.fill(meanValue,[x])  ##replace missing values with the column mean
[10] df.describe('air_temp_9am').show()
imputeDF.describe('air_temp_9am').show()
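As a quick check before imputation (not part of the course notebook), the number of missing values in each column can be counted directly on the original DataFrame:

from pyspark.sql.functions import col, sum as sql_sum
##for each column, cast the isNull flag to int and sum it, giving the count of missing values
df.select([sql_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()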

week19 Spark Classification Algorithms (decision tree classification)


First set the number of processors for the virtual machine image to 2
pyspark #launch the notebook in the browser
Downloads/big-data-4/classification.ipynb

[1] from pyspark.sql import SQLContext ##import the required packages
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
[2] sqlContext = SQLContext(sc) ##create an SQLContext object
df = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv', format='com.databricks.spark.csv', header='true',inferSchema='true')
df.columns
[3] featureColumns = ['air_pressure_9am','air_temp_9am','avg_wind_direction_9am','avg_wind_speed_9am','max_wind_direction_9am','max_wind_speed_9am','rain_accumulation_9am','rain_duration_9am']  ##create an array of the feature column names
[4] df=df.drop("number") ##drop the number column
[5] df=df.na.drop() ##drop rows containing NA values
[6] df.count(),len(df.columns) ##count the rows and columns
[7] binarizer =Binarizer(threshold=24.99999,inputCol="relative_humidity_3pm",outputCol="label") 
binarizedDF=binarizer.transform(df) ##create a Binarizer and add a binary label column to df
[8] binarizedDF.select("relative_humidity_3pm","label").show(4) ##show the selected columns
[9] assembler=VectorAssembler(inputCols=featureColumns,outputCol="features") 
assembled=assembler.transform(binarizedDF) ##create a VectorAssembler that packs the feature columns into a single features vector (the model inputs)
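To see what the VectorAssembler produced (a quick check, not in the course notebook), inspect the new features column; each row now carries all the feature values packed into one vector:

assembled.select("features").first()  ##first row: a vector with one entry per feature column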
[10] (trainingData,testData)=assembled.randomSplit([0.8,0.2],seed=13234) ##randomly split into training (80%) and test (20%) data
[11] trainingData.count(),testData.count() ##count the training and test rows
[12] dt=DecisionTreeClassifier(labelCol="label",featuresCol="features",maxDepth=5,minInstancesPerNode=20,impurity="gini") ##create a decision tree classifier
[13] pipeline=Pipeline(stages=[dt]) ##create a Pipeline and fit the model on the training data
model=pipeline.fit(trainingData)
[14] predictions=model.transform(testData) ##generate predictions on the test data
[15] predictions.select("prediction","label").show(10) 
[16] predictions.select("prediction","label").write.save(path="file:///home/cloudera/Downloads/big-data-4/predictions.csv",  format="com.databricks.spark.csv",header="true") ##save the results
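Before the formal evaluation in week20, a rough accuracy check can be run directly on the predictions DataFrame (a sketch, not part of the course notebook):

##fraction of test rows where the predicted label matches the true label
correct = predictions.filter(predictions.prediction == predictions.label).count()
print(correct / float(predictions.count()))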

week20 Spark Evaluation of decision tree (decision tree model evaluation)


pyspark #launch the notebook in the browser
Downloads/big-data-4/model-evaluation.ipynb

[1] from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
[2] sqlContext = SQLContext(sc) ##create an SQLContext object and read the saved predictions into a DataFrame
predictions = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/predictions.csv', format='com.databricks.spark.csv', header='true',inferSchema='true')
[3] evaluator = MulticlassClassificationEvaluator(
    labelCol="label",predictionCol="prediction",metricName="precision")  ##create an evaluator
[4] accuracy=evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy)) ##print the evaluation result: accuracy
[5] predictions.rdd.take(2) ##look at the underlying RDD of the predictions DataFrame
[6] predictions.rdd.map(tuple).take(2) ##convert the RDD rows to tuples
[7] metrics = MulticlassMetrics(predictions.rdd.map(tuple)) ##build a MulticlassMetrics object from the (prediction, label) tuples
[8] metrics.confusionMatrix().toArray().transpose() ##convert the confusion matrix to a NumPy array (transposed)
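MulticlassMetrics can also report per-class figures beyond the confusion matrix; a short sketch under the same setup (the labels here are 0.0 and 1.0):

##per-class precision and recall from the same metrics object
for lbl in [0.0, 1.0]:
    print(lbl, metrics.precision(lbl), metrics.recall(lbl))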

week21 Spark Cluster Analysis (K-means clustering)


pyspark #launch the notebook in the browser
Downloads/big-data-4/clustering.ipynb

[1] from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from notebooks import utils  ##a utils.py module provided with the course materials for tasks such as plotting
%matplotlib inline
[2] sqlContext = SQLContext(sc) ##create an SQLContext object
df = sqlContext.read.load('file:///home/cloudera/Downloads/big-data-4/minute_weather.csv', format='com.databricks.spark.csv',header='true',inferSchema='true')
[3] df.count()
[4] filteredDF =df.filter((df.rowID % 10)==0) ##keep one tenth of the rows to work with
filteredDF.count()
[5] filteredDF.describe().toPandas().transpose()  ##show basic summary statistics
[6] filteredDF.filter(filteredDF.rain_accumulation == 0.0).count()  ##count rows where rain_accumulation is 0
[7] filteredDF.filter(filteredDF.rain_duration == 0.0).count()
[8] workingDF=filteredDF.drop("rain_accumulation").drop("rain_duration").drop("hpwren_timestamp") ##drop these columns
[9] before =workingDF.count()  ##drop rows containing NA values and count how many were removed
workingDF=workingDF.na.drop()
after= workingDF.count()
before - after
[10] workingDF.columns  ##show the column names of the DataFrame
[11] featuresUsed = ['air_pressure','air_temp','avg_wind_direction','avg_wind_speed','max_wind_direction','max_wind_speed','relative_humidity']
assembler=VectorAssembler(inputCols=featuresUsed,outputCol="features_unscaled")
assembled = assembler.transform(workingDF)  ##add a column that packs the listed feature columns into a single vector per row
[12] scaler = StandardScaler(inputCol="features_unscaled",outputCol="features",withStd=True,withMean=True)  ##create a StandardScaler with its parameters
scalerModel=scaler.fit(assembled)  ##fit the scaler to the data
scaledData =scalerModel.transform(assembled)  ##transform the data with the fitted scaler
[13] scaledData=scaledData.select("features","rowID")  ##keep just these two columns
elbowset = scaledData.filter((scaledData.rowID % 3) == 0).select("features")  ##keep one third of the data
elbowset.persist()  ##cache the data in memory for fast reuse
[14] clusters = range(2,31) 
wsseList = utils.elbow(elbowset,clusters)  ##run K-means on elbowset for every K supplied by clusters, here K values from 2 to 30
# Training for cluster size 2 
# ......................WSSE = 114993.13181214455 
# Training for cluster size 3 
# ......................WSSE = 104181.0978581738 
# Training for cluster size 4 
# ......................WSSE = 94577.27151288437 
# Training for cluster size 5 
# ......................WSSE = 87993.46098415818 
# Training for cluster size 6 
# ......................WSSE = 85084.23922296544 
# Training for cluster size 7 
# ......................WSSE = 81664.9602448752
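The utils.elbow helper ships with the course materials and its source is not reproduced here; it is roughly equivalent to fitting a KMeans model for each candidate K and recording the within-set sum of squared errors, along these lines (a sketch assuming KMeansModel.computeCost is available, i.e. Spark 2.0+, not the helper's actual code):

wsseList = []
for k in clusters:
    kmeans = KMeans(k=k, seed=1)                    ##one model per candidate cluster count
    model = kmeans.fit(elbowset)
    wsseList.append(model.computeCost(elbowset))    ##WSSE for this K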
[15] utils.elbow_plot(wsseList,clusters)  ##plot WSSE against each K as an ordered scatter plot, used to pick the elbow point, here around K = 12

[figure: elbow_plot]

[16] scaledDataFeat =scaledData.select("features")  ##keep the features column
scaledDataFeat.persist()  ##cache the data in memory for fast reuse
[17] kmeans=KMeans(k=12,seed=1)  ##create a KMeans model with its parameters
model=kmeans.fit(scaledDataFeat)  ##fit the model to the data
transformed =model.transform(scaledDataFeat)  ##run the data through the model to assign each row to a cluster
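A quick way to see how many points land in each of the 12 clusters (not in the course notebook) is to group the transformed DataFrame by its prediction column:

transformed.groupBy("prediction").count().orderBy("prediction").show(12)  ##row count per cluster id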
[18] centers =model.clusterCenters()  ##get the final 12 cluster centers
centers  ##display them
[19] P=utils.pd_centers(featuresUsed,centers)  ##build a Pandas DataFrame holding the feature names and cluster centers for plotting with matplotlib
[20] utils.parallel_plot(P[P["relative_humidity"] < -0.5],P) ##parallel-coordinates plot of the centers whose relative_humidity is below -0.5

[figure: parallel_plot1]

[21] utils.parallel_plot(P[P["air_temp"] < -0.5],P)
[22] utils.parallel_plot(P[(P["relative_humidity"] > -0.5) & (P["air_temp"] < -0.5)],P)
[23] utils.parallel_plot(P.iloc[[2]],P)

[figure: parallel_plot2]
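utils.pd_centers and utils.parallel_plot also come from the course materials; pd_centers roughly builds a pandas DataFrame from the cluster centers with the feature names as columns, something like the sketch below (an approximation using a hypothetical P_sketch name, not the helper's actual source):

import pandas as pd
##one row per cluster center, one column per feature, plus the cluster id used to label the plots
P_sketch = pd.DataFrame(centers, columns=featuresUsed)
P_sketch["prediction"] = list(range(len(centers)))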
