Xiaoluo uses PySpark, writing and submitting Spark jobs from Jupyter. The walkthrough below goes straight to the code.
Before starting, HDFS, Spark, and Jupyter must already be set up and running.
Start the Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("spark://192.168.48.100:7077")\
.appName("rdd_demos").getOrCreate()
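If no standalone cluster is listening at that address, the same demos can be tried in local mode; a minimal sketch, assuming only a local PySpark installation:
# Local-mode alternative (assumption: no cluster is available);
# "local[*]" runs Spark inside this Python process on all local cores.
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]")\
        .appName("rdd_demos").getOrCreate()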
Create RDDs
# Create an RDD by parallelizing an in-memory collection
arr1 = [1,2,3,4,5,6,7,8,9,10]
rdd1 = spark.sparkContext.parallelize(arr1)
# Create an RDD by loading an external file
file = "/spark_demo/wordcount/input/study.txt"
rdd3 = spark.sparkContext.textFile(file)
Transformation operations on RDDs
# Check how many partitions the current RDD has
rdd1.getNumPartitions()
# Transformations are lazy: think of the RDD as elements streaming through the pipeline
data = spark.sparkContext.parallelize([1, 2, 3, 3])
# —— map (custom rule): one-to-one, one output element per input element
data_rdd1 = data.map(lambda x: x + 1)
# —— flatMap: apply the rule first, then flatten the results; one-to-many
data_rdd2 = data.flatMap(lambda x: range(x, 4))
# —— filter: keep only the elements satisfying the predicate
data_rdd3 = data.filter(lambda x: x != 1)
# —— distinct: remove duplicate elements
data_rdd4 = data.distinct()
# —— sample: random sampling, here without replacement at fraction 0.5
data_rdd5 = data.sample(False, 0.5)
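To make the one-to-one and one-to-many behaviour concrete, here is what each transformation should return for data = [1, 2, 3, 3] (expected outputs as comments; distinct and sample are non-deterministic in order or content):
data_rdd1.collect()  # [2, 3, 4, 4]            one output per input
data_rdd2.collect()  # [1, 2, 3, 2, 3, 3, 3]   range(x, 4) per element, flattened
data_rdd3.collect()  # [2, 3, 3]               the 1 is dropped
data_rdd4.collect()  # [1, 2, 3] in some order, duplicates removed
data_rdd5.collect()  # a random ~50% subset, different on every run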
Set operations on RDDs
# Suppose we have two RDDs, containing {1,2,3,3} and {3,4,5} respectively
# First build the two RDDs
data1 = spark.sparkContext.parallelize([1,2,3,3])
data2 = spark.sparkContext.parallelize([3,4,5])
# union: all elements of both RDDs (duplicates are kept)
data1.union(data2).collect()
# intersection: elements common to both RDDs
data1.intersection(data2).collect()
# subtract: elements in data1 but not in data2
data1.subtract(data2).collect()
# cartesian: the Cartesian product of the two RDDs
data1.cartesian(data2).collect()
# groupBy: group elements by a user-defined key function
a = spark.sparkContext.parallelize(["black", "blue", "white", "green", "grey"])
b = a.groupBy(lambda x: len(x)).collect()
print(b)
# groupBy values are iterables; sort them for a stable, readable result
sorted([(x, sorted(y)) for (x, y) in b])
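For reference, grouping these five colour names by length should give the following once sorted:
# Expected value of the sorted expression:
# [(4, ['blue', 'grey']), (5, ['black', 'green', 'white'])]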
Action operations on RDDs
rdd = spark.sparkContext.parallelize([1, 2, 3, 3])  # a small sample RDD for the actions below
rdd.count()                           # number of elements
rdd.collect()                         # fetch all elements to the driver
rdd.first()                           # the first element
rdd.countByValue()                    # occurrence count of each value
rdd.take(2)                           # the first 2 elements
rdd.takeOrdered(2)                    # the 2 smallest elements
rdd.takeOrdered(2, key=lambda x: -x)  # the 2 largest elements
rdd.takeSample(False, 2)              # 2 random elements, without replacement
rdd.reduce(lambda x, y: x + y)        # fold the elements together: here, their sum
rdd.getNumPartitions()                # number of partitions
A more complex operator: aggregate
seqOp = lambda x, y: x * y   # function applied inside each partition
combOp = lambda x, y: x + y  # function that merges the per-partition results
# The first argument (2) is the zero value: the initial accumulator for every partition and for the final merge
result = rdd.aggregate(2, seqOp, combOp)
result
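Because both the zero value and the partitioning influence the result, it is worth tracing a small case by hand. A sketch with a hypothetical two-partition RDD, following PySpark's semantics (the zero value seeds each partition and the final merge):
demo = spark.sparkContext.parallelize([1, 2, 3, 4], 2)  # partitions: [1, 2] and [3, 4]
# seqOp per partition, seeded with 2:   2*1*2 = 4   and   2*3*4 = 24
# combOp over the results, seeded with 2:   2 + 4 + 24 = 30
demo.aggregate(2, seqOp, combOp)  # 30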
Create Pair RDDs
# There are several ways to create a Pair RDD.
# Option 1: load from a file, then transform it into a Pair RDD.
file = "/spark_demo/wordcount/input/study.txt"
lines = spark.sparkContext.textFile(file)
pairRDD = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word,1))
pairRDD.collect()
# Option 2: create a Pair RDD from a parallelized collection
rdd = spark.sparkContext.parallelize(["Hadoop","Spark","Hive","Spark"])
pairRDD = rdd.map(lambda word: (word,1))
pairRDD.collect()
# keyBy(): derive each element's key with a custom function
a = spark.sparkContext.parallelize(["black", "blue", "white", "green", "grey"])
# Applies the given function to each element to build (key, element) tuples, returning a pair RDD
b = a.keyBy(lambda x:len(x))
b.collect()
# Alternatively, create a pair RDD directly from a list of tuples
pets = spark.sparkContext.parallelize([("cat",1),("dog",1),("cat",2)])
pets.collect()
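As a sanity check, the constructions from parallelized collections should yield pairs like the following (the file-based pairRDD depends on the contents of study.txt):
# rdd.map(...)    -> [('Hadoop', 1), ('Spark', 1), ('Hive', 1), ('Spark', 1)]
# a.keyBy(len)    -> [(5, 'black'), (4, 'blue'), (5, 'white'), (5, 'green'), (4, 'grey')]
# pets.collect()  -> [('cat', 1), ('dog', 1), ('cat', 2)]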
Pair RDD transformation operations
# reduceByKey(func): merge the values of each key with func (values sharing a key are combined)
pairRDD.reduceByKey(lambda x,y: x + y).collect()
# groupByKey(): group the values by key
pairRDD.groupByKey().collect()
# keys(): return all keys
pairRDD.keys().collect()
# values(): return all values
pairRDD.values().collect()
# sortByKey(): sort by key, ascending by default
pairRDD.sortByKey(ascending=False).collect()
# mapValues(func): apply func to every value in the pair RDD, leaving the keys unchanged
pairRDD.mapValues(lambda x: x*x).collect()
# flatMapValues(func): map each value to an iterable, then flatten, pairing every result with the original key
pairRDD.flatMapValues(lambda x: range(x,6)).collect()
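Taking the parallelized pairRDD from above, [('Hadoop', 1), ('Spark', 1), ('Hive', 1), ('Spark', 1)], the expected results are roughly (key order can vary with partitioning):
# reduceByKey(add)            -> [('Hadoop', 1), ('Spark', 2), ('Hive', 1)]
# groupByKey()                -> one iterable of 1s per distinct word
# sortByKey(ascending=False)  -> the 'Spark' pairs first, then 'Hive', then 'Hadoop'
# mapValues(x*x)              -> every value stays 1 (1 squared), keys untouched
# flatMapValues(range(x, 6))  -> each pair expands to values 1, 2, 3, 4, 5 under the same key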
A more complex operator: combineByKey()
# combineByKey(createCombiner, mergeValue, mergeCombiners): aggregate the values of each key with three functions
data = spark.sparkContext.parallelize([("company-1",92),("company-1",85),("company-1",82),
("company-1",93),("company-1",86),("company-1",83),
("company-2",78),("company-2",96),("company-2",85),
("company-3",88),("company-3",94),("company-3",80)],3)
cbk = data.combineByKey(
    lambda income: (income, 1),                    # createCombiner: a key's first value in a partition becomes (sum, count)
    lambda t, income: (t[0] + income, t[1] + 1),   # mergeValue: fold another value from the same partition into (sum, count)
    lambda t1, t2: (t1[0] + t2[0], t1[1] + t2[1])  # mergeCombiners: merge (sum, count) pairs for the same key across partitions
)
cbk.collect()  # (company, (total income, number of records)) per company
# Total and average income per company
cbk.map(lambda t: (t[0], t[1][0], t[1][0]/float(t[1][1]))).collect()
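With the twelve records above, the expected numbers are (key order may vary):
# cbk.collect() -> [('company-1', (521, 6)), ('company-2', (259, 3)), ('company-3', (262, 3))]
# averages      -> [('company-1', 521, 86.83...), ('company-2', 259, 86.33...), ('company-3', 262, 87.33...)]
Unlike groupByKey, combineByKey never materialises the full list of values per key, only the running (sum, count) accumulators, which keeps the shuffle small.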
# groupByKey
x = spark.sparkContext.parallelize([
("USA", 1), ("USA", 2), ("India", 1),
("UK", 1), ("India", 4), ("India", 9),
("USA", 8), ("USA", 3), ("India", 4),
("UK", 6), ("UK", 9), ("UK", 5)], 4)
# groupByKey with the default partitioning
y = x.groupByKey()
# Check the number of partitions
print('number of partitions: ', y.getNumPartitions())
# groupByKey with an explicit number of partitions
y = x.groupByKey(2)
print('number of partitions: ', y.getNumPartitions())
# Print the results
for t in y.collect():
    print(t[0], [v for v in t[1]])
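For this dataset the output should contain the following groups (key order depends on the partitioner, value order on partition order):
# USA [1, 2, 8, 3]
# India [1, 4, 9, 4]
# UK [1, 6, 9, 5]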
A more complex operator: aggregateByKey
# Build the Pair RDD student_rdd from key-value records
student_rdd = spark.sparkContext.parallelize([
("Joseph", "Maths", 83), ("Joseph", "Physics", 74), ("Joseph", "Chemistry", 91),
("Joseph", "Biology", 82), ("Jimmy", "Maths", 69), ("Jimmy", "Physics", 62),
("Jimmy", "Chemistry", 97), ("Jimmy", "Biology", 80), ("Tina", "Maths", 78),
("Tina", "Physics", 73), ("Tina", "Chemistry", 68), ("Tina", "Biology", 87),
("Thomas", "Maths", 87), ("Thomas", "Physics", 93), ("Thomas", "Chemistry", 91),
("Thomas", "Biology", 74), ("Cory", "Maths", 56), ("Cory", "Physics", 65),
("Cory", "Chemistry", 71), ("Cory", "Biology", 68), ("Jackeline", "Maths", 86),
("Jackeline", "Physics", 62), ("Jackeline", "Chemistry", 75), ("Jackeline", "Biology", 83),
("Juan", "Maths", 63), ("Juan", "Physics", 69), ("Juan", "Chemistry", 64),
("Juan", "Biology", 60)], 2)
# Define the sequence and combiner operations
# Sequence operation: find the highest score within a single partition
def seq_op(accumulator, element):
    # element is a (subject, score) tuple
    if accumulator > element[1]:
        return accumulator
    else:
        return element[1]
# Combiner operation: find the highest score across the partition accumulators
def comb_op(accumulator1, accumulator2):
    if accumulator1 > accumulator2:
        return accumulator1
    else:
        return accumulator2
# The zero value is 0 here, since we are looking for the maximum score and all scores are positive
zero_val = 0
aggr_rdd = student_rdd.map(lambda t: (t[0], (t[1], t[2]))).aggregateByKey(zero_val, seq_op, comb_op)
# Inspect the output
for tpl in aggr_rdd.collect():
    print(tpl)
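Each student should map to their highest score across the two partitions (tuple order may vary):
# ('Joseph', 91), ('Jimmy', 97), ('Tina', 87), ('Thomas', 93),
# ('Cory', 71), ('Jackeline', 86), ('Juan', 69)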
Stop the Spark session, ending the Spark job
spark.stop()
Simple statistical methods on RDDs
# Note: these calls act on rdd1 and need a live session, so run them before spark.stop() (or create a new session first)
rdd1.sum()
rdd1.max()
rdd1.min()
# arithmetic mean
rdd1.mean()
rdd1.count()
# population variance
rdd1.variance()
# sample variance (divides by n-1)
rdd1.sampleVariance()
# population standard deviation
rdd1.stdev()
# sample standard deviation
rdd1.sampleStdev()
# Histograms
# Method 1: supply the bucket boundaries explicitly
# rdd1.histogram([1.0, 10.0, 20.9])
rdd1.histogram([1.0, 8.0, 20.9])
# Method 2: let Spark compute 3 evenly spaced buckets
rdd1.histogram(3)
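For rdd1 = [1..10] the two calls should return roughly:
# histogram([1.0, 8.0, 20.9]) -> ([1.0, 8.0, 20.9], [7, 3])   7 values fall in [1, 8), 3 in [8, 20.9]
# histogram(3)                -> ([1, 4, 7, 10], [3, 3, 4])   three evenly spaced buckets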
# stats() returns a StatsCounter object with all of these statistics computed in a single pass
status = rdd1.stats()
print(status.count())
print(status.mean())
print(status.stdev())
print(status.max())
print(status.min())
print(status.sum())
print(status.variance())
print(status)
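For rdd1 = [1..10] the StatsCounter should report values close to:
# count 10, mean 5.5, sum 55.0, max 10.0, min 1.0
# variance 8.25, stdev ~2.872, sample variance ~9.167, sample stdev ~3.028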
spark.stop()