# encoding: utf-8
import os
from pyspark import SparkConf, SparkContext
# Local import: used only to register the SparkContext shutdown hook below.
import atexit

# Tell Spark which interpreter the Python workers must use (Python 3).
# Set before the SparkContext is created so that the setting is picked up.
os.environ["PYSPARK_PYTHON"] = "/usr/local/bin/python3"
# Initialise the Spark context with the default configuration.
sc = SparkContext()
# The script never stops the context explicitly, so release it when the
# interpreter exits -- this also covers the case where a later statement
# raises. stop() can safely be called again by other code.
atexit.register(sc.stop)
# Create RDDs from local Python lists.
intRDD = sc.parallelize([1, 2, 3, 4, 5, 1])
stringRDD = sc.parallelize(['a', 'b', 'c', 'd', 'e'])
# collect() brings an RDD back to the driver as a plain Python list.
print(intRDD.collect())  # [1, 2, 3, 4, 5, 1]
print(stringRDD.collect())  # ['a', 'b', 'c', 'd', 'e']
# map: apply a function to every element, producing a new RDD.
print(intRDD.map(lambda x: x + 1).collect())  # [2, 3, 4, 5, 6, 2]
# filter: keep only the elements that satisfy the predicate, producing a new RDD.
print(intRDD.filter(lambda x: x < 3).collect())  # [1, 2, 1]
print(stringRDD.filter(lambda y: 'a' in y).collect())  # ['a']
# distinct: drop duplicate values. The operation shuffles the data, so the
# order of the collected elements depends on how the RDD is partitioned;
# sort on the driver so the printed list is stable.
print(sorted(intRDD.distinct().collect()))  # [1, 2, 3, 4, 5]
# randomSplit: split the RDD into several RDDs according to the given weights.
# No seed is passed, so the content of each split changes from run to run and
# the split sizes only approximate the 0.2 / 0.8 weights.
sRDD = intRDD.randomSplit([0.2, 0.8])
print(len(sRDD))  # 2
print(sRDD[0].collect())  # e.g. [1]
print(sRDD[1].collect())  # e.g. [1, 2, 3, 4, 5]
# groupBy: group the elements by the key computed by the given function.
# Both the groups and their members are sorted so the output is deterministic.
result = intRDD.groupBy(lambda x: x % 2).collect()
print(sorted((key, sorted(members)) for key, members in result))  # [(0, [2, 4]), (1, [1, 1, 3, 5])]
# Operations that combine several RDDs.
intRDD1 = sc.parallelize([1, 2, 3, 4, 5])
intRDD2 = sc.parallelize([5, 6, 7, 8])
intRDD3 = sc.parallelize([9, 10])
# union: concatenate the RDDs (duplicates are kept).
print(intRDD1.union(intRDD2).union(intRDD3).collect())  # [1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10]
# intersection: elements present in both RDDs.
print(intRDD1.intersection(intRDD2).collect())  # [5]
# subtract: elements of intRDD1 that are not in intRDD2. The operation
# shuffles the data, so the result is sorted on the driver to get a stable,
# partition-independent output.
print(sorted(intRDD1.subtract(intRDD2).collect()))  # [1, 2, 3, 4]
# cartesian: every (a, b) pair with a from intRDD1 and b from intRDD2.
# The order of the pairs depends on how both inputs are partitioned, so the
# result is sorted before printing.
print(sorted(intRDD1.cartesian(intRDD2).collect()))  # [(1, 5), (1, 6), (1, 7), (1, 8), (2, 5), (2, 6), (2, 7), (2, 8), (3, 5), (3, 6), (3, 7), (3, 8), (4, 5), (4, 6), (4, 7), (4, 8), (5, 5), (5, 6), (5, 7), (5, 8)]
# Actions that read elements back to the driver.
# first(): the first element.
print(intRDD.first())  # 1
# take(n): the first n elements (here n = 2).
print(intRDD.take(num=2))  # [1, 2]
# takeOrdered(n): the n smallest elements, in ascending order.
print(intRDD.takeOrdered(num=3))  # [1, 1, 2]
# takeOrdered(n, key): negating the value as sort key yields the n largest
# elements, in descending order.
print(intRDD.takeOrdered(3, key=lambda value: -value))  # [5, 4, 3]
# Statistics actions.
# stats(): summary holding count, mean, stdev, max and min.
print(intRDD.stats())  # (count: 6, mean: 2.6666666666666665, stdev: 1.4907119849998598, max: 5.0, min: 1.0)
# The individual statistics, each printed on its own line in this order.
summary_actions = (
    intRDD.min,    # minimum            -> 1
    intRDD.max,    # maximum            -> 5
    intRDD.stdev,  # standard deviation -> 1.4907119849998598
    intRDD.count,  # number of elements -> 6
    intRDD.sum,    # sum                -> 16
    intRDD.mean,   # mean               -> 2.6666666666666665
)
for action in summary_actions:
    print(action())
# Basic transformations on key-value (pair) RDDs.
# Build a pair RDD from a list of (key, value) tuples.
kvRDD1 = sc.parallelize([(1, 2), (5, 6), (7, 8), (3, 4), (3, 6)])
# keys() and values() extract the keys and the values respectively.
print(kvRDD1.collect())  # [(1, 2), (5, 6), (7, 8), (3, 4), (3, 6)]
print(kvRDD1.keys().collect())  # [1, 5, 7, 3, 3]
print(kvRDD1.values().collect())  # [2, 6, 8, 4, 6]
# filter works on pairs too: x[0] is the key, x[1] is the value.
print(kvRDD1.filter(lambda x: x[0] > 5).collect())  # [(7, 8)]
print(kvRDD1.filter(lambda x: x[1] < 5).collect())  # [(1, 2), (3, 4)]
# mapValues: transform the value of every pair, leaving the key untouched.
print(kvRDD1.mapValues(lambda x: x**2).collect())  # [(1, 4), (5, 36), (7, 64), (3, 16), (3, 36)]
# sortByKey: sort by key; ascending by default (or True), descending with False.
print(kvRDD1.sortByKey().collect())  # [(1, 2), (3, 4), (3, 6), (5, 6), (7, 8)]
print(kvRDD1.sortByKey(True).collect())  # [(1, 2), (3, 4), (3, 6), (5, 6), (7, 8)]
print(kvRDD1.sortByKey(False).collect())  # [(7, 8), (5, 6), (3, 4), (3, 6), (1, 2)]
# reduceByKey: merge the values that share a key with the given function.
# The operation shuffles the data, so the order of the collected pairs
# depends on partitioning; sort on the driver to get a stable output
# (keys are unique after the reduction, so plain tuple ordering is enough).
print(sorted(kvRDD1.reduceByKey(lambda x, y: x + y).collect()))  # [(1, 2), (3, 10), (5, 6), (7, 8)]
# Transformations that combine several key-value RDDs.
kvRDD2 = sc.parallelize([(3, 4), (5, 6), (6, 7), (1, 2)])
kvRDD3 = sc.parallelize([(3, 8), (7, 8)])
# The operations below shuffle the data, so the order of the collected pairs
# depends on partitioning. Multi-element results are therefore sorted on the
# driver, by key only so that None values are never compared with integers.
# join: inner join, keeps only the keys present in both RDDs.
print(kvRDD2.join(kvRDD3).collect())  # [(3, (4, 8))]
# leftOuterJoin: keeps every key of kvRDD2; the right-hand value is None
# when kvRDD3 has no matching key.
print(sorted(kvRDD2.leftOuterJoin(kvRDD3).collect(), key=lambda pair: pair[0]))  # [(1, (2, None)), (3, (4, 8)), (5, (6, None)), (6, (7, None))]
# rightOuterJoin: keeps every key of kvRDD3; the left-hand value is None
# when kvRDD2 has no matching key.
print(sorted(kvRDD2.rightOuterJoin(kvRDD3).collect(), key=lambda pair: pair[0]))  # [(3, (4, 8)), (7, (None, 8))]
# subtractByKey: drops from kvRDD2 every pair whose key also appears in kvRDD3.
print(sorted(kvRDD2.subtractByKey(kvRDD3).collect(), key=lambda pair: pair[0]))  # [(1, 2), (5, 6), (6, 7)]
# Actions on key-value RDDs.
# first(): the first (key, value) pair. Fetched once and reused below rather
# than running the same Spark job again for the key and for the value.
first_pair = kvRDD2.first()
print(first_pair)  # (3, 4)
# take(n): the first n pairs (here n = 2).
print(kvRDD2.take(2))  # [(3, 4), (5, 6)]
# Key and value of the first pair.
first_key, first_value = first_pair
print(first_key)  # 3
print(first_value)  # 4
# countByKey: number of pairs per key.
print(kvRDD2.countByKey())  # defaultdict(<class 'int'>, {3: 1, 5: 1, 6: 1, 1: 1})
# lookup: list of all values stored under the given key.
print(kvRDD2.lookup(3))  # [4]