Common PySpark DataFrame Functions (Creation, Query, Modification, Conversion)

import findspark
findspark.init()

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

from pyspark.sql.session import SparkSession
spark = SparkSession.builder.appName('EXAMPLE').getOrCreate()

For JSON

stringJSONRDD = sc.parallelize(("""
                                { "id": "123",
                                "name": "Katie",
                                "age": 19,
                                "eyeColor": "brown"
                                }""",
                                """{
                                "id": "234",
                                "name": "Michael",
                                "age": 22,
                                "eyeColor": "green"
                                }""",
                                """{
                                "id": "345",
                                "name": "Simone",
                                "age": 23,
                                "eyeColor": "blue"
                                }""")
                                )
# Create the DataFrame
swimmersJSON = spark.read.json(stringJSONRDD)
# Register a temporary view
swimmersJSON.createOrReplaceTempView("swimmersJSON")
# Display
swimmersJSON.show()
+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|    blue|345| Simone|
+---+--------+---+-------+
spark.sql("select * from swimmersJSON").collect()
[Row(age=19, eyeColor='brown', id='123', name='Katie'),
 Row(age=22, eyeColor='green', id='234', name='Michael'),
 Row(age=23, eyeColor='blue', id='345', name='Simone')]
swimmersJSON.show(1)
+---+--------+---+-----+
|age|eyeColor| id| name|
+---+--------+---+-----+
| 19|   brown|123|Katie|
+---+--------+---+-----+
only showing top 1 row
swimmersJSON.take(1)
[Row(age=19, eyeColor='brown', id='123', name='Katie')]
swimmersJSON.printSchema()
root
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
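Besides an RDD of JSON strings, spark.read.json also accepts a file path. A minimal sketch, assuming a hypothetical line-delimited file swimmers.json (pass multiLine=True for pretty-printed JSON):

swimmersFromFile = spark.read.json("swimmers.json")  # hypothetical path; one JSON object per line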

For CSV

from pyspark.sql.types import *

stringCSVRDD = sc.parallelize([(123, 'Katie', 19, 'brown'),
                               (234, 'Michale', 22, 'green'),
                               (345, 'Simo', 23, 'red')])
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyecolor", StringType(), True)
])

heros = spark.createDataFrame(stringCSVRDD, schema)
heros.createOrReplaceTempView('heros')
heros.show()
+---+-------+---+--------+
| id|   name|age|eyecolor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michale| 22|   green|
|345|   Simo| 23|     red|
+---+-------+---+--------+
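The "CSV" data above is actually built from an RDD of tuples; reading a real CSV file with the same schema would look roughly like this (a sketch, assuming a headerless file heros.csv):

heros_from_file = spark.read.csv("heros.csv", schema=schema, header=False)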

Create a sample DataFrame

from pyspark.sql.types import *

RDD_CSV = sc.parallelize([(123, 'Katie', 19, 'brown'),
                          (234, 'Michale', 22, 'green'),
                          (345, 'Simo', 23, 'red'),
                          (351, 'BOb', None, 'green'),
                          (459, 'Liux', 20, 'blue'),
                          (555, 'UKD', 27, None),
                          (666, 'LLTT', 21, 'black'),
                          (345, 'Simo', 23, 'red'),
                          (236, 'Zhangff', 45, 'blue'),
                          (125, 'Wang', 99, 'white'),
                          (199, 'LiTT', 12, 'blue'),
                          (454, 'LiuBin', 32, 'pink'),
                          (378, 'Yand', 22, 'black')])
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyecolor", StringType(), True)
])

df = spark.createDataFrame(RDD_CSV, schema)
df.createOrReplaceTempView('df')
df.show()
+---+-------+----+--------+
| id|   name| age|eyecolor|
+---+-------+----+--------+
|123|  Katie|  19|   brown|
|234|Michale|  22|   green|
|345|   Simo|  23|     red|
|351|    BOb|null|   green|
|459|   Liux|  20|    blue|
|555|    UKD|  27|    null|
|666|   LLTT|  21|   black|
|345|   Simo|  23|     red|
|236|Zhangff|  45|    blue|
|125|   Wang|  99|   white|
|199|   LiTT|  12|    blue|
|454| LiuBin|  32|    pink|
|378|   Yand|  22|   black|
+---+-------+----+--------+

1 Query

show()

df.show()
+---+-------+----+--------+
| id|   name| age|eyecolor|
+---+-------+----+--------+
|123|  Katie|  19|   brown|
|234|Michale|  22|   green|
|345|   Simo|  23|     red|
|351|    BOb|null|   green|
|459|   Liux|  20|    blue|
|555|    UKD|  27|    null|
|666|   LLTT|  21|   black|
|345|   Simo|  23|     red|
|236|Zhangff|  45|    blue|
|125|   Wang|  99|   white|
|199|   LiTT|  12|    blue|
|454| LiuBin|  32|    pink|
|378|   Yand|  22|   black|
+---+-------+----+--------+
df.show(3)
+---+-------+---+--------+
| id|   name|age|eyecolor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michale| 22|   green|
|345|   Simo| 23|     red|
+---+-------+---+--------+
only showing top 3 rows

printSchema()

df.printSchema()
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- eyecolor: string (nullable = true)

head(), take(), first()

df.head(4)
[Row(id=123, name='Katie', age=19, eyecolor='brown'),
 Row(id=234, name='Michale', age=22, eyecolor='green'),
 Row(id=345, name='Simo', age=23, eyecolor='red'),
 Row(id=351, name='BOb', age=None, eyecolor='green')]
df.take(5)
[Row(id=123, name='Katie', age=19, eyecolor='brown'),
 Row(id=234, name='Michale', age=22, eyecolor='green'),
 Row(id=345, name='Simo', age=23, eyecolor='red'),
 Row(id=351, name='BOb', age=None, eyecolor='green'),
 Row(id=459, name='Liux', age=20, eyecolor='blue')]
df.first()
Row(id=123, name='Katie', age=19, eyecolor='brown')

count()

df.count()
13

alias()

df.select(df.age.alias('age_value'), 'name').collect()
[Row(age_value=19, name='Katie'),
 Row(age_value=22, name='Michale'),
 Row(age_value=23, name='Simo'),
 Row(age_value=None, name='BOb'),
 Row(age_value=20, name='Liux'),
 Row(age_value=27, name='UKD'),
 Row(age_value=21, name='LLTT'),
 Row(age_value=23, name='Simo'),
 Row(age_value=45, name='Zhangff'),
 Row(age_value=99, name='Wang'),
 Row(age_value=12, name='LiTT'),
 Row(age_value=32, name='LiuBin'),
 Row(age_value=22, name='Yand')]
df.select(df.age.alias('age_value'), df.name).show()
+---------+-------+
|age_value|   name|
+---------+-------+
|       19|  Katie|
|       22|Michale|
|       23|   Simo|
|     null|    BOb|
|       20|   Liux|
|       27|    UKD|
|       21|   LLTT|
|       23|   Simo|
|       45|Zhangff|
|       99|   Wang|
|       12|   LiTT|
|       32| LiuBin|
|       22|   Yand|
+---------+-------+

isnull()

from pyspark.sql.functions import isnull

df.filter(isnull(df.age)).show()
+---+----+----+--------+
| id|name| age|eyecolor|
+---+----+----+--------+
|351| BOb|null|   green|
+---+----+----+--------+
df.filter(isnull('age')).count()
1
# Count the null values in each row, keyed by id

df.rdd.map(lambda row: (row['id'], sum(c is None for c in row))).collect()
[(123, 0),
 (234, 0),
 (345, 0),
 (351, 1),
 (459, 0),
 (555, 1),
 (666, 0),
 (345, 0),
 (236, 0),
 (125, 0),
 (199, 0),
 (454, 0),
 (378, 0)]
# Percentage of missing values in each column

import pyspark.sql.functions as fn

df.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing') for c in df.columns]).show()
+----------+------------+-------------------+-------------------+
|id_missing|name_missing|        age_missing|   eyecolor_missing|
+----------+------------+-------------------+-------------------+
|       0.0|         0.0|0.07692307692307687|0.07692307692307687|
+----------+------------+-------------------+-------------------+
# collect() returns a Python list whose elements are Row objects

rows = df.collect()  # avoid naming the result "list", which would shadow the built-in
rows
[Row(id=123, name='Katie', age=19, eyecolor='brown'),
 Row(id=234, name='Michale', age=22, eyecolor='green'),
 Row(id=345, name='Simo', age=23, eyecolor='red'),
 Row(id=351, name='BOb', age=None, eyecolor='green'),
 Row(id=459, name='Liux', age=20, eyecolor='blue'),
 Row(id=555, name='UKD', age=27, eyecolor=None),
 Row(id=666, name='LLTT', age=21, eyecolor='black'),
 Row(id=345, name='Simo', age=23, eyecolor='red'),
 Row(id=236, name='Zhangff', age=45, eyecolor='blue'),
 Row(id=125, name='Wang', age=99, eyecolor='white'),
 Row(id=199, name='LiTT', age=12, eyecolor='blue'),
 Row(id=454, name='LiuBin', age=32, eyecolor='pink'),
 Row(id=378, name='Yand', age=22, eyecolor='black')]

describe()

df.describe().show()
+-------+------------------+-------+------------------+--------+
|summary|                id|   name|               age|eyecolor|
+-------+------------------+-------+------------------+--------+
|  count|                13|     13|                12|      12|
|   mean|343.84615384615387|   null|30.416666666666668|    null|
| stddev| 162.3596040449749|   null|23.059048049945535|    null|
|    min|               123|    BOb|                12|   black|
|    max|               666|Zhangff|                99|   white|
+-------+------------------+-------+------------------+--------+
df.printSchema()
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- eyecolor: string (nullable = true)

distinct()

df.distinct().show()
+---+-------+----+--------+
| id|   name| age|eyecolor|
+---+-------+----+--------+
|454| LiuBin|  32|    pink|
|123|  Katie|  19|   brown|
|199|   LiTT|  12|    blue|
|345|   Simo|  23|     red|
|378|   Yand|  22|   black|
|234|Michale|  22|   green|
|666|   LLTT|  21|   black|
|459|   Liux|  20|    blue|
|125|   Wang|  99|   white|
|236|Zhangff|  45|    blue|
|351|    BOb|null|   green|
|555|    UKD|  27|    null|
+---+-------+----+--------+

columns

df.columns
['id', 'name', 'age', 'eyecolor']
df.name
Column<b'name'>
df['name']
Column<b'name'>
df.select('name')
DataFrame[name: string]
df.select(df.name, df.age + 1).show()
+-------+---------+
|   name|(age + 1)|
+-------+---------+
|  Katie|       20|
|Michale|       23|
|   Simo|       24|
|    BOb|     null|
|   Liux|       21|
|    UKD|       28|
|   LLTT|       22|
|   Simo|       24|
|Zhangff|       46|
|   Wang|      100|
|   LiTT|       13|
| LiuBin|       33|
|   Yand|       23|
+-------+---------+

where()

df.where('age = 19').show()
+---+-----+---+--------+
| id| name|age|eyecolor|
+---+-----+---+--------+
|123|Katie| 19|   brown|
+---+-----+---+--------+

filter()

df.select(df.name, df.eyecolor).filter("eyecolor like 'b%'").show()
+-------+--------+
|   name|eyecolor|
+-------+--------+
|  Katie|   brown|
|   Liux|    blue|
|   LLTT|   black|
|Zhangff|    blue|
|   LiTT|    blue|
|   Yand|   black|
+-------+--------+
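The same predicate can also be written with Column methods instead of a SQL string, for example:

df.select('name', 'eyecolor').filter(df.eyecolor.startswith('b')).show()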

orderBy()

df.orderBy(df.age.desc()).show()
+---+-------+----+--------+
| id|   name| age|eyecolor|
+---+-------+----+--------+
|125|   Wang|  99|   white|
|236|Zhangff|  45|    blue|
|454| LiuBin|  32|    pink|
|555|    UKD|  27|    null|
|345|   Simo|  23|     red|
|345|   Simo|  23|     red|
|234|Michale|  22|   green|
|378|   Yand|  22|   black|
|666|   LLTT|  21|   black|
|459|   Liux|  20|    blue|
|123|  Katie|  19|   brown|
|199|   LiTT|  12|    blue|
|351|    BOb|null|   green|
+---+-------+----+--------+

sample()

df.sample(withReplacement=False, fraction=0.2, seed=10).show()
+---+------+----+--------+
| id|  name| age|eyecolor|
+---+------+----+--------+
|351|   BOb|null|   green|
|459|  Liux|  20|    blue|
|666|  LLTT|  21|   black|
|345|  Simo|  23|     red|
|454|LiuBin|  32|    pink|
+---+------+----+--------+
# With the same seed, two samples return identical rows

df.sample(withReplacement=False, fraction=0.2, seed=10).show()
+---+------+----+--------+
| id|  name| age|eyecolor|
+---+------+----+--------+
|351|   BOb|null|   green|
|459|  Liux|  20|    blue|
|666|  LLTT|  21|   black|
|345|  Simo|  23|     red|
|454|LiuBin|  32|    pink|
+---+------+----+--------+
df.sample(withReplacement=False, fraction=0.2, seed=3).show()
+---+------+---+--------+
| id|  name|age|eyecolor|
+---+------+---+--------+
|123| Katie| 19|   brown|
|345|  Simo| 23|     red|
|666|  LLTT| 21|   black|
|454|LiuBin| 32|    pink|
|378|  Yand| 22|   black|
+---+------+---+--------+

when()

import pyspark.sql.functions as fn

df.select(df.name, fn.when(df.age > 25, 100).when(df.age < 20, 0).otherwise(50)).show()
+-------+----------------------------------------------------------------+
|   name|CASE WHEN (age > 25) THEN 100 WHEN (age < 20) THEN 0 ELSE 50 END|
+-------+----------------------------------------------------------------+
|  Katie|                                                               0|
|Michale|                                                              50|
|   Simo|                                                              50|
|    BOb|                                                              50|
|   Liux|                                                              50|
|    UKD|                                                             100|
|   LLTT|                                                              50|
|   Simo|                                                              50|
|Zhangff|                                                             100|
|   Wang|                                                             100|
|   LiTT|                                                               0|
| LiuBin|                                                             100|
|   Yand|                                                              50|
+-------+----------------------------------------------------------------+
df.select(df.name, fn.when(df.age > 25, 100).when(df.age < 20, 0).otherwise(50).alias('age')).show()
+-------+---+
|   name|age|
+-------+---+
|  Katie|  0|
|Michale| 50|
|   Simo| 50|
|    BOb| 50|
|   Liux| 50|
|    UKD|100|
|   LLTT| 50|
|   Simo| 50|
|Zhangff|100|
|   Wang|100|
|   LiTT|  0|
| LiuBin|100|
|   Yand| 50|
+-------+---+

between()

# Returns true/false (and null when age is null)

df.select(df.name, df.age.between(20, 25)).show()
+-------+-----------------------------+
|   name|((age >= 20) AND (age <= 25))|
+-------+-----------------------------+
|  Katie|                        false|
|Michale|                         true|
|   Simo|                         true|
|    BOb|                         null|
|   Liux|                         true|
|    UKD|                        false|
|   LLTT|                         true|
|   Simo|                         true|
|Zhangff|                        false|
|   Wang|                        false|
|   LiTT|                        false|
| LiuBin|                        false|
|   Yand|                         true|
+-------+-----------------------------+
df.select(df.name, df.age.between(20, 25), df.eyecolor).filter("eyecolor like 'b%'").show()
+-------+-----------------------------+--------+
|   name|((age >= 20) AND (age <= 25))|eyecolor|
+-------+-----------------------------+--------+
|  Katie|                        false|   brown|
|   Liux|                         true|    blue|
|   LLTT|                         true|   black|
|Zhangff|                        false|    blue|
|   LiTT|                        false|    blue|
|   Yand|                         true|   black|
+-------+-----------------------------+--------+

2 Modification

# Create a DataFrame

from pyspark.sql import Row

row = Row("id", 'name')
ID = [1, 2, 3]
NAME = ['Bob', 'Lixu', 'Yangfei']
DF = sc.parallelize([row(ID[i], NAME[i]) for i in range(len(ID))]).toDF()
DF.show()
+---+-------+
| id|   name|
+---+-------+
|  1|    Bob|
|  2|   Lixu|
|  3|Yangfei|
+---+-------+

withColumn()

# Returns a new DataFrame by adding a column, or replacing the existing column that has the same name

DF.withColumn("age", DF.id + 22).show()
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|    Bob| 23|
|  2|   Lixu| 24|
|  3|Yangfei| 25|
+---+-------+---+
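As noted above, withColumn() also replaces a column when the name already exists; a small sketch that overwrites name in place (fn is pyspark.sql.functions, imported earlier):

DF.withColumn('name', fn.upper(DF.name)).show()  # replaces the existing name column with its uppercase form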
DF_2 = DF.withColumn('age_2', fn.lit(0))
# lit([0, 0, 0]) would not work: a Python list cannot be attached directly as a column; convert it into a new DataFrame and join it on instead
DF_2.show()
+---+-------+-----+
| id|   name|age_2|
+---+-------+-----+
|  1|    Bob|    0|
|  2|   Lixu|    0|
|  3|Yangfei|    0|
+---+-------+-----+
from pyspark.sql import Row

consum = [423, 534, 212]
row = Row('pid', 'consum')
new_DF = sc.parallelize([row(i + 1, consum[i]) for i in range(len(consum))]).toDF()
DF_3 = DF.join(new_DF, DF.id == new_DF.pid)
DF_3.show()
+---+-------+---+------+
| id|   name|pid|consum|
+---+-------+---+------+
|  1|    Bob|  1|   423|
|  3|Yangfei|  3|   212|
|  2|   Lixu|  2|   534|
+---+-------+---+------+
DF_3.withColumn('per', DF_3.consum / DF_3.id).show()
+---+-------+---+------+-----------------+
| id|   name|pid|consum|              per|
+---+-------+---+------+-----------------+
|  1|    Bob|  1|   423|            423.0|
|  3|Yangfei|  3|   212|70.66666666666667|
|  2|   Lixu|  2|   534|            267.0|
+---+-------+---+------+-----------------+
DF_3.withColumn('pid_222', DF_3['pid'].cast('Float')).show()
+---+-------+---+------+-------+
| id|   name|pid|consum|pid_222|
+---+-------+---+------+-------+
|  1|    Bob|  1|   423|    1.0|
|  3|Yangfei|  3|   212|    3.0|
|  2|   Lixu|  2|   534|    2.0|
+---+-------+---+------+-------+

withColumnRenamed()

DF_3.withColumnRenamed('pid', 'pppppppppppid').show()
+---+-------+-------------+------+
| id|   name|pppppppppppid|consum|
+---+-------+-------------+------+
|  1|    Bob|            1|   423|
|  3|Yangfei|            3|   212|
|  2|   Lixu|            2|   534|
+---+-------+-------------+------+

filter() & where()

# where() is an alias of filter(); the two calls below are equivalent
DF_3.filter(DF_3['id'] > 2).show()
+---+-------+---+------+
| id|   name|pid|consum|
+---+-------+---+------+
|  3|Yangfei|  3|   212|
+---+-------+---+------+
DF_3.where(DF_3['id'] > 2).show()
+---+-------+---+------+
| id|   name|pid|consum|
+---+-------+---+------+
|  3|Yangfei|  3|   212|
+---+-------+---+------+

isnull() & isnan()

AAA = sc.parallelize([(123, 'Katie', 19, 'brown'),
                          (234, 'Michale', 22, None),
                          (345, 'Simo', 23, 'red'),
                          (351, 'BOb', None, 'green'),
                          (459, 'Liux', 20, 'blue'),
                          (555, 'UKD', 27, None),
                          (666, 'LLTT',34, 'black'),
                          (345, 'Simo', 23, None),
                          (236, 'Zhangff', 45, 'blue'),
                          (125, 'Wang', 99, 'white'),
                          (199, 'LiTT', 12, 'blue'),
                          (454, 'LiuBin', 32, 'pink'),
                          (378, 'Yand', 22, None)])
BBB = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyecolor", StringType(), True)
])

df1 = spark.createDataFrame(AAA, BBB)
df1.createOrReplaceTempView('df1')
df1.show()
+---+-------+----+--------+
| id|   name| age|eyecolor|
+---+-------+----+--------+
|123|  Katie|  19|   brown|
|234|Michale|  22|    null|
|345|   Simo|  23|     red|
|351|    BOb|null|   green|
|459|   Liux|  20|    blue|
|555|    UKD|  27|    null|
|666|   LLTT|  34|   black|
|345|   Simo|  23|    null|
|236|Zhangff|  45|    blue|
|125|   Wang|  99|   white|
|199|   LiTT|  12|    blue|
|454| LiuBin|  32|    pink|
|378|   Yand|  22|    null|
+---+-------+----+--------+
df1.filter(fn.isnan('eyecolor')).show() # isnan() selects NaN (not-a-number) values; a string column has none
+---+----+---+--------+
| id|name|age|eyecolor|
+---+----+---+--------+
+---+----+---+--------+
df1.filter(fn.isnull('eyecolor')).show() # isnull() selects null values
+---+-------+---+--------+
| id|   name|age|eyecolor|
+---+-------+---+--------+
|234|Michale| 22|    null|
|555|    UKD| 27|    null|
|345|   Simo| 23|    null|
|378|   Yand| 22|    null|
+---+-------+---+--------+
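isnan() matches only the floating-point NaN value, which cannot occur in a string column such as eyecolor; that is why the first filter above comes back empty. A minimal sketch on a hypothetical float column where the two differ:

df_nan = spark.createDataFrame([(1, float('nan')), (2, None), (3, 1.5)], ['k', 'v'])
df_nan.filter(fn.isnan('v')).show()   # only the NaN row (k=1)
df_nan.filter(fn.isnull('v')).show()  # only the null row (k=2)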

union()

# Concatenate two DataFrames (rows of DF2 appended after DF1)

row1 = Row("id", 'name')
ID1 = [1, 2, 3]
NAME1 = ['Bob', 'Lixu', 'Yangfei']
DF1 = sc.parallelize([row1(ID1[i], NAME1[i]) for i in range(len(ID1))]).toDF()


row2 = Row("id", 'name')
ID2 = [1, 2, 3]
NAME2 = ['WWWWWWW', 'TTTTTTTT', 'KKKKKKK']
DF2 = sc.parallelize([row2(ID2[i], NAME2[i]) for i in range(len(ID2))]).toDF()

DF1.union(DF2).show()
+---+--------+
| id|    name|
+---+--------+
|  1|     Bob|
|  2|    Lixu|
|  3| Yangfei|
|  1| WWWWWWW|
|  2|TTTTTTTT|
|  3| KKKKKKK|
+---+--------+
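Note that union() matches columns by position, not by name. On Spark 2.3+ there is also unionByName(), which aligns columns by name, a sketch:

DF1.unionByName(DF2).show()  # same result here, since DF1 and DF2 share the column order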

join()

DF1.join(DF2, DF1.id == DF2.id).show()
+---+-------+---+--------+
| id|   name| id|    name|
+---+-------+---+--------+
|  1|    Bob|  1| WWWWWWW|
|  3|Yangfei|  3| KKKKKKK|
|  2|   Lixu|  2|TTTTTTTT|
+---+-------+---+--------+
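The join above keeps both id columns. Passing the key as a column name deduplicates it; a sketch (renaming name first to avoid a second duplicated column):

DF1.join(DF2.withColumnRenamed('name', 'name2'), on='id', how='inner').show()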

subtract()

D1 = spark.createDataFrame(((1, "asf"),(2, "2143"),(3, "rfds"))).toDF("label", "sentence")
D1.show()

D2 = spark.createDataFrame(((1, "asf"),(2, "2143"),(4, "f8934y") )).toDF("label", "sentence")
D2.show()
+-----+--------+
|label|sentence|
+-----+--------+
|    1|     asf|
|    2|    2143|
|    3|    rfds|
+-----+--------+

+-----+--------+
|label|sentence|
+-----+--------+
|    1|     asf|
|    2|    2143|
|    4|  f8934y|
+-----+--------+
# Remove the rows of D2 that also appear in D1

D2.select('sentence').subtract(D1.select('sentence')).show()
+--------+
|sentence|
+--------+
|  f8934y|
+--------+

intersect()

# Intersection: the rows D2 shares with D1

D2.select('sentence').intersect(D1.select('sentence')).show()
+--------+
|sentence|
+--------+
|     asf|
|    2143|
+--------+

union() & distinct()

D2.select('sentence').union(D1.select('sentence')).distinct().show()
+--------+
|sentence|
+--------+
|    rfds|
|     asf|
|    2143|
|  f8934y|
+--------+

crosstab()

df.crosstab('age', 'eyecolor').show()
+------------+-----+----+-----+-----+----+----+---+-----+
|age_eyecolor|black|blue|brown|green|null|pink|red|white|
+------------+-----+----+-----+-----+----+----+---+-----+
|        null|    0|   0|    0|    1|   0|   0|  0|    0|
|          20|    0|   1|    0|    0|   0|   0|  0|    0|
|          21|    1|   0|    0|    0|   0|   0|  0|    0|
|          32|    0|   0|    0|    0|   0|   1|  0|    0|
|          45|    0|   1|    0|    0|   0|   0|  0|    0|
|          22|    1|   0|    0|    1|   0|   0|  0|    0|
|          27|    0|   0|    0|    0|   1|   0|  0|    0|
|          12|    0|   1|    0|    0|   0|   0|  0|    0|
|          99|    0|   0|    0|    0|   0|   0|  0|    1|
|          23|    0|   0|    0|    0|   0|   0|  2|    0|
|          19|    0|   0|    1|    0|   0|   0|  0|    0|
+------------+-----+----+-----+-----+----+----+---+-----+

groupby()

df.groupby('Age').agg({'id' : 'max'}).show()
+----+-------+
| Age|max(id)|
+----+-------+
|  19|    123|
|  22|    378|
|  32|    454|
|null|    351|
|  27|    555|
|  12|    199|
|  21|    666|
|  23|    345|
|  20|    459|
|  99|    125|
|  45|    236|
+----+-------+
df.groupby('age').count().show()
+----+-----+
| age|count|
+----+-----+
|  19|    1|
|  22|    2|
|  32|    1|
|null|    1|
|  27|    1|
|  12|    1|
|  21|    1|
|  23|    2|
|  20|    1|
|  99|    1|
|  45|    1|
+----+-----+
df.groupBy('age').agg(fn.count('age'), fn.avg('age'), fn.min('age'), fn.max('age')).show()
+----+----------+--------+--------+--------+
| age|count(age)|avg(age)|min(age)|max(age)|
+----+----------+--------+--------+--------+
|  19|         1|    19.0|      19|      19|
|  22|         2|    22.0|      22|      22|
|  32|         1|    32.0|      32|      32|
|null|         0|    null|    null|    null|
|  27|         1|    27.0|      27|      27|
|  12|         1|    12.0|      12|      12|
|  21|         1|    21.0|      21|      21|
|  23|         2|    23.0|      23|      23|
|  20|         1|    20.0|      20|      20|
|  99|         1|    99.0|      99|      99|
|  45|         1|    45.0|      45|      45|
+----+----------+--------+--------+--------+
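Aggregate columns can be renamed with alias(); a small sketch grouping by eyecolor instead:

df.groupBy('eyecolor').agg(fn.count('*').alias('n'), fn.avg('age').alias('avg_age')).show()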

foreach()

def pp(el):
    # Note: foreach runs on the executors, so writing to a local
    # file like this only works reliably in local mode
    with open("./foreach.txt", 'a+') as f1:
        print(el, file=f1)

open('./foreach.txt', 'w').close()

df.foreach(pp)
with open("./foreach.txt", 'r') as ff:
    print(ff.read())
Row(id=351, name='BOb', age=None, eyecolor='green')
Row(id=234, name='Michale', age=22, eyecolor='green')
Row(id=345, name='Simo', age=23, eyecolor='red')
Row(id=666, name='LLTT', age=21, eyecolor='black')
Row(id=345, name='Simo', age=23, eyecolor='red')
Row(id=236, name='Zhangff', age=45, eyecolor='blue')
Row(id=125, name='Wang', age=99, eyecolor='white')
Row(id=199, name='LiTT', age=12, eyecolor='blue')
Row(id=454, name='LiuBin', age=32, eyecolor='pink')
Row(id=378, name='Yand', age=22, eyecolor='black')
def pp(el):
    # With foreachPartition, el is an iterator over one partition's rows
    with open("./foreach.txt", 'a+') as f1:
        print([el_1 * 2 for el_1 in el], file=f1)

open('./foreach.txt', 'w').close()

df.foreachPartition(pp)
with open("./foreach.txt", 'r') as ff:
    print(ff.read())
[(666, 'LLTT', 21, 'black', 666, 'LLTT', 21, 'black'), (345, 'Simo', 23, 'red', 345, 'Simo', 23, 'red'), (236, 'Zhangff', 45, 'blue', 236, 'Zhangff', 45, 'blue')]
[(125, 'Wang', 99, 'white', 125, 'Wang', 99, 'white'), (199, 'LiTT', 12, 'blue', 199, 'LiTT', 12, 'blue'), (454, 'LiuBin', 32, 'pink', 454, 'LiuBin', 32, 'pink'), (378, 'Yand', 22, 'black', 378, 'Yand', 22, 'black')]
[(123, 'Katie', 19, 'brown', 123, 'Katie', 19, 'brown'), (234, 'Michale', 22, 'green', 234, 'Michale', 22, 'green'), (345, 'Simo', 23, 'red', 345, 'Simo', 23, 'red')]
[(351, 'BOb', None, 'green', 351, 'BOb', None, 'green'), (459, 'Liux', 20, 'blue', 459, 'Liux', 20, 'blue'), (555, 'UKD', 27, None, 555, 'UKD', 27, None)]

map()

df.select('id').rdd.map(lambda x:(x, 1)).collect()
[(Row(id=123), 1),
 (Row(id=234), 1),
 (Row(id=345), 1),
 (Row(id=351), 1),
 (Row(id=459), 1),
 (Row(id=555), 1),
 (Row(id=666), 1),
 (Row(id=345), 1),
 (Row(id=236), 1),
 (Row(id=125), 1),
 (Row(id=199), 1),
 (Row(id=454), 1),
 (Row(id=378), 1)]

udf()

DD = spark.createDataFrame(((1, "asf", "5555"),(2, "2143", None),(4, "f8934y", "77"))).toDF("label", "sentence", "day")
DD.show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    2|    2143|null|
|    4|  f8934y|  77|
+-----+--------+----+
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import time
import datetime

# Define a plain Python function to wrap as a UDF
def today(day):
    if day is None:
        return datetime.datetime.fromtimestamp(int(time.time())).strftime('%Y-%m-%d')
    else:
        return day

# Declare the return type as string
udfday = udf(today, StringType())

# Apply it
DD.withColumn('day1', udfday(DD.day)).show()
+-----+--------+----+----------+
|label|sentence| day|      day1|
+-----+--------+----+----------+
|    1|     asf|5555|      5555|
|    2|    2143|null|2019-03-31|
|    4|  f8934y|  77|        77|
+-----+--------+----+----------+
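Built-in functions usually outperform Python UDFs because they avoid serializing rows through Python. A sketch of the same logic using coalesce() and current_date() (casting the date to string yields the same 'yyyy-MM-dd' format):

DD.withColumn('day1', fn.coalesce(DD.day, fn.current_date().cast('string'))).show()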

drop()

DD.drop(DD.day).show()
+-----+--------+
|label|sentence|
+-----+--------+
|    1|     asf|
|    2|    2143|
|    4|  f8934y|
+-----+--------+

dropna()

DD.na.drop().show() # Drop rows containing any null
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    4|  f8934y|  77|
+-----+--------+----+
DD.dropna(subset=['sentence', 'day']).show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    4|  f8934y|  77|
+-----+--------+----+

fillna()

DD.fillna('50').show()      # day has nulls; it is a string column, so fill it with a string
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    2|    2143|  50|
|    4|  f8934y|  77|
+-----+--------+----+
DD.fillna(50).show()   # an integer fill value does not apply to the string-typed day column, so nothing changes
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    2|    2143|null|
|    4|  f8934y|  77|
+-----+--------+----+
DD.na.fill('50').show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    2|    2143|  50|
|    4|  f8934y|  77|
+-----+--------+----+
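fillna() also accepts a dict mapping column names to fill values, so each column can be given a type-appropriate value:

DD.fillna({'day': '50'}).show()  # fills only the day column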

distinct()

DDD = spark.createDataFrame(((1, "asf", "5555"),(1, "asf", "5555"),(1, "asf", "5555"),(1, "asf", "5555"),(2, "2143", None),(4, "f8934y", "77"))).toDF("label", "sentence", "day")
DDD.show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    1|     asf|5555|
|    1|     asf|5555|
|    1|     asf|5555|
|    2|    2143|null|
|    4|  f8934y|  77|
+-----+--------+----+
DDD.distinct().show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    4|  f8934y|  77|
|    1|     asf|5555|
|    2|    2143|null|
+-----+--------+----+

dropDuplicates()

DDD.select('label', 'sentence').dropDuplicates().show()
+-----+--------+
|label|sentence|
+-----+--------+
|    1|     asf|
|    2|    2143|
|    4|  f8934y|
+-----+--------+
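dropDuplicates() also accepts a subset of columns to compare; a sketch keeping one row per label:

DDD.dropDuplicates(subset=['label']).show()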

3 Format Conversion

toPandas()

pandas_df = DDD.toPandas()
pandas_df
   label sentence   day
0      1      asf  5555
1      1      asf  5555
2      1      asf  5555
3      1      asf  5555
4      2     2143  None
5      4   f8934y    77
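toPandas() collects the entire DataFrame onto the driver, so use it only when the data fits in driver memory. On Spark 2.3/2.4 the conversion can be accelerated with Arrow (a sketch; the config is named spark.sql.execution.arrow.pyspark.enabled on Spark 3.x):

spark.conf.set("spark.sql.execution.arrow.enabled", "true")  # assumes Spark 2.3+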

createDataFrame()

spark_df = spark.createDataFrame(pandas_df)
spark_df.show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    1|     asf|5555|
|    1|     asf|5555|
|    1|     asf|5555|
|    2|    2143|null|
|    4|  f8934y|  77|
+-----+--------+----+

rdd

rdd_df = DDD.rdd
rdd_df.collect()
[Row(label=1, sentence='asf', day='5555'),
 Row(label=1, sentence='asf', day='5555'),
 Row(label=1, sentence='asf', day='5555'),
 Row(label=1, sentence='asf', day='5555'),
 Row(label=2, sentence='2143', day=None),
 Row(label=4, sentence='f8934y', day='77')]
df = rdd_df.toDF()
df.show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
|    1|     asf|5555|
|    1|     asf|5555|
|    1|     asf|5555|
|    1|     asf|5555|
|    2|    2143|null|
|    4|  f8934y|  77|
+-----+--------+----+

