import findspark
findspark.init()  # locate the local Spark installation
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.appName('EXAMPLE').getOrCreate()
For JSON
stringJSONRDD = sc.parallelize(("""
{ "id": "123",
"name": "Katie",
"age": 19,
"eyeColor": "brown"
}""",
"""{
"id": "234",
"name": "Michael",
"age": 22,
"eyeColor": "green"
}""",
"""{
"id": "345",
"name": "Simone",
"age": 23,
"eyeColor": "blue"
}""")
)
# Create a DataFrame from the JSON strings
swimmersJSON = spark.read.json(stringJSONRDD)
# Register a temporary view so the data can be queried with SQL
swimmersJSON.createOrReplaceTempView("swimmersJSON")
# Display the DataFrame
swimmersJSON.show()
+---+--------+---+-------+
|age|eyeColor| id| name|
+---+--------+---+-------+
| 19| brown|123| Katie|
| 22| green|234|Michael|
| 23| blue|345| Simone|
+---+--------+---+-------+
spark.sql("select * from swimmersJSON").collect()
[Row(age=19, eyeColor='brown', id='123', name='Katie'),
Row(age=22, eyeColor='green', id='234', name='Michael'),
Row(age=23, eyeColor='blue', id='345', name='Simone')]
swimmersJSON.show(1)
+---+--------+---+-----+
|age|eyeColor| id| name|
+---+--------+---+-----+
| 19| brown|123|Katie|
+---+--------+---+-----+
only showing top 1 row
swimmersJSON.take(1)
[Row(age=19, eyeColor='brown', id='123', name='Katie')]
swimmersJSON.printSchema()
root
|-- age: long (nullable = true)
|-- eyeColor: string (nullable = true)
|-- id: string (nullable = true)
|-- name: string (nullable = true)
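The same reader also works directly on JSON files (by default one JSON object per line); a minimal sketch, assuming a hypothetical path:
# Hypothetical file path; each line of the file holds one JSON object
swimmersFromFile = spark.read.json("/tmp/swimmers.json")
swimmersFromFile.printSchema()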
For CSV-style data (tuples with an explicit schema)
from pyspark.sql.types import *
stringCSVRDD = sc.parallelize([(123, 'Katie', 19, 'brown'),
                               (234, 'Michale', 22, 'green'),
                               (345, 'Simo', 23, 'red')])
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyecolor", StringType(), True)
])
heros = spark.createDataFrame(stringCSVRDD, schema)
heros.createOrReplaceTempView('heros')
heros.show()
+---+-------+---+--------+
| id| name|age|eyecolor|
+---+-------+---+--------+
|123| Katie| 19| brown|
|234|Michale| 22| green|
|345| Simo| 23| red|
+---+-------+---+--------+
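If an explicit schema is not required, createDataFrame() can also infer the column types and take only a list of column names; a minimal sketch reusing stringCSVRDD:
# Types are inferred from the data (id/age become long, name/eyecolor string)
inferred = spark.createDataFrame(stringCSVRDD, ['id', 'name', 'age', 'eyecolor'])
inferred.printSchema()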
Create a DataFrame
from pyspark.sql.types import *
RDD_CSV = sc.parallelize([(123, 'Katie', 19, 'brown'),
                          (234, 'Michale', 22, 'green'),
                          (345, 'Simo', 23, 'red'),
                          (351, 'BOb', None, 'green'),
                          (459, 'Liux', 20, 'blue'),
                          (555, 'UKD', 27, None),
                          (666, 'LLTT', 21, 'black'),
                          (345, 'Simo', 23, 'red'),
                          (236, 'Zhangff', 45, 'blue'),
                          (125, 'Wang', 99, 'white'),
                          (199, 'LiTT', 12, 'blue'),
                          (454, 'LiuBin', 32, 'pink'),
                          (378, 'Yand', 22, 'black')])
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyecolor", StringType(), True)
])
df = spark.createDataFrame(RDD_CSV, schema)
df.createOrReplaceTempView('df')
df.show()
+---+-------+----+--------+
| id| name| age|eyecolor|
+---+-------+----+--------+
|123| Katie| 19| brown|
|234|Michale| 22| green|
|345| Simo| 23| red|
|351| BOb|null| green|
|459| Liux| 20| blue|
|555| UKD| 27| null|
|666| LLTT| 21| black|
|345| Simo| 23| red|
|236|Zhangff| 45| blue|
|125| Wang| 99| white|
|199| LiTT| 12| blue|
|454| LiuBin| 32| pink|
|378| Yand| 22| black|
+---+-------+----+--------+
1 Querying
show()
df.show()
+---+-------+----+--------+
| id| name| age|eyecolor|
+---+-------+----+--------+
|123| Katie| 19| brown|
|234|Michale| 22| green|
|345| Simo| 23| red|
|351| BOb|null| green|
|459| Liux| 20| blue|
|555| UKD| 27| null|
|666| LLTT| 21| black|
|345| Simo| 23| red|
|236|Zhangff| 45| blue|
|125| Wang| 99| white|
|199| LiTT| 12| blue|
|454| LiuBin| 32| pink|
|378| Yand| 22| black|
+---+-------+----+--------+
df.show(3)
+---+-------+---+--------+
| id| name|age|eyecolor|
+---+-------+---+--------+
|123| Katie| 19| brown|
|234|Michale| 22| green|
|345| Simo| 23| red|
+---+-------+---+--------+
only showing top 3 rows
printSchema()
df.printSchema()
root
|-- id: long (nullable = true)
|-- name: string (nullable = true)
|-- age: long (nullable = true)
|-- eyecolor: string (nullable = true)
head() / take() / first()
df.head(4)
[Row(id=123, name='Katie', age=19, eyecolor='brown'),
Row(id=234, name='Michale', age=22, eyecolor='green'),
Row(id=345, name='Simo', age=23, eyecolor='red'),
Row(id=351, name='BOb', age=None, eyecolor='green')]
df.take(5)
[Row(id=123, name='Katie', age=19, eyecolor='brown'),
Row(id=234, name='Michale', age=22, eyecolor='green'),
Row(id=345, name='Simo', age=23, eyecolor='red'),
Row(id=351, name='BOb', age=None, eyecolor='green'),
Row(id=459, name='Liux', age=20, eyecolor='blue')]
df.first()
Row(id=123, name='Katie', age=19, eyecolor='brown')
count()
df.count()
13
alias()
df.select(df.age.alias('age_value'), 'name').collect()
[Row(age_value=19, name='Katie'),
Row(age_value=22, name='Michale'),
Row(age_value=23, name='Simo'),
Row(age_value=None, name='BOb'),
Row(age_value=20, name='Liux'),
Row(age_value=27, name='UKD'),
Row(age_value=21, name='LLTT'),
Row(age_value=23, name='Simo'),
Row(age_value=45, name='Zhangff'),
Row(age_value=99, name='Wang'),
Row(age_value=12, name='LiTT'),
Row(age_value=32, name='LiuBin'),
Row(age_value=22, name='Yand')]
df.select(df.age.alias('age_value'), df.name).show()
+---------+-------+
|age_value| name|
+---------+-------+
| 19| Katie|
| 22|Michale|
| 23| Simo|
| null| BOb|
| 20| Liux|
| 27| UKD|
| 21| LLTT|
| 23| Simo|
| 45|Zhangff|
| 99| Wang|
| 12| LiTT|
| 32| LiuBin|
| 22| Yand|
+---------+-------+
isnull()
from pyspark.sql.functions import isnull
df.filter(isnull(df.age)).show()
+---+----+----+--------+
| id|name| age|eyecolor|
+---+----+----+--------+
|351| BOb|null| green|
+---+----+----+--------+
df.filter(isnull('age')).count()
1
# Count the null values in each row, keyed by id
df.rdd.map(lambda row: (row['id'], sum(c is None for c in row))).collect()
[(123, 0),
(234, 0),
(345, 0),
(351, 1),
(459, 0),
(555, 1),
(666, 0),
(345, 0),
(236, 0),
(125, 0),
(199, 0),
(454, 0),
(378, 0)]
# Percentage of missing (null) values in each column
import pyspark.sql.functions as fn
df.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing') for c in df.columns]).show()
+----------+------------+-------------------+-------------------+
|id_missing|name_missing| age_missing| eyecolor_missing|
+----------+------------+-------------------+-------------------+
| 0.0| 0.0|0.07692307692307687|0.07692307692307687|
+----------+------------+-------------------+-------------------+
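For absolute counts rather than percentages, one option is to cast the isnull() flag to an integer and sum it; a sketch with hypothetical output column names:
# Count the nulls per column by summing the boolean isnull() flag
df.select([fn.sum(fn.isnull(c).cast('int')).alias(c + '_nulls') for c in df.columns]).show()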
# collect() returns a Python list; each element is a Row object
rows = df.collect()
rows
[Row(id=123, name='Katie', age=19, eyecolor='brown'),
Row(id=234, name='Michale', age=22, eyecolor='green'),
Row(id=345, name='Simo', age=23, eyecolor='red'),
Row(id=351, name='BOb', age=None, eyecolor='green'),
Row(id=459, name='Liux', age=20, eyecolor='blue'),
Row(id=555, name='UKD', age=27, eyecolor=None),
Row(id=666, name='LLTT', age=21, eyecolor='black'),
Row(id=345, name='Simo', age=23, eyecolor='red'),
Row(id=236, name='Zhangff', age=45, eyecolor='blue'),
Row(id=125, name='Wang', age=99, eyecolor='white'),
Row(id=199, name='LiTT', age=12, eyecolor='blue'),
Row(id=454, name='LiuBin', age=32, eyecolor='pink'),
Row(id=378, name='Yand', age=22, eyecolor='black')]
describe()
df.describe().show()
+-------+------------------+-------+------------------+--------+
|summary| id| name| age|eyecolor|
+-------+------------------+-------+------------------+--------+
| count| 13| 13| 12| 12|
| mean|343.84615384615387| null|30.416666666666668| null|
| stddev| 162.3596040449749| null|23.059048049945535| null|
| min| 123| BOb| 12| black|
| max| 666|Zhangff| 99| white|
+-------+------------------+-------+------------------+--------+
df.printSchema()
root
|-- id: long (nullable = true)
|-- name: string (nullable = true)
|-- age: long (nullable = true)
|-- eyecolor: string (nullable = true)
distinct()
df.distinct().show()
+---+-------+----+--------+
| id| name| age|eyecolor|
+---+-------+----+--------+
|454| LiuBin| 32| pink|
|123| Katie| 19| brown|
|199| LiTT| 12| blue|
|345| Simo| 23| red|
|378| Yand| 22| black|
|234|Michale| 22| green|
|666| LLTT| 21| black|
|459| Liux| 20| blue|
|125| Wang| 99| white|
|236|Zhangff| 45| blue|
|351| BOb|null| green|
|555| UKD| 27| null|
+---+-------+----+--------+
columns
df.columns
['id', 'name', 'age', 'eyecolor']
df.name
Column<b'name'>
df['name']
Column<b'name'>
df.select('name')
DataFrame[name: string]
df.select(df.name, df.age + 1).show()
+-------+---------+
| name|(age + 1)|
+-------+---------+
| Katie| 20|
|Michale| 23|
| Simo| 24|
| BOb| null|
| Liux| 21|
| UKD| 28|
| LLTT| 22|
| Simo| 24|
|Zhangff| 46|
| Wang| 100|
| LiTT| 13|
| LiuBin| 33|
| Yand| 23|
+-------+---------+
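selectExpr() accepts SQL expression strings and is a shorthand for the same computation; a minimal sketch:
# Equivalent to the select above, with an inline column alias
df.selectExpr('name', 'age + 1 as age_plus_1').show()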
where()
df.where('age = 19').show()
+---+-----+---+--------+
| id| name|age|eyecolor|
+---+-----+---+--------+
|123|Katie| 19| brown|
+---+-----+---+--------+
filter()
df.select(df.name, df.eyecolor).filter("eyecolor like 'b%'").show()
+-------+--------+
| name|eyecolor|
+-------+--------+
| Katie| brown|
| Liux| blue|
| LLTT| black|
|Zhangff| blue|
| LiTT| blue|
| Yand| black|
+-------+--------+
orderBy()
df.orderBy(df.age.desc()).show()
+---+-------+----+--------+
| id| name| age|eyecolor|
+---+-------+----+--------+
|125| Wang| 99| white|
|236|Zhangff| 45| blue|
|454| LiuBin| 32| pink|
|555| UKD| 27| null|
|345| Simo| 23| red|
|345| Simo| 23| red|
|234|Michale| 22| green|
|378| Yand| 22| black|
|666| LLTT| 21| black|
|459| Liux| 20| blue|
|123| Katie| 19| brown|
|199| LiTT| 12| blue|
|351| BOb|null| green|
+---+-------+----+--------+
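orderBy() also accepts several sort keys with per-column direction; a minimal sketch:
# Sort by age descending, then break ties by id ascending
df.orderBy(df.age.desc(), df.id.asc()).show()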
sample()
df.sample(withReplacement=False, fraction=0.2, seed=10).show()
+---+------+----+--------+
| id| name| age|eyecolor|
+---+------+----+--------+
|351| BOb|null| green|
|459| Liux| 20| blue|
|666| LLTT| 21| black|
|345| Simo| 23| red|
|454|LiuBin| 32| pink|
+---+------+----+--------+
# With the same seed, two samplings return identical results
df.sample(withReplacement=False, fraction=0.2, seed=10).show()
+---+------+----+--------+
| id| name| age|eyecolor|
+---+------+----+--------+
|351| BOb|null| green|
|459| Liux| 20| blue|
|666| LLTT| 21| black|
|345| Simo| 23| red|
|454|LiuBin| 32| pink|
+---+------+----+--------+
df.sample(withReplacement=False, fraction=0.2, seed=3).show()
+---+------+---+--------+
| id| name|age|eyecolor|
+---+------+---+--------+
|123| Katie| 19| brown|
|345| Simo| 23| red|
|666| LLTT| 21| black|
|454|LiuBin| 32| pink|
|378| Yand| 22| black|
+---+------+---+--------+
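A related method is randomSplit(), which partitions the rows into disjoint random subsets (commonly train/test splits); a minimal sketch:
# Weights are normalized; split sizes are approximate, not exactly 80/20
train, test = df.randomSplit([0.8, 0.2], seed=10)
train.count(), test.count()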
when()
import pyspark.sql.functions as fn
df.select(df.name, fn.when(df.age > 25, 100).when(df.age < 20, 0).otherwise(50)).show()
+-------+----------------------------------------------------------------+
| name|CASE WHEN (age > 25) THEN 100 WHEN (age < 20) THEN 0 ELSE 50 END|
+-------+----------------------------------------------------------------+
| Katie| 0|
|Michale| 50|
| Simo| 50|
| BOb| 50|
| Liux| 50|
| UKD| 100|
| LLTT| 50|
| Simo| 50|
|Zhangff| 100|
| Wang| 100|
| LiTT| 0|
| LiuBin| 100|
| Yand| 50|
+-------+----------------------------------------------------------------+
df.select(df.name, fn.when(df.age > 25, 100).when(df.age < 20, 0).otherwise(50).alias('age')).show()
+-------+---+
| name|age|
+-------+---+
| Katie| 0|
|Michale| 50|
| Simo| 50|
| BOb| 50|
| Liux| 50|
| UKD|100|
| LLTT| 50|
| Simo| 50|
|Zhangff|100|
| Wang|100|
| LiTT| 0|
| LiuBin|100|
| Yand| 50|
+-------+---+
between()
# Returns true/false per row (and null where age is null)
df.select(df.name, df.age.between(20, 25)).show()
+-------+-----------------------------+
| name|((age >= 20) AND (age <= 25))|
+-------+-----------------------------+
| Katie| false|
|Michale| true|
| Simo| true|
| BOb| null|
| Liux| true|
| UKD| false|
| LLTT| true|
| Simo| true|
|Zhangff| false|
| Wang| false|
| LiTT| false|
| LiuBin| false|
| Yand| true|
+-------+-----------------------------+
df.select(df.name, df.age.between(20, 25), df.eyecolor).filter("eyecolor like 'b%'").show()
+-------+-----------------------------+--------+
| name|((age >= 20) AND (age <= 25))|eyecolor|
+-------+-----------------------------+--------+
| Katie| false| brown|
| Liux| true| blue|
| LLTT| true| black|
|Zhangff| false| blue|
| LiTT| false| blue|
| Yand| true| black|
+-------+-----------------------------+--------+
2 Modifying
# Create a DataFrame
from pyspark.sql import Row
row = Row("id", 'name')
ID = [1, 2, 3]
NAME = ['Bob', 'Lixu', 'Yangfei']
DF = sc.parallelize([row(ID[i], NAME[i]) for i in range(len(ID))]).toDF()
DF.show()
+---+-------+
| id| name|
+---+-------+
| 1| Bob|
| 2| Lixu|
| 3|Yangfei|
+---+-------+
withColumn()
# Returns a new DataFrame by adding a column or replacing an existing column of the same name
DF.withColumn("age", DF.id + 22).show()
+---+-------+---+
| id| name|age|
+---+-------+---+
| 1| Bob| 23|
| 2| Lixu| 24|
| 3|Yangfei| 25|
+---+-------+---+
DF_2 = DF.withColumn('age_2', fn.lit(0))
# lit([0, 0, 0]) would not work: a Python list cannot be attached to a DataFrame directly; convert it into a new DataFrame and join, as the next example shows
DF_2.show()
+---+-------+-----+
| id| name|age_2|
+---+-------+-----+
| 1| Bob| 0|
| 2| Lixu| 0|
| 3|Yangfei| 0|
+---+-------+-----+
from pyspark.sql import Row
consum = [423, 534, 212]
row = Row('pid', 'consum')
new_DF = sc.parallelize([row(i+1, consum[i]) for i in range(len(consum))]).toDF()
DF_3 = DF.join(new_DF, DF.id == new_DF.pid)
DF_3.show()
+---+-------+---+------+
| id| name|pid|consum|
+---+-------+---+------+
| 1| Bob| 1| 423|
| 3|Yangfei| 3| 212|
| 2| Lixu| 2| 534|
+---+-------+---+------+
DF_3.withColumn('per', DF_3.consum / DF_3.id).show()
+---+-------+---+------+-----------------+
| id| name|pid|consum| per|
+---+-------+---+------+-----------------+
| 1| Bob| 1| 423| 423.0|
| 3|Yangfei| 3| 212|70.66666666666667|
| 2| Lixu| 2| 534| 267.0|
+---+-------+---+------+-----------------+
DF_3.withColumn('pid_222', DF_3['pid'].cast('Float')).show()
+---+-------+---+------+-------+
| id| name|pid|consum|pid_222|
+---+-------+---+------+-------+
| 1| Bob| 1| 423| 1.0|
| 3|Yangfei| 3| 212| 3.0|
| 2| Lixu| 2| 534| 2.0|
+---+-------+---+------+-------+
withColumnRenamed()
DF_3.withColumnRenamed('pid', 'pppppppppppid').show()
+---+-------+-------------+------+
| id| name|pppppppppppid|consum|
+---+-------+-------------+------+
| 1| Bob| 1| 423|
| 3|Yangfei| 3| 212|
| 2| Lixu| 2| 534|
+---+-------+-------------+------+
filter() & where()
DF_3.filter(DF_3['id'] > 2).show()
+---+-------+---+------+
| id| name|pid|consum|
+---+-------+---+------+
| 3|Yangfei| 3| 212|
+---+-------+---+------+
DF_3.where(DF_3['id'] > 2).show()
+---+-------+---+------+
| id| name|pid|consum|
+---+-------+---+------+
| 3|Yangfei| 3| 212|
+---+-------+---+------+
isnull() & isnan()
AAA = sc.parallelize([(123, 'Katie', 19, 'brown'),
                      (234, 'Michale', 22, None),
                      (345, 'Simo', 23, 'red'),
                      (351, 'BOb', None, 'green'),
                      (459, 'Liux', 20, 'blue'),
                      (555, 'UKD', 27, None),
                      (666, 'LLTT', 34, 'black'),
                      (345, 'Simo', 23, None),
                      (236, 'Zhangff', 45, 'blue'),
                      (125, 'Wang', 99, 'white'),
                      (199, 'LiTT', 12, 'blue'),
                      (454, 'LiuBin', 32, 'pink'),
                      (378, 'Yand', 22, None)])
BBB = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyecolor", StringType(), True)
])
df1 = spark.createDataFrame(AAA, BBB)
df1.createOrReplaceTempView('df1')
df1.show()
+---+-------+----+--------+
| id| name| age|eyecolor|
+---+-------+----+--------+
|123| Katie| 19| brown|
|234|Michale| 22| null|
|345| Simo| 23| red|
|351| BOb|null| green|
|459| Liux| 20| blue|
|555| UKD| 27| null|
|666| LLTT| 34| black|
|345| Simo| 23| null|
|236|Zhangff| 45| blue|
|125| Wang| 99| white|
|199| LiTT| 12| blue|
|454| LiuBin| 32| pink|
|378| Yand| 22| null|
+---+-------+----+--------+
df1.filter(fn.isnan('eyecolor')).show()  # isnan() matches NaN (not-a-number) values; nulls are not NaN, so nothing matches
+---+----+---+--------+
| id|name|age|eyecolor|
+---+----+---+--------+
+---+----+---+--------+
df1.filter(fn.isnull('eyecolor')).show()  # isnull() selects rows where the value is null
+---+-------+---+--------+
| id| name|age|eyecolor|
+---+-------+---+--------+
|234|Michale| 22| null|
|555| UKD| 27| null|
|345| Simo| 23| null|
|378| Yand| 22| null|
+---+-------+---+--------+
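The same filters can be written with the Column methods isNull() and isNotNull(), which avoids importing pyspark.sql.functions; a minimal sketch:
df1.filter(df1.eyecolor.isNull()).show()      # same rows as fn.isnull('eyecolor')
df1.filter(df1.eyecolor.isNotNull()).count()  # rows with a non-null eyecolor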
union()
# Append the rows of one DataFrame to another (like SQL UNION ALL: duplicates are kept)
row1 = Row("id", 'name')
ID1 = [1, 2, 3]
NAME1 = ['Bob', 'Lixu', 'Yangfei']
DF1 = sc.parallelize([row1(ID1[i], NAME1[i]) for i in range(len(ID1))]).toDF()
row2 = Row("id", 'name')
ID2 = [1, 2, 3]
NAME2 = ['WWWWWWW', 'TTTTTTTT', 'KKKKKKK']
DF2 = sc.parallelize([row2(ID2[i], NAME2[i]) for i in range(len(ID2))]).toDF()
DF1.union(DF2).show()
+---+--------+
| id| name|
+---+--------+
| 1| Bob|
| 2| Lixu|
| 3| Yangfei|
| 1| WWWWWWW|
| 2|TTTTTTTT|
| 3| KKKKKKK|
+---+--------+
join()
DF1.join(DF2, DF1.id == DF2.id).show()
+---+-------+---+--------+
| id| name| id| name|
+---+-------+---+--------+
| 1| Bob| 1| WWWWWWW|
| 3|Yangfei| 3| KKKKKKK|
| 2| Lixu| 2|TTTTTTTT|
+---+-------+---+--------+
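Passing the join key as a column name instead of an equality expression keeps a single id column in the result; a minimal sketch:
# One shared 'id' column; the two 'name' columns are still both present
DF1.join(DF2, 'id').show()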
subtract()
D1 = spark.createDataFrame(((1, "asf"),(2, "2143"),(3, "rfds"))).toDF("label", "sentence")
D1.show()
D2 = spark.createDataFrame(((1, "asf"),(2, "2143"),(4, "f8934y") )).toDF("label", "sentence")
D2.show()
+-----+--------+
|label|sentence|
+-----+--------+
| 1| asf|
| 2| 2143|
| 3| rfds|
+-----+--------+
+-----+--------+
|label|sentence|
+-----+--------+
| 1| asf|
| 2| 2143|
| 4| f8934y|
+-----+--------+
# Remove from D2 the rows that also appear in D1
D2.select('sentence').subtract(D1.select('sentence')).show()
+--------+
|sentence|
+--------+
| f8934y|
+--------+
intersect()
# Intersection: the rows D2 has in common with D1
D2.select('sentence').intersect(D1.select('sentence')).show()
+--------+
|sentence|
+--------+
| asf|
| 2143|
+--------+
union() & distinct()
D2.select('sentence').union(D1.select('sentence')).distinct().show()
+--------+
|sentence|
+--------+
| rfds|
| asf|
| 2143|
| f8934y|
+--------+
crosstab()
df.crosstab('age', 'eyecolor').show()
+------------+-----+----+-----+-----+----+----+---+-----+
|age_eyecolor|black|blue|brown|green|null|pink|red|white|
+------------+-----+----+-----+-----+----+----+---+-----+
| null| 0| 0| 0| 1| 0| 0| 0| 0|
| 20| 0| 1| 0| 0| 0| 0| 0| 0|
| 21| 1| 0| 0| 0| 0| 0| 0| 0|
| 32| 0| 0| 0| 0| 0| 1| 0| 0|
| 45| 0| 1| 0| 0| 0| 0| 0| 0|
| 22| 1| 0| 0| 1| 0| 0| 0| 0|
| 27| 0| 0| 0| 0| 1| 0| 0| 0|
| 12| 0| 1| 0| 0| 0| 0| 0| 0|
| 99| 0| 0| 0| 0| 0| 0| 0| 1|
| 23| 0| 0| 0| 0| 0| 0| 2| 0|
| 19| 0| 0| 1| 0| 0| 0| 0| 0|
+------------+-----+----+-----+-----+----+----+---+-----+
groupby()
df.groupby('Age').agg({'id' : 'max'}).show()
+----+-------+
| Age|max(id)|
+----+-------+
| 19| 123|
| 22| 378|
| 32| 454|
|null| 351|
| 27| 555|
| 12| 199|
| 21| 666|
| 23| 345|
| 20| 459|
| 99| 125|
| 45| 236|
+----+-------+
df.groupby('age').count().show()
+----+-----+
| age|count|
+----+-----+
| 19| 1|
| 22| 2|
| 32| 1|
|null| 1|
| 27| 1|
| 12| 1|
| 21| 1|
| 23| 2|
| 20| 1|
| 99| 1|
| 45| 1|
+----+-----+
df.groupBy('age').agg(fn.count('age'), fn.avg('age'), fn.min('age'), fn.max('age')).show()
+----+----------+--------+--------+--------+
| age|count(age)|avg(age)|min(age)|max(age)|
+----+----------+--------+--------+--------+
| 19| 1| 19.0| 19| 19|
| 22| 2| 22.0| 22| 22|
| 32| 1| 32.0| 32| 32|
|null| 0| null| null| null|
| 27| 1| 27.0| 27| 27|
| 12| 1| 12.0| 12| 12|
| 21| 1| 21.0| 21| 21|
| 23| 2| 23.0| 23| 23|
| 20| 1| 20.0| 20| 20|
| 99| 1| 99.0| 99| 99|
| 45| 1| 45.0| 45| 45|
+----+----------+--------+--------+--------+
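The generated aggregate column names can be renamed inline with alias(); a minimal sketch with hypothetical output names:
df.groupBy('age').agg(fn.count('age').alias('n'), fn.avg('age').alias('mean_age')).show()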
foreach()
# foreach() runs a function on every Row on the executors; here each row is appended to a local file
def pp(el):
    with open("./foreach.txt", 'a+') as f1:
        print(el, file=f1)

open('./foreach.txt', 'w').close()  # truncate the output file first
df.foreach(pp)
# Note: concurrent appends from several partitions can lose lines, as the output below shows
with open("./foreach.txt", 'r') as ff:
    print(ff.read())
Row(id=351, name='BOb', age=None, eyecolor='green')
Row(id=234, name='Michale', age=22, eyecolor='green')
Row(id=345, name='Simo', age=23, eyecolor='red')
Row(id=666, name='LLTT', age=21, eyecolor='black')
Row(id=345, name='Simo', age=23, eyecolor='red')
Row(id=236, name='Zhangff', age=45, eyecolor='blue')
Row(id=125, name='Wang', age=99, eyecolor='white')
Row(id=199, name='LiTT', age=12, eyecolor='blue')
Row(id=454, name='LiuBin', age=32, eyecolor='pink')
Row(id=378, name='Yand', age=22, eyecolor='black')
def pp(el):
    # foreachPartition() passes an iterator over all rows in a partition;
    # multiplying a Row (a tuple subclass) by 2 repeats its fields
    with open("./foreach.txt", 'a+') as f1:
        print([el_1 * 2 for el_1 in el], file=f1)

open('./foreach.txt', 'w').close()
df.foreachPartition(pp)
with open("./foreach.txt", 'r') as ff:
    print(ff.read())
[(666, 'LLTT', 21, 'black', 666, 'LLTT', 21, 'black'), (345, 'Simo', 23, 'red', 345, 'Simo', 23, 'red'), (236, 'Zhangff', 45, 'blue', 236, 'Zhangff', 45, 'blue')]
[(125, 'Wang', 99, 'white', 125, 'Wang', 99, 'white'), (199, 'LiTT', 12, 'blue', 199, 'LiTT', 12, 'blue'), (454, 'LiuBin', 32, 'pink', 454, 'LiuBin', 32, 'pink'), (378, 'Yand', 22, 'black', 378, 'Yand', 22, 'black')]
[(123, 'Katie', 19, 'brown', 123, 'Katie', 19, 'brown'), (234, 'Michale', 22, 'green', 234, 'Michale', 22, 'green'), (345, 'Simo', 23, 'red', 345, 'Simo', 23, 'red')]
[(351, 'BOb', None, 'green', 351, 'BOb', None, 'green'), (459, 'Liux', 20, 'blue', 459, 'Liux', 20, 'blue'), (555, 'UKD', 27, None, 555, 'UKD', 27, None)]
map()
df.select('id').rdd.map(lambda x:(x, 1)).collect()
[(Row(id=123), 1),
(Row(id=234), 1),
(Row(id=345), 1),
(Row(id=351), 1),
(Row(id=459), 1),
(Row(id=555), 1),
(Row(id=666), 1),
(Row(id=345), 1),
(Row(id=236), 1),
(Row(id=125), 1),
(Row(id=199), 1),
(Row(id=454), 1),
(Row(id=378), 1)]
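The mapped RDD can be turned back into a DataFrame with toDF(), supplying new column names; a sketch with a hypothetical 'flag' column:
df.select('id').rdd.map(lambda x: (x.id, 1)).toDF(['id', 'flag']).show()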
udf()
DD = spark.createDataFrame(((1, "asf", "5555"),(2, "2143", None),(4, "f8934y", "77"))).toDF("label", "sentence", "day")
DD.show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 2| 2143|null|
| 4| f8934y| 77|
+-----+--------+----+
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import time
import datetime
# Define a plain Python function: replace a null day with today's date
def today(day):
    if day is None:
        return datetime.datetime.fromtimestamp(int(time.time())).strftime('%Y-%m-%d')
    else:
        return day
# Wrap it as a UDF, declaring the return type as StringType
udfday = udf(today, StringType())
# Apply the UDF
DD.withColumn('day1', udfday(DD.day)).show()
+-----+--------+----+----------+
|label|sentence| day| day1|
+-----+--------+----+----------+
| 1| asf|5555| 5555|
| 2| 2143|null|2019-03-31|
| 4| f8934y| 77| 77|
+-----+--------+----+----------+
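udf can also be used as a decorator, bundling the return type with the function definition; an equivalent sketch (udfday2 is a hypothetical name):
@udf(StringType())
def udfday2(day):
    # Same logic as today(), with the return type declared inline
    if day is None:
        return datetime.datetime.fromtimestamp(int(time.time())).strftime('%Y-%m-%d')
    return day

DD.withColumn('day1', udfday2(DD.day)).show()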
drop()
DD.drop(DD.day).show()
+-----+--------+
|label|sentence|
+-----+--------+
| 1| asf|
| 2| 2143|
| 4| f8934y|
+-----+--------+
dropna()
DD.na.drop().show()  # drop rows that contain any null
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 4| f8934y| 77|
+-----+--------+----+
DD.dropna(subset=['sentence', 'day']).show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 4| f8934y| 77|
+-----+--------+----+
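dropna() also takes how= ('any' or 'all') and thresh= (the minimum number of non-null values a row needs to survive); a minimal sketch:
DD.dropna(how='all').show()  # drop only rows where every column is null
DD.dropna(thresh=3).show()   # keep rows with at least 3 non-null values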
fillna()
DD.fillna('50').show()  # the day column contains nulls and is string-typed, so fill with a string
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 2| 2143| 50|
| 4| f8934y| 77|
+-----+--------+----+
DD.fillna(50).show()  # filling a string column with an integer has no effect; the fill type must match the column type
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 2| 2143|null|
| 4| f8934y| 77|
+-----+--------+----+
DD.na.fill('50').show()  # na.fill() is equivalent to fillna()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 2| 2143| 50|
| 4| f8934y| 77|
+-----+--------+----+
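fillna() also accepts a dict mapping column names to fill values, which sidesteps the type-mismatch issue above; a minimal sketch:
DD.fillna({'day': '50'}).show()  # fill only the day column, with a matching string value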
distinct()
DDD = spark.createDataFrame(((1, "asf", "5555"),(1, "asf", "5555"),(1, "asf", "5555"),(1, "asf", "5555"),(2, "2143", None),(4, "f8934y", "77"))).toDF("label", "sentence", "day")
DDD.show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 1| asf|5555|
| 1| asf|5555|
| 1| asf|5555|
| 2| 2143|null|
| 4| f8934y| 77|
+-----+--------+----+
DDD.distinct().show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 4| f8934y| 77|
| 1| asf|5555|
| 2| 2143|null|
+-----+--------+----+
dropDuplicates()
DDD.select('label', 'sentence').dropDuplicates().show()
+-----+--------+
|label|sentence|
+-----+--------+
| 1| asf|
| 2| 2143|
| 4| f8934y|
+-----+--------+
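dropDuplicates() optionally takes a subset of columns to compare while keeping every column in the output; a minimal sketch:
DDD.dropDuplicates(['label']).show()  # one row per distinct label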
3 Format conversion
toPandas()
pandas_df = DDD.toPandas()
pandas_df
   label sentence   day
0      1      asf  5555
1      1      asf  5555
2      1      asf  5555
3      1      asf  5555
4      2     2143  None
5      4   f8934y    77
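For larger tables the conversion can be accelerated with Apache Arrow; a sketch assuming Spark 2.3+, where this config key exists (it was renamed in Spark 3.0):
spark.conf.set('spark.sql.execution.arrow.enabled', 'true')  # Spark 2.3/2.4 config key
pandas_df = DDD.toPandas()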
createDataFrame()
spark_df = spark.createDataFrame(pandas_df)
spark_df.show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 1| asf|5555|
| 1| asf|5555|
| 1| asf|5555|
| 2| 2143|null|
| 4| f8934y| 77|
+-----+--------+----+
rdd
rdd_df = DDD.rdd
rdd_df.collect()
[Row(label=1, sentence='asf', day='5555'),
Row(label=1, sentence='asf', day='5555'),
Row(label=1, sentence='asf', day='5555'),
Row(label=1, sentence='asf', day='5555'),
Row(label=2, sentence='2143', day=None),
Row(label=4, sentence='f8934y', day='77')]
df = rdd_df.toDF()
df.show()
+-----+--------+----+
|label|sentence| day|
+-----+--------+----+
| 1| asf|5555|
| 1| asf|5555|
| 1| asf|5555|
| 1| asf|5555|
| 2| 2143|null|
| 4| f8934y| 77|
+-----+--------+----+