Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/__ / .__/\_,_/_/ /_/\_\ version 2.1.0
/_/
>>> from pyspark.sql import HiveContext
>>> HiveCtx=HiveContext(sc)
>>> sale=HiveCtx.sql("select * from yxpt.pi_cust_item_month")
>>> sale.select("cust_id","item_id").show(5)
+------------+--------+
| cust_id| item_id|
+------------+--------+
|110113104354|33010104|
|110108204119|22240103|
|110105103680|31010403|
|110102102613|35260104|
|110229100367|90110115|
+------------+--------+
only showing top 5 rows
>>> test_sale=HiveCtx.sql("select cust_id,item_id,date1,qty_sold from yxpt.pi_cust_item_month limit 200")
>>> test_sale.show(5)
+------------+--------+------+--------+
| cust_id| item_id| date1|qty_sold|
+------------+--------+------+--------+
|110117100211|35510201|201605| 1.00|
|110106202805|43010104|201605| 5.00|
|110114205326|53020103|201605| 4.00|
|110105207751|53020411|201605| 60.00|
|110101204183|34030219|201605| 2.00|
+------------+--------+------+--------+
>>> test_sale.filter(test_sale['qty_sold']>50).show()
+------------+--------+------+--------+
| cust_id| item_id| date1|qty_sold|
+------------+--------+------+--------+
|110105108630|53020411|201608| 154.00|
|110107100625|53020411|201608| 445.00|
|110117100192|32010101|201608| 75.00|
+------------+--------+------+--------+
>>> sale.groupBy('cust_id').count().show(5)#查詢每個零售戶有多少條記錄
+------------+-----+
| cust_id|count|
+------------+-----+
|110229100172| 1536|
|110101203818| 3487|
|110108108378| 4586|
|110115203451| 1513|
|110105108113| 5427|
+------------+-----+
only showing top 5 rows
>>>
>>> #DataFrame對象沒有map屬性,需要先轉化爲彈性分佈式數據集
>>> teenagers.rdd.map(lambda p:"Name:" + p.name).collect()
[u'Name:Jack', u'Name:Jessie']
>>> teenNames=teenagers.rdd.map(lambda p:"Name:" + p.name)
>>> for item in teenNames.collect():
... print item
Name:Jack
Name:Jessie
>>> #之前調用了Row模塊來構建新的DataFrame,這裏使用自定義的方法構建DataFrame
>>> schemaString="name age"
>>> from pyspark.sql.types import *
>>> fields=[StructField(field_name,StringType(),True) for field_name in schemaString.split()]
>>> schema=StructType(fields)
>>> lines.collect()
[('Leon', 43), ('Rachell', 74), ('Jack', 23), ('Tom', 0), ('Jessie', 31)]
>>> spark.createDataFrame(lines,schema).show() #按指定格式構建DataFrame
+-------+---+
| name|age|
+-------+---+
| Leon| 43|
|Rachell| 74|
| Jack| 23|
| Tom| 0|
| Jessie| 31|
+-------+---+
>>> #df=spark.read.load("home/remoteuser/superadmin/zhaoyangguanlian.csv",format="csv",sep=",",inferSchema="true",header="true")
>>> lines=sc.parallelize({('Tom',0),('Jack',23),('Leon',43),('Jessie',31),('Rachell',89)})
>>> lines.count()
5
>>> people=lines.map(lambda p: Row(name=p[0],age=int(p[1])))
>>> schemaPeople=spark.createDataFrame(people)
>>> schemaPeople
DataFrame[age: bigint, name: string]
>>> schemaPeople.show()
+---+-------+
|age| name|
+---+-------+
| 43| Leon|
| 0| Tom|
| 23| Jack|
| 89|Rachell|
| 31| Jessie|
+---+-------+
>>> schemaPeople.createOrReplaceTempView('people')
>>> teenagers=spark.sql('select name from people where age>=15 and age<=35')
>>> teenagers.show()
+------+
| name|
+------+
| Jack|
|Jessie|
+------+
>>> squaresDF=spark.createDataFrame(sc.parallelize(range(1,6)).map(lambda i: Row(single=i,double=i**2)))
>>> squaresDF.show()
+------+------+
|double|single|
+------+------+
| 1| 1|
| 4| 2|
| 9| 3|
| 16| 4|
| 25| 5|
+------+------+
>>> squaresDF.write.parquet("hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table/key=1") #寫入分區
>>> cubesDF=spark.createDataFrame(sc.parallelize(range(6,11)).map(lambda i:Row(single=i,triple=i**3)))
>>> cubesDF.show()
+------+------+
|single|triple|
+------+------+
| 6| 216|
| 7| 343|
| 8| 512|
| 9| 729|
| 10| 1000|
+------+------+
>>> cubesDF.write.parquet("hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table/key=2") #存入分區
>>> mergedDF=spark.read.option("mergeSchema","true").parquet("hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table")
>>> mergedDF.printSchema()
root
|-- double: long (nullable = true)
|-- single: long (nullable = true)
|-- triple: long (nullable = true)
|-- key: integer (nullable = true)
>>> mergedDF.show()
+------+------+------+---+
|double|single|triple|key|
+------+------+------+---+
| null| 9| 729| 2|
| null| 10| 1000| 2|
| 16| 4| null| 1|
| 25| 5| null| 1|
| null| 6| 216| 2|
| null| 7| 343| 2|
| null| 8| 512| 2|
| 1| 1| null| 1|
| 4| 2| null| 1|
| 9| 3| null| 1|
+------+------+------+---+
>>> from os.path import expanduser,join,abspath
>>> abspath('spark-warehouse')
'/usr/inspur/2.5.0.0-1245/spark2/bin/spark-warehouse'
>>> warehouse_location=abspath('spark-warehouse')
>>> spark=SparkSession.builder.appName("Python Spark SQL Hive integration example").config("spark.sql.warehouse.dir",warehouse_location).enableHiveSupport().getOrCreate()
>>> spark.sql("CREATE TABLE IF NOT EXISTS src (key INT,value INT)")
DataFrame[]
>>> spark.sql("SELECT * FROM src")
DataFrame[key: int, value: int]
>>> #spark.sql("LOAD DATA LOCAL INPATH 'hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table/key=1' INTO TABLE src")
>>> #不能將/hive/warehouse裏面的squaresDF或cubesDF導入/inspur/2.5.0.0-1245/spark2/bin/spark-warehouse新建的src表
>>>
>>>
>>> schemaPeople.show()
+---+-------+
|age| name|
+---+-------+
| 43| Leon|
| 0| Tom|
| 23| Jack|
| 89|Rachell|
| 31| Jessie|
+---+-------+
>>> schemaPeople.select(schemaPeople["age"]+1).show()
+---------+
|(age + 1)|
+---------+
| 44|
| 1|
| 24|
| 90|
| 32|
+---------+
>>> schemaPeople.select(schemaPeople["age"]/3).show()
+------------------+
| (age / 3)|
+------------------+
|14.333333333333334|
| 0.0|
| 7.666666666666667|
|29.666666666666668|
|10.333333333333334|
+------------------+
>>> schemaPeople.select("age").show()
+---+
|age|
+---+
| 43|
| 0|
| 23|
| 89|
| 31|
+---+
>>> schemaPeople2=schemaPeople.select(schemaPeople["age"]+1,schemaPeople["name"])
>>> schemaPeople2.show()
+---------+-------+
|(age + 1)| name|
+---------+-------+
| 44| Leon|
| 1| Tom|
| 24| Jack|
| 90|Rachell|
| 32| Jessie|
+---------+-------+
>>> data.show(5)
+------------+-------------+-------------+
| cust_id|sum(qty_sold)|sum(amt_sold)|
+------------+-------------+-------------+
|110106106866| 499.00| 54176.67|
|110102105975| 1019.00| 113162.04|
|110105208183| 4098.00| 526847.96|
|110229100172| 402.00| 28817.84|
|110114104030| 1036.00| 89332.29|
+------------+-------------+-------------+
only showing top 5 rows
>>> sale.show(5)
+------------+--------+------+--------+--------+
| cust_id| item_id| date1|qty_sold|amt_sold|
+------------+--------+------+--------+--------+
|110105103343|31010702|201702| 2.00| 150.52|
|110105109235|22240116|201702| 15.00| 1828.50|
|110115205453|11018817|201702| 1.00| 93.28|
|110108109775|33010115|201702| 1.00| 190.80|
|110108203778|53010236|201702| 0.00| 0.00|
+------------+--------+------+--------+--------+
only showing top 5 rows
>>> sale.groupBy("cust_id").pivot("date1").agg(func.sum("qty_sold")).show(5) #分組聚合
+------------+-------+-------+-------+
| cust_id| 201701| 201702| 201703|
+------------+-------+-------+-------+
|110115104854| 605.00| 354.00| 353.00|
|110229101511| 125.00| 139.00| 192.00|
|110228202045| 48.00| null| 14.00|
|110115205562|1191.00| 855.00| 774.00|
|110106207490| 771.00|1169.00|1389.00|
+------------+-------+-------+-------+
>>> from pyspark.ml.feature import VectorAssembler #特徵向量化
>>> data_sample=sc.parallelize([(0.0,0.0,0.0),(0.1,0.1,0.1),(0.2,0.2,0.2),(9.0,9.0,9.0),(9.1,9.1,9.1),(9.2,9.2,9.2)] )
>>> df=sqlContext.createDataFrame(data_sample,["features1","features2","features3"]) #建表方法一
>>> df.show()
+---------+---------+---------+
|features1|features2|features3|
+---------+---------+---------+
| 0.0| 0.0| 0.0|
| 0.1| 0.1| 0.1|
| 0.2| 0.2| 0.2|
| 9.0| 9.0| 9.0|
| 9.1| 9.1| 9.1|
| 9.2| 9.2| 9.2|
+---------+---------+---------+
>>> data_sample=[[0.0,0.0,0.0],[0.1,0.1,0.1],[0.2,0.2,0.2],[9.0,9.0,9.0],[9.1,9.1,9.1],[9.2,9.2,9.2]]
>>> data_sample
[[0.0, 0.0, 0.0], [0.1, 0.1, 0.1], [0.2, 0.2, 0.2], [9.0, 9.0, 9.0], [9.1, 9.1, 9.1], [9.2, 9.2, 9.2]]
>>> df=spark.createDataFrame(data_sample,["feature1","feature2","feature3"]) #建表方法二
>>> df.show()
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
| 0.0| 0.0| 0.0|
| 0.1| 0.1| 0.1|
| 0.2| 0.2| 0.2|
| 9.0| 9.0| 9.0|
| 9.1| 9.1| 9.1|
| 9.2| 9.2| 9.2|
+--------+--------+--------+
>>> df=spark.createDataFrame(data_sample).toDF("feature1","feature2",'feature3') #建表方法三
>>> df.show()
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
| 0.0| 0.0| 0.0|
| 0.1| 0.1| 0.1|
| 0.2| 0.2| 0.2|
| 9.0| 9.0| 9.0|
| 9.1| 9.1| 9.1|
| 9.2| 9.2| 9.2|
+--------+--------+--------+
>>> data_sample=[[0.3,0.3,0.3],[0.1,0.1,0.1],[0.2,0.2,0.2],[9.0,9.0,9.0],[9.1,9.1,9.1],[9.2,9.2,9.2]]
>>> df=spark.createDataFrame(data_sample).toDF("feature1","feature2",'feature3')
>>> df.show()
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
| 0.3| 0.3| 0.3|
| 0.1| 0.1| 0.1|
| 0.2| 0.2| 0.2|
| 9.0| 9.0| 9.0|
| 9.1| 9.1| 9.1|
| 9.2| 9.2| 9.2|
+--------+--------+--------+
>>> assembler=VectorAssembler(inputCols=["feature1","feature2","feature3"],outputCol="features")
>>> assembler.transform(df).show()
+--------+--------+--------+-------------+
|feature1|feature2|feature3| features|
+--------+--------+--------+-------------+
| 0.3| 0.3| 0.3|[0.3,0.3,0.3]|
| 0.1| 0.1| 0.1|[0.1,0.1,0.1]|
| 0.2| 0.2| 0.2|[0.2,0.2,0.2]|
| 9.0| 9.0| 9.0|[9.0,9.0,9.0]|
| 9.1| 9.1| 9.1|[9.1,9.1,9.1]|
| 9.2| 9.2| 9.2|[9.2,9.2,9.2]|
+--------+--------+--------+-------------+
>>> df_vector=assembler.transform(df)
>>> df_vector.select(df_vector["features"]).show()
+-------------+
| features|
+-------------+
|[0.3,0.3,0.3]|
|[0.1,0.1,0.1]|
|[0.2,0.2,0.2]|
|[9.0,9.0,9.0]|
|[9.1,9.1,9.1]|
|[9.2,9.2,9.2]|
+-------------+
>>> kmeans=KMeans(k=2,seed=1)
>>> model=kmeans.fit(df_vector.select(df_vector["features"]))
18/03/27 20:27:20 WARN KMeans: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
18/03/27 20:27:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
18/03/27 20:27:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
18/03/27 20:27:21 WARN KMeans: The input data was not directly cached, which may hurt performance if its parent RDDs are also uncached.
>>> centers=model.clusterCenters()
>>> len(centers)
2
>>> centers
[array([ 9.1, 9.1, 9.1]), array([ 0.2, 0.2, 0.2])]
>>> transformed=model.transform(df_vector.select(df_vector["features"]))
>>> transformed.show()
+-------------+----------+
| features|prediction|
+-------------+----------+
|[0.3,0.3,0.3]| 1|
|[0.1,0.1,0.1]| 1|
|[0.2,0.2,0.2]| 1|
|[9.0,9.0,9.0]| 0|
|[9.1,9.1,9.1]| 0|
|[9.2,9.2,9.2]| 0|
+-------------+----------+
>>> from pyspark.ml.feature import MinMaxScaler
>>> df_vector.show()
+--------+--------+--------+-------------+
|feature1|feature2|feature3| features|
+--------+--------+--------+-------------+
| 0.3| 0.3| 0.3|[0.3,0.3,0.3]|
| 0.1| 0.1| 0.1|[0.1,0.1,0.1]|
| 0.2| 0.2| 0.2|[0.2,0.2,0.2]|
| 9.0| 9.0| 9.0|[9.0,9.0,9.0]|
| 9.1| 9.1| 9.1|[9.1,9.1,9.1]|
| 9.2| 9.2| 9.2|[9.2,9.2,9.2]|
+--------+--------+--------+-------------+
>>> mmScaler=MinMaxScaler(inputCol="features",outputCol="minmaxscaler") #歸一化,最小的變爲0,最大的變爲1,其餘值按一定比例分佈在01區間
>>> mm_model=mmScaler.fit(df_vector.select(df_vector["features"]))
>>> mm_model.transform(df_vector.select(df_vector["features"])).show()
+-------------+--------------------+
| features| minmaxscaler|
+-------------+--------------------+
|[0.3,0.3,0.3]|[0.02197802197802...|
|[0.1,0.1,0.1]| [0.0,0.0,0.0]|
|[0.2,0.2,0.2]|[0.01098901098901...|
|[9.0,9.0,9.0]|[0.97802197802197...|
|[9.1,9.1,9.1]|[0.98901098901098...|
|[9.2,9.2,9.2]| [1.0,1.0,1.0]|
+-------------+--------------------+
>>> mm_model.originalMin
DenseVector([0.1, 0.1, 0.1])
>>> from pyspark.sql import SparkSession
>>> spark=SparkSession.builder.appName("Python Spark SQL basic example").config("spark.some.config.option","some-value").getOrCreate()
>>> from pyspark.sql import Row
>>> import pyspark.sql.functions as func
>>> sc=spark.sparkContext
>>> from pyspark.sql.types import *
>>> from os.path import expanduser,join,abspath
>>> sale=spark.sql("select cust_id,item_id,date1,qty_sold,amt_sold from yxpt.pi_cust_item_month where date1 between '201701' and '201703'")
>>> sale.show(5)
+------------+--------+------+--------+--------+
| cust_id| item_id| date1|qty_sold|amt_sold|
+------------+--------+------+--------+--------+
|110112205155|33010107|201701| 2.00| 286.20|
|110105103824|31010403|201701| 3.00| 1749.00|
|110102100215|32010108|201701| 12.00|10176.00|
|110108210202|43020107|201701| 7.00| 2151.80|
|110117100471|37020308|201701| 1.00| 61.50|
+------------+--------+------+--------+--------+
only showing top 5 rows
>>> data=sale.groupBy("cust_id").agg(func.sum("qty_sold"),func.sum("amt_sold"))
>>> data.show(5)
+------------+-------------+-------------+
| cust_id|sum(qty_sold)|sum(amt_sold)|
+------------+-------------+-------------+
|110106106866| 499.00| 54176.67|
|110102105975| 1019.00| 113162.04|
|110105208183| 4098.00| 526847.96|
|110229100172| 402.00| 28817.84|
|110114104030| 1036.00| 89332.29|
+------------+-------------+-------------+
only showing top 5 rows
>>> from pyspark.ml.clustering import KMeans
>>> from pyspark.ml.linalg import Vectors
>>>
>>> data_sample=sc.parallelize([(0.3,0.3,0.3),(0.1,0.1,0.1),(0.2,0.2,0.2),(9.0,9.0,9.0),(9.1,9.1,9.1),(9.2,9.2,9.2)])
>>> df=spark.createDataFrame(data_sample,["feature1","feature2","feature3"])
>>> df.show(5)
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
| 0.3| 0.3| 0.3|
| 0.1| 0.1| 0.1|
| 0.2| 0.2| 0.2|
| 9.0| 9.0| 9.0|
| 9.1| 9.1| 9.1|
+--------+--------+--------+
only showing top 5 rows
>>> from pyspark.ml.feature import VectorAssembler
>>> assembler=VectorAssembler(inputCols=["feature1","feature2","feature3"],outputCol="features")
>>> output=assembler.transform(df)
>>> output.show()
+--------+--------+--------+-------------+
|feature1|feature2|feature3| features|
+--------+--------+--------+-------------+
| 0.3| 0.3| 0.3|[0.3,0.3,0.3]|
| 0.1| 0.1| 0.1|[0.1,0.1,0.1]|
| 0.2| 0.2| 0.2|[0.2,0.2,0.2]|
| 9.0| 9.0| 9.0|[9.0,9.0,9.0]|
| 9.1| 9.1| 9.1|[9.1,9.1,9.1]|
| 9.2| 9.2| 9.2|[9.2,9.2,9.2]|
+--------+--------+--------+-------------+
>>> from pyspark.ml.feature import MinMaxScaler #歸一化
>>> mmScaler=MinMaxScaler(inputCol="features",outputCol="minmaxscaler")
>>> mm_model=mmScaler.fit(output.select(output["features"]))
>>> mm_output=mm_model.transform(output.select(output["features"]))
>>> mm_output.show()
+-------------+--------------------+
| features| minmaxscaler|
+-------------+--------------------+
|[0.3,0.3,0.3]|[0.02197802197802...|
|[0.1,0.1,0.1]| [0.0,0.0,0.0]|
|[0.2,0.2,0.2]|[0.01098901098901...|
|[9.0,9.0,9.0]|[0.97802197802197...|
|[9.1,9.1,9.1]|[0.98901098901098...|
|[9.2,9.2,9.2]| [1.0,1.0,1.0]|
+-------------+--------------------+
>>> kmeans=KMeans(k=2,seed=1)
>>> model=kmeans.fit(mm_output.selectExpr("minmaxscaler as features"))
18/03/28 14:49:39 WARN KMeans: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
18/03/28 14:49:39 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
18/03/28 14:49:39 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
18/03/28 14:49:40 WARN KMeans: The input data was not directly cached, which may hurt performance if its parent RDDs are also uncached.
>>> transformed=model.transform(mm_output.selectExpr("minmaxscaler as features"))
>>> transformed.show()
+--------------------+----------+
| features|prediction|
+--------------------+----------+
|[0.02197802197802...| 1|
| [0.0,0.0,0.0]| 1|
|[0.01098901098901...| 1|
|[0.97802197802197...| 0|
|[0.98901098901098...| 0|
| [1.0,1.0,1.0]| 0|
+--------------------+----------+
>>> model.computeCost(mm_output.selectExpr("minmaxscaler as features"))
0.0014491003501979412
>>> from pyspark.sql.functions import col, countDistinct
>>>
>>>
>>>
>>>
>>> sale.show(5)
+------------+--------+------+--------+--------+
| cust_id| item_id| date1|qty_sold|amt_sold|
+------------+--------+------+--------+--------+
|110112205155|33010107|201701| 2.00| 286.20|
|110105103824|31010403|201701| 3.00| 1749.00|
|110102100215|32010108|201701| 12.00|10176.00|
|110108210202|43020107|201701| 7.00| 2151.80|
|110117100471|37020308|201701| 1.00| 61.50|
+------------+--------+------+--------+--------+
only showing top 5 rows
>>> sale.registerTempTable("sale_table")
>>> sale_months=spark.sql("select cust_id,sum(qty_sold),sum(amt_sold),count(distinct date1) months from sale_table group by cust_id")
>>> sale_months
DataFrame[cust_id: string, sum(qty_sold): decimal(32,2), sum(amt_sold): decimal(32,2), months: bigint]
>>> sale_months.show(5)
+------------+-------------+-------------+------+
| cust_id|sum(qty_sold)|sum(amt_sold)|months|
+------------+-------------+-------------+------+
|110115104854| 1312.00| 166518.50| 3|
|110229101511| 456.00| 35549.02| 3|
|110228202045| 62.00| 6889.47| 2|
|110115205562| 2820.00| 292720.53| 3|
|110106207490| 3329.00| 359208.48| 3|
+------------+-------------+-------------+------+
only showing top 5 rows
>>> df.show()
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
| 0.3| 0.3| 0.3|
| 0.1| 0.1| 0.1|
| 0.2| 0.2| 0.2|
| 9.0| 9.0| 9.0|
| 9.1| 9.1| 9.1|
| 9.2| 9.2| 9.2|
+--------+--------+--------+
>>> df.select("feature1").show()
+--------+
|feature1|
+--------+
| 0.3|
| 0.1|
| 0.2|
| 9.0|
| 9.1|
| 9.2|
+--------+
>>> df.select(df["feature1"]+1).show()
+--------------+
|(feature1 + 1)|
+--------------+
| 1.3|
| 1.1|
| 1.2|
| 10.0|
| 10.1|
| 10.2|
+--------------+
>>> df.select(df["feature1"]/df["feature2"]).show()
+---------------------+
|(feature1 / feature2)|
+---------------------+
| 1.0|
| 1.0|
| 1.0|
| 1.0|
| 1.0|
| 1.0|
+---------------------+
>>> sale_months.show(5)
+------------+-------------+-------------+------+
| cust_id|sum(qty_sold)|sum(amt_sold)|months|
+------------+-------------+-------------+------+
|110115104854| 1312.00| 166518.50| 3|
|110115205562| 2820.00| 292720.53| 3|
|110229101511| 456.00| 35549.02| 3|
|110228202045| 62.00| 6889.47| 2|
|110106207490| 3329.00| 359208.48| 3|
+------------+-------------+-------------+------+
only showing top 5 rows
>>> sale_average=sale_months.select(sale_months["cust_id"],sale_months["sum(qty_sold)"]/sale_months["months"],sale_months["sum(amt_sold)"]/sale_months["months"])
>>> sale_average.show(5)
+------------+------------------------+------------------------+
| cust_id|(sum(qty_sold) / months)|(sum(amt_sold) / months)|
+------------+------------------------+------------------------+
|110115104854| 437.333333333333333| 55506.16666666666...|
|110229101511| 152.000000000000000| 11849.67333333333...|
|110228202045| 31.000000000000000| 3444.735000000000000|
|110115205562| 940.000000000000000| 97573.51000000000...|
|110106207490| 1109.666666666666667| 119736.1600000000...|
+------------+------------------------+------------------------+
only showing top 5 rows
Spark大數據分析——pyspark(三)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.