Spark Big Data Analysis: pyspark (Part 3)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.0
      /_/
>>> from pyspark.sql import HiveContext
>>> HiveCtx=HiveContext(sc)
>>> sale=HiveCtx.sql("select * from yxpt.pi_cust_item_month")
>>> sale.select("cust_id","item_id").show(5)
+------------+--------+
|     cust_id| item_id|
+------------+--------+
|110113104354|33010104|
|110108204119|22240103|
|110105103680|31010403|
|110102102613|35260104|
|110229100367|90110115|
+------------+--------+
only showing top 5 rows
>>> test_sale=HiveCtx.sql("select cust_id,item_id,date1,qty_sold from yxpt.pi_cust_item_month limit 200")
>>> test_sale.show(5)
+------------+--------+------+--------+
|     cust_id| item_id| date1|qty_sold|
+------------+--------+------+--------+
|110117100211|35510201|201605|    1.00|
|110106202805|43010104|201605|    5.00|
|110114205326|53020103|201605|    4.00|
|110105207751|53020411|201605|   60.00|
|110101204183|34030219|201605|    2.00|
+------------+--------+------+--------+
>>> test_sale.filter(test_sale['qty_sold']>50).show()
+------------+--------+------+--------+
|     cust_id| item_id| date1|qty_sold|
+------------+--------+------+--------+
|110105108630|53020411|201608|  154.00|
|110107100625|53020411|201608|  445.00|
|110117100192|32010101|201608|   75.00|
+------------+--------+------+--------+
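
filter also accepts combined column expressions, so the same kind of query can be written without hard-coding a single condition; a minimal sketch against test_sale (the conditions are only illustrative):

>>> from pyspark.sql.functions import col
>>> test_sale.filter((col("qty_sold")>50) & (col("date1")=='201608')).show()  # & / | combine conditions; wrap each side in parentheses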

>>> sale.groupBy('cust_id').count().show(5)  # how many records each retail customer (cust_id) has
+------------+-----+
|     cust_id|count|
+------------+-----+
|110229100172| 1536|
|110101203818| 3487|
|110108108378| 4586|
|110115203451| 1513|
|110105108113| 5427|
+------------+-----+
only showing top 5 rows
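
To see the customers with the most records first, sort the counts; a minimal sketch:

>>> sale.groupBy('cust_id').count().orderBy('count',ascending=False).show(5)  # same counts, largest first
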
>>>
>>> # A DataFrame object has no map attribute, so convert it to an RDD (resilient distributed dataset) first
>>> teenagers.rdd.map(lambda p:"Name:" + p.name).collect()
[u'Name:Jack', u'Name:Jessie']
>>> teenNames=teenagers.rdd.map(lambda p:"Name:" + p.name)
>>> for item in teenNames.collect():
...     print item
Name:Jack
Name:Jessie
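
An RDD like teenNames can be turned back into a DataFrame with toDF (assuming an active SparkSession); a minimal sketch:

>>> teenagers.rdd.map(lambda p: (p.name,)).toDF(["name"]).show()  # rows must be tuples, not bare strings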

>>> # Row was used earlier to build a new DataFrame; here the DataFrame is built with a custom schema instead
>>> schemaString="name age"
>>> from pyspark.sql.types import *
>>> fields=[StructField(field_name,StringType(),True) for field_name in schemaString.split()]
>>> schema=StructType(fields)
>>> lines.collect()
[('Leon', 43), ('Rachell', 74), ('Jack', 23), ('Tom', 0), ('Jessie', 31)]
>>> spark.createDataFrame(lines,schema).show()  # build the DataFrame with the specified schema
+-------+---+
|   name|age|
+-------+---+
|   Leon| 43|
|Rachell| 74|
|   Jack| 23|
|    Tom|  0|
| Jessie| 31|
+-------+---+
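
Because every field above was declared StringType, age comes back as a string; the same pattern supports typed fields, for example (typed_schema is an illustrative name):

>>> typed_schema=StructType([StructField("name",StringType(),True),
...                          StructField("age",IntegerType(),True)])
>>> spark.createDataFrame(lines,typed_schema).printSchema()  # age is now an int column instead of a string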

>>> #df=spark.read.load("home/remoteuser/superadmin/zhaoyangguanlian.csv",format="csv",sep=",",inferSchema="true",header="true")
>>> lines=sc.parallelize({('Tom',0),('Jack',23),('Leon',43),('Jessie',31),('Rachell',89)})
>>> lines.count()
5
>>> people=lines.map(lambda p: Row(name=p[0],age=int(p[1])))
>>> schemaPeople=spark.createDataFrame(people)
>>> schemaPeople
DataFrame[age: bigint, name: string]
>>> schemaPeople.show()
+---+-------+
|age|   name|
+---+-------+
| 43|   Leon|
|  0|    Tom|
| 23|   Jack|
| 89|Rachell|
| 31| Jessie|
+---+-------+

>>> schemaPeople.createOrReplaceTempView('people')
>>> teenagers=spark.sql('select name from people where age>=15 and age<=35')
>>> teenagers.show()
+------+
|  name|
+------+
|  Jack|
|Jessie|
+------+
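
The same query can be expressed with the DataFrame API instead of SQL; a minimal sketch:

>>> schemaPeople.filter((schemaPeople["age"]>=15) & (schemaPeople["age"]<=35)).select("name").show()  # equivalent to the SQL above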

>>> squaresDF=spark.createDataFrame(sc.parallelize(range(1,6)).map(lambda i: Row(single=i,double=i**2)))
>>> squaresDF.show()
+------+------+
|double|single|
+------+------+
|     1|     1|
|     4|     2|
|     9|     3|
|    16|     4|
|    25|     5|
+------+------+

>>> squaresDF.write.parquet("hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table/key=1")  # write to partition key=1
>>> cubesDF=spark.createDataFrame(sc.parallelize(range(6,11)).map(lambda i:Row(single=i,triple=i**3)))
>>> cubesDF.show()
+------+------+
|single|triple|
+------+------+
|     6|   216|
|     7|   343|
|     8|   512|
|     9|   729|
|    10|  1000|
+------+------+
>>> cubesDF.write.parquet("hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table/key=2")  # write to partition key=2
>>> mergedDF=spark.read.option("mergeSchema","true").parquet("hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table")
>>> mergedDF.printSchema()
root
 |-- double: long (nullable = true)
 |-- single: long (nullable = true)
 |-- triple: long (nullable = true)
 |-- key: integer (nullable = true)

>>> mergedDF.show()
+------+------+------+---+
|double|single|triple|key|
+------+------+------+---+
|  null|     9|   729|  2|
|  null|    10|  1000|  2|
|    16|     4|  null|  1|
|    25|     5|  null|  1|
|  null|     6|   216|  2|
|  null|     7|   343|  2|
|  null|     8|   512|  2|
|     1|     1|  null|  1|
|     4|     2|  null|  1|
|     9|     3|  null|  1|
+------+------+------+---+
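
The partition directory name becomes the key column, so a single partition can be read back either by filtering on key or by pointing read.parquet at that directory; a minimal sketch (paths reused from above):

>>> mergedDF.filter(mergedDF["key"]==1).show()  # partition pruning on the key column
>>> spark.read.option("basePath","hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table").parquet("hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table/key=1").show()  # basePath keeps the key column when loading one partition directory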

>>> from os.path import expanduser,join,abspath
>>> abspath('spark-warehouse')
'/usr/inspur/2.5.0.0-1245/spark2/bin/spark-warehouse'
>>> warehouse_location=abspath('spark-warehouse')
>>> spark=SparkSession.builder.appName("Python Spark SQL Hive integration example").config("spark.sql.warehouse.dir",warehouse_location).enableHiveSupport().getOrCreate()
>>> spark.sql("CREATE TABLE IF NOT EXISTS src (key INT,value INT)")
DataFrame[]
>>> spark.sql("SELECT * FROM src")
DataFrame[key: int, value: int]
>>> #spark.sql("LOAD DATA LOCAL INPATH 'hdfs://bjychdfs/apps/hive/warehouse/yxpt.db/test_table/key=1' INTO TABLE src")
# The squaresDF/cubesDF files under /hive/warehouse cannot be loaded into the src table created under /inspur/2.5.0.0-1245/spark2/bin/spark-warehouse
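
A more direct way to get such a DataFrame into Hive is to write it with saveAsTable (requires enableHiveSupport; the table name below is only illustrative):

>>> squaresDF.write.mode("overwrite").saveAsTable("yxpt.test_squares")  # hypothetical table name
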
>>>
>>>
>>> schemaPeople.show()
+---+-------+
|age|   name|
+---+-------+
| 43|   Leon|
|  0|    Tom|
| 23|   Jack|
| 89|Rachell|
| 31| Jessie|
+---+-------+

>>> schemaPeople.select(schemaPeople["age"]+1).show()
+---------+
|(age + 1)|
+---------+
|       44|
|        1|
|       24|
|       90|
|       32|
+---------+

>>> schemaPeople.select(schemaPeople["age"]/3).show()
+------------------+
|         (age / 3)|
+------------------+
|14.333333333333334|
|               0.0|
| 7.666666666666667|
|29.666666666666668|
|10.333333333333334|
+------------------+
>>> schemaPeople.select("age").show()
+---+
|age|
+---+
| 43|
|  0|
| 23|
| 89|
| 31|
+---+
>>> schemaPeople2=schemaPeople.select(schemaPeople["age"]+1,schemaPeople["name"])
>>> schemaPeople2.show()
+---------+-------+
|(age + 1)|   name|
+---------+-------+
|       44|   Leon|
|        1|    Tom|
|       24|   Jack|
|       90|Rachell|
|       32| Jessie|
+---------+-------+
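
Computed columns keep expression names like (age + 1); alias gives them readable names; a minimal sketch:

>>> schemaPeople.select((schemaPeople["age"]+1).alias("age_plus_1"),"name").show()  # rename the computed column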

>>> data.show(5)
+------------+-------------+-------------+
|     cust_id|sum(qty_sold)|sum(amt_sold)|
+------------+-------------+-------------+
|110106106866|       499.00|     54176.67|
|110102105975|      1019.00|    113162.04|
|110105208183|      4098.00|    526847.96|
|110229100172|       402.00|     28817.84|
|110114104030|      1036.00|     89332.29|
+------------+-------------+-------------+
only showing top 5 rows

>>> sale.show(5)
+------------+--------+------+--------+--------+
|     cust_id| item_id| date1|qty_sold|amt_sold|
+------------+--------+------+--------+--------+
|110105103343|31010702|201702|    2.00|  150.52|
|110105109235|22240116|201702|   15.00| 1828.50|
|110115205453|11018817|201702|    1.00|   93.28|
|110108109775|33010115|201702|    1.00|  190.80|
|110108203778|53010236|201702|    0.00|    0.00|
+------------+--------+------+--------+--------+
only showing top 5 rows

>>> sale.groupBy("cust_id").pivot("date1").agg(func.sum("qty_sold")).show(5)  #分組聚合
+------------+-------+-------+-------+
|     cust_id| 201701| 201702| 201703|
+------------+-------+-------+-------+
|110115104854| 605.00| 354.00| 353.00|
|110229101511| 125.00| 139.00| 192.00|
|110228202045|  48.00|   null|  14.00|
|110115205562|1191.00| 855.00| 774.00|
|110106207490| 771.00|1169.00|1389.00|
+------------+-------+-------+-------+
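
When the pivot values are known in advance, listing them explicitly saves Spark a pass to discover the distinct dates; a minimal sketch:

>>> sale.groupBy("cust_id").pivot("date1",["201701","201702","201703"]).agg(func.sum("qty_sold")).show(5)  # explicit pivot values skip the distinct-value scan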

>>> from pyspark.ml.feature import VectorAssembler  # assemble feature columns into a vector
>>> data_sample=sc.parallelize([(0.0,0.0,0.0),(0.1,0.1,0.1),(0.2,0.2,0.2),(9.0,9.0,9.0),(9.1,9.1,9.1),(9.2,9.2,9.2)])
>>> df=sqlContext.createDataFrame(data_sample,["features1","features2","features3"])  # method 1 for building the DataFrame
>>> df.show()
+---------+---------+---------+
|features1|features2|features3|
+---------+---------+---------+
|      0.0|      0.0|      0.0|
|      0.1|      0.1|      0.1|
|      0.2|      0.2|      0.2|
|      9.0|      9.0|      9.0|
|      9.1|      9.1|      9.1|
|      9.2|      9.2|      9.2|
+---------+---------+---------+

>>> data_sample=[[0.0,0.0,0.0],[0.1,0.1,0.1],[0.2,0.2,0.2],[9.0,9.0,9.0],[9.1,9.1,9.1],[9.2,9.2,9.2]]
>>> data_sample
[[0.0, 0.0, 0.0], [0.1, 0.1, 0.1], [0.2, 0.2, 0.2], [9.0, 9.0, 9.0], [9.1, 9.1, 9.1], [9.2, 9.2, 9.2]]
>>> df=spark.createDataFrame(data_sample,["feature1","feature2","feature3"])  #建表方法二
>>> df.show()
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
|     0.0|     0.0|     0.0|
|     0.1|     0.1|     0.1|
|     0.2|     0.2|     0.2|
|     9.0|     9.0|     9.0|
|     9.1|     9.1|     9.1|
|     9.2|     9.2|     9.2|
+--------+--------+--------+

>>> df=spark.createDataFrame(data_sample).toDF("feature1","feature2",'feature3')  # method 3 for building the DataFrame
>>> df.show()
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
|     0.0|     0.0|     0.0|
|     0.1|     0.1|     0.1|
|     0.2|     0.2|     0.2|
|     9.0|     9.0|     9.0|
|     9.1|     9.1|     9.1|
|     9.2|     9.2|     9.2|
+--------+--------+--------+

>>> data_sample=[[0.3,0.3,0.3],[0.1,0.1,0.1],[0.2,0.2,0.2],[9.0,9.0,9.0],[9.1,9.1,9.1],[9.2,9.2,9.2]]
>>> df=spark.createDataFrame(data_sample).toDF("feature1","feature2",'feature3')
>>> df.show()
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
|     0.3|     0.3|     0.3|
|     0.1|     0.1|     0.1|
|     0.2|     0.2|     0.2|
|     9.0|     9.0|     9.0|
|     9.1|     9.1|     9.1|
|     9.2|     9.2|     9.2|
+--------+--------+--------+

>>> assembler=VectorAssembler(inputCols=["feature1","feature2","feature3"],outputCol="features")
>>> assembler.transform(df).show()
+--------+--------+--------+-------------+
|feature1|feature2|feature3|     features|
+--------+--------+--------+-------------+
|     0.3|     0.3|     0.3|[0.3,0.3,0.3]|
|     0.1|     0.1|     0.1|[0.1,0.1,0.1]|
|     0.2|     0.2|     0.2|[0.2,0.2,0.2]|
|     9.0|     9.0|     9.0|[9.0,9.0,9.0]|
|     9.1|     9.1|     9.1|[9.1,9.1,9.1]|
|     9.2|     9.2|     9.2|[9.2,9.2,9.2]|
+--------+--------+--------+-------------+

>>> df_vector=assembler.transform(df)
>>> df_vector.select(df_vector["features"]).show()
+-------------+
|     features|
+-------------+
|[0.3,0.3,0.3]|
|[0.1,0.1,0.1]|
|[0.2,0.2,0.2]|
|[9.0,9.0,9.0]|
|[9.1,9.1,9.1]|
|[9.2,9.2,9.2]|
+-------------+

>>> kmeans=KMeans(k=2,seed=1)
>>> model=kmeans.fit(df_vector.select(df_vector["features"]))
18/03/27 20:27:20 WARN KMeans: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
18/03/27 20:27:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
18/03/27 20:27:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
18/03/27 20:27:21 WARN KMeans: The input data was not directly cached, which may hurt performance if its parent RDDs are also uncached.
>>> centers=model.clusterCenters()
>>> len(centers)
2
>>> centers
[array([ 9.1,  9.1,  9.1]), array([ 0.2,  0.2,  0.2])]
>>> transformed=model.transform(df_vector.select(df_vector["features"]))
>>> transformed.show()
+-------------+----------+
|     features|prediction|
+-------------+----------+
|[0.3,0.3,0.3]|         1|
|[0.1,0.1,0.1]|         1|
|[0.2,0.2,0.2]|         1|
|[9.0,9.0,9.0]|         0|
|[9.1,9.1,9.1]|         0|
|[9.2,9.2,9.2]|         0|
+-------------+----------+
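
Cluster sizes can be checked by grouping on the prediction column; a minimal sketch:

>>> transformed.groupBy("prediction").count().show()  # how many points landed in each cluster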

>>> from pyspark.ml.feature import MinMaxScaler
>>> df_vector.show()
+--------+--------+--------+-------------+
|feature1|feature2|feature3|     features|
+--------+--------+--------+-------------+
|     0.3|     0.3|     0.3|[0.3,0.3,0.3]|
|     0.1|     0.1|     0.1|[0.1,0.1,0.1]|
|     0.2|     0.2|     0.2|[0.2,0.2,0.2]|
|     9.0|     9.0|     9.0|[9.0,9.0,9.0]|
|     9.1|     9.1|     9.1|[9.1,9.1,9.1]|
|     9.2|     9.2|     9.2|[9.2,9.2,9.2]|
+--------+--------+--------+-------------+
>>> mmScaler=MinMaxScaler(inputCol="features",outputCol="minmaxscaler")  # min-max normalization: the minimum maps to 0, the maximum to 1, and the remaining values are scaled proportionally into [0,1]
>>> mm_model=mmScaler.fit(df_vector.select(df_vector["features"]))
>>> mm_model.transform(df_vector.select(df_vector["features"])).show()
+-------------+--------------------+
|     features|        minmaxscaler|
+-------------+--------------------+
|[0.3,0.3,0.3]|[0.02197802197802...|
|[0.1,0.1,0.1]|       [0.0,0.0,0.0]|
|[0.2,0.2,0.2]|[0.01098901098901...|
|[9.0,9.0,9.0]|[0.97802197802197...|
|[9.1,9.1,9.1]|[0.98901098901098...|
|[9.2,9.2,9.2]|       [1.0,1.0,1.0]|
+-------------+--------------------+
>>> mm_model.originalMin
DenseVector([0.1, 0.1, 0.1])
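
The fitted model also exposes originalMax, and the target range can be changed through the min/max parameters (mmScaler2 is an illustrative name):

>>> mm_model.originalMax  # per-feature maxima seen during fit
>>> mmScaler2=MinMaxScaler(inputCol="features",outputCol="minmaxscaler",min=-1.0,max=1.0)  # rescale to [-1,1] instead of [0,1]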


>>> from pyspark.sql import SparkSession
>>> spark=SparkSession.builder.appName("Python Spark SQL basic example").config("spark.some.config.option","some-value").getOrCreate()
>>> from pyspark.sql import Row
>>> import pyspark.sql.functions as func
>>> sc=spark.sparkContext
>>> from pyspark.sql.types import *
>>> from os.path import expanduser,join,abspath
>>> sale=spark.sql("select cust_id,item_id,date1,qty_sold,amt_sold from yxpt.pi_cust_item_month where date1 between '201701' and '201703'")
>>> sale.show(5)
+------------+--------+------+--------+--------+
|     cust_id| item_id| date1|qty_sold|amt_sold|
+------------+--------+------+--------+--------+
|110112205155|33010107|201701|    2.00|  286.20|
|110105103824|31010403|201701|    3.00| 1749.00|
|110102100215|32010108|201701|   12.00|10176.00|
|110108210202|43020107|201701|    7.00| 2151.80|
|110117100471|37020308|201701|    1.00|   61.50|
+------------+--------+------+--------+--------+
only showing top 5 rows

>>> data=sale.groupBy("cust_id").agg(func.sum("qty_sold"),func.sum("amt_sold"))
>>> data.show(5)
+------------+-------------+-------------+
|     cust_id|sum(qty_sold)|sum(amt_sold)|
+------------+-------------+-------------+
|110106106866|       499.00|     54176.67|
|110102105975|      1019.00|    113162.04|
|110105208183|      4098.00|    526847.96|
|110229100172|       402.00|     28817.84|
|110114104030|      1036.00|     89332.29|
+------------+-------------+-------------+
only showing top 5 rows

>>> from pyspark.ml.clustering import KMeans
>>> from pyspark.ml.linalg import Vectors
>>>
>>> data_sample=sc.parallelize([(0.3,0.3,0.3),(0.1,0.1,0.1),(0.2,0.2,0.2),(9.0,9.0,9.0),(9.1,9.1,9.1),(9.2,9.2,9.2)])
>>> df=spark.createDataFrame(data_sample,["feature1","feature2","feature3"])
>>> df.show(5)
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
|     0.3|     0.3|     0.3|
|     0.1|     0.1|     0.1|
|     0.2|     0.2|     0.2|
|     9.0|     9.0|     9.0|
|     9.1|     9.1|     9.1|
+--------+--------+--------+
only showing top 5 rows

>>> from pyspark.ml.feature import VectorAssembler
>>> assembler=VectorAssembler(inputCols=["feature1","feature2","feature3"],outputCol="features")
>>> output=assembler.transform(df)
>>> output.show()
+--------+--------+--------+-------------+
|feature1|feature2|feature3|     features|
+--------+--------+--------+-------------+
|     0.3|     0.3|     0.3|[0.3,0.3,0.3]|
|     0.1|     0.1|     0.1|[0.1,0.1,0.1]|
|     0.2|     0.2|     0.2|[0.2,0.2,0.2]|
|     9.0|     9.0|     9.0|[9.0,9.0,9.0]|
|     9.1|     9.1|     9.1|[9.1,9.1,9.1]|
|     9.2|     9.2|     9.2|[9.2,9.2,9.2]|
+--------+--------+--------+-------------+

>>> from pyspark.ml.feature import MinMaxScaler  # normalization
>>> mmScaler=MinMaxScaler(inputCol="features",outputCol="minmaxscaler")
>>> mm_model=mmScaler.fit(output.select(output["features"]))
>>> mm_output=mm_model.transform(output.select(output["features"]))
>>> mm_output.show()
+-------------+--------------------+
|     features|        minmaxscaler|
+-------------+--------------------+
|[0.3,0.3,0.3]|[0.02197802197802...|
|[0.1,0.1,0.1]|       [0.0,0.0,0.0]|
|[0.2,0.2,0.2]|[0.01098901098901...|
|[9.0,9.0,9.0]|[0.97802197802197...|
|[9.1,9.1,9.1]|[0.98901098901098...|
|[9.2,9.2,9.2]|       [1.0,1.0,1.0]|
+-------------+--------------------+
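
The assemble, scale and cluster steps here can also be chained into a single Pipeline; a minimal sketch reusing the assembler above (the pipeline names are illustrative):

>>> from pyspark.ml import Pipeline
>>> pipeline=Pipeline(stages=[assembler,MinMaxScaler(inputCol="features",outputCol="scaled"),KMeans(featuresCol="scaled",k=2,seed=1)])
>>> pipeline_model=pipeline.fit(df)  # df is the raw feature1/feature2/feature3 table
>>> pipeline_model.transform(df).select("prediction").show()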

>>> kmeans=KMeans(k=2,seed=1)
>>> model=kmeans.fit(mm_output.selectExpr("minmaxscaler as features"))
18/03/28 14:49:39 WARN KMeans: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
18/03/28 14:49:39 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
18/03/28 14:49:39 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
18/03/28 14:49:40 WARN KMeans: The input data was not directly cached, which may hurt performance if its parent RDDs are also uncached.
>>> transformed=model.transform(mm_output.selectExpr("minmaxscaler as features"))
>>> transformed.show()
+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[0.02197802197802...|         1|
|       [0.0,0.0,0.0]|         1|
|[0.01098901098901...|         1|
|[0.97802197802197...|         0|
|[0.98901098901098...|         0|
|       [1.0,1.0,1.0]|         0|
+--------------------+----------+
>>> model.computeCost(mm_output.selectExpr("minmaxscaler as features"))
0.0014491003501979412
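
computeCost is the within-cluster sum of squared distances, so it can be used to compare different k values (the elbow heuristic); a minimal sketch:

>>> scaled=mm_output.selectExpr("minmaxscaler as features")
>>> for k in range(2,6):
...     cost=KMeans(k=k,seed=1).fit(scaled).computeCost(scaled)
...     print k,cost  # pick the k where the cost stops dropping sharply
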
>>> from pyspark.sql.functions import col, countDistinct
>>>
>>>
>>>
>>>
>>> sale.show(5)
+------------+--------+------+--------+--------+
|     cust_id| item_id| date1|qty_sold|amt_sold|
+------------+--------+------+--------+--------+
|110112205155|33010107|201701|    2.00|  286.20|
|110105103824|31010403|201701|    3.00| 1749.00|
|110102100215|32010108|201701|   12.00|10176.00|
|110108210202|43020107|201701|    7.00| 2151.80|
|110117100471|37020308|201701|    1.00|   61.50|
+------------+--------+------+--------+--------+
only showing top 5 rows

>>> sale.registerTempTable("sale_table")
>>> sale_months=spark.sql("select cust_id,sum(qty_sold),sum(amt_sold),count(distinct date1) months from sale_table group by cust_id")
>>> sale_months
DataFrame[cust_id: string, sum(qty_sold): decimal(32,2), sum(amt_sold): decimal(32,2), months: bigint]
>>> sale_months.show(5)
+------------+-------------+-------------+------+
|     cust_id|sum(qty_sold)|sum(amt_sold)|months|
+------------+-------------+-------------+------+
|110115104854|      1312.00|    166518.50|     3|
|110229101511|       456.00|     35549.02|     3|
|110228202045|        62.00|      6889.47|     2|
|110115205562|      2820.00|    292720.53|     3|
|110106207490|      3329.00|    359208.48|     3|
+------------+-------------+-------------+------+
only showing top 5 rows
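
The same aggregation can be written with the DataFrame API using the countDistinct imported above; a minimal sketch:

>>> sale.groupBy("cust_id").agg(func.sum("qty_sold"),func.sum("amt_sold"),countDistinct("date1").alias("months")).show(5)  # same result as the SQL version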

>>> df.show()
+--------+--------+--------+
|feature1|feature2|feature3|
+--------+--------+--------+
|     0.3|     0.3|     0.3|
|     0.1|     0.1|     0.1|
|     0.2|     0.2|     0.2|
|     9.0|     9.0|     9.0|
|     9.1|     9.1|     9.1|
|     9.2|     9.2|     9.2|
+--------+--------+--------+

>>> df.select("feature1").show()
+--------+
|feature1|
+--------+
|     0.3|
|     0.1|
|     0.2|
|     9.0|
|     9.1|
|     9.2|
+--------+

>>> df.select(df["feature1"]+1).show()
+--------------+
|(feature1 + 1)|
+--------------+
|           1.3|
|           1.1|
|           1.2|
|          10.0|
|          10.1|
|          10.2|
+--------------+

>>> df.select(df["feature1"]/df["feature2"]).show()
+---------------------+
|(feature1 / feature2)|
+---------------------+
|                  1.0|
|                  1.0|
|                  1.0|
|                  1.0|
|                  1.0|
|                  1.0|
+---------------------+

>>> sale_months.show(5)
+------------+-------------+-------------+------+
|     cust_id|sum(qty_sold)|sum(amt_sold)|months|
+------------+-------------+-------------+------+
|110115104854|      1312.00|    166518.50|     3|
|110115205562|      2820.00|    292720.53|     3|
|110229101511|       456.00|     35549.02|     3|
|110228202045|        62.00|      6889.47|     2|
|110106207490|      3329.00|    359208.48|     3|
+------------+-------------+-------------+------+
only showing top 5 rows

>>> sale_average=sale_months.select(sale_months["cust_id"],sale_months["sum(qty_sold)"]/sale_months["months"],sale_months["sum(amt_sold)"]/sale_months["months"])
>>> sale_average.show(5)
+------------+------------------------+------------------------+
|     cust_id|(sum(qty_sold) / months)|(sum(amt_sold) / months)|
+------------+------------------------+------------------------+
|110115104854|     437.333333333333333|    55506.16666666666...|
|110229101511|     152.000000000000000|    11849.67333333333...|
|110228202045|      31.000000000000000|    3444.735000000000000|
|110115205562|     940.000000000000000|    97573.51000000000...|
|110106207490|    1109.666666666666667|    119736.1600000000...|
+------------+------------------------+------------------------+
only showing top 5 rows
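
The long column names and trailing decimals can be tidied with alias and round; a minimal sketch (sale_average2 is an illustrative name):

>>> sale_average2=sale_months.select("cust_id",
...     func.round(sale_months["sum(qty_sold)"]/sale_months["months"],2).alias("avg_qty_month"),
...     func.round(sale_months["sum(amt_sold)"]/sale_months["months"],2).alias("avg_amt_month"))
>>> sale_average2.show(5)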
