PySpark
- Installation
pip install pyspark -i https://pypi.tuna.tsinghua.edu.cn/simple
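To verify the install, importing the package and printing its version is enough (this assumes a Java runtime is available, which PySpark requires at run time):
import pyspark
print(pyspark.__version__)  # e.g. 2.4.5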
- Summation
from pyspark import SparkContext, SparkConf
# configure a local Spark context
conf = SparkConf().setAppName("test").setMaster("local")
sc = SparkContext(conf=conf)
ll = [1, 2, 3, 4]
rdd = sc.parallelize(ll, 2)  # distribute the list over 2 partitions
rddsum = rdd.map(lambda x: x + 1).reduce(lambda x, y: x + y)
print(rddsum)  # 14: each element incremented by 1, then summed
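As a minor variant, the explicit reduce can be replaced with the RDD's built-in sum() action, which gives the same result:
print(rdd.map(lambda x: x + 1).sum())  # also 14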
- Word count
- File contents
a,b,c,d
aa,bb,cc,dd
- Count the words in each line
rdd = sc.textFile("data")
# count the number of words in each line
linecnt = rdd.map(lambda x: len(x.split(","))).take(2)
print(linecnt)
# returns [4, 4]
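Beyond per-line counts, the classic word-frequency count over the same file can be sketched with flatMap and reduceByKey (a minimal sketch; the word_counts variable is ours, and it assumes the same comma-separated "data" file as above):
word_counts = rdd.flatMap(lambda line: line.split(",")) \
    .map(lambda w: (w, 1)) \
    .reduceByKey(lambda a, b: a + b)
print(word_counts.collect())
# each word occurs once in the sample file, e.g. [('a', 1), ('b', 1), ...]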
Spark
- Create a new Maven project in IntelliJ IDEA; a few key points:
- mark the src folder as a source directory (Mark Directory as > Sources Root)
- configure the Scala SDK
- configure the JDK
The pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>Spark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.4.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>2.4.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.15</version>
        </dependency>
    </dependencies>
</project>
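Note: the _2.11 suffix on the Spark artifacts means they are compiled against Scala 2.11, so the Scala SDK configured in IDEA should be a 2.11.x release; mixing Scala binary versions typically fails at run time with errors like NoSuchMethodError.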
- Summation
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

object rdd {
  def main(args: Array[String]): Unit = {
    // configuration: app name "rdd", local master
    val conf = new SparkConf().setAppName("rdd").setMaster("local")
    val sc = new SparkContext(conf)
    // RDD is short for Resilient Distributed Dataset
    // input data, defined inline
    val data = Array(1, 2, 3, 4, 5)
    val distData = sc.parallelize(data, numSlices = 2)
    // increment each element, then sum; prints 20
    println(distData.map(line => line + 1).reduce((a, b) => a + b))
  }
}
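With setMaster("local") the job runs inside the IDE's own JVM, so it can be started directly from IDEA's Run action. For a real cluster the usual route is to package the project with mvn package and pass the jar to spark-submit, supplying the master there rather than hard-coding it.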