import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
public class WordCount1 {
    public static void main(String[] args) {
        // Configure the Spark job: run locally with 4 threads
        SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("wordCount");
        // Create the Java Spark context
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Read the input file as an RDD of lines
        JavaRDD<String> file = sc.textFile("./data/data");
        // Equivalent pre-lambda form using an anonymous FlatMapFunction:
        // JavaRDD<String> rdd = file.flatMap(new FlatMapFunction<String, String>() {
        //     @Override
        //     public Iterator<String> call(String s) throws Exception {
        //         String[] s1 = s.split(" ");
        //         return Arrays.asList(s1).iterator();
        //     }
        // });
        // Split each line on spaces into individual words
        JavaRDD<String> rdd = file.flatMap((String s) -> Arrays.asList(s.split(" ")).iterator());
        // Map each word to a (word, 1) pair
        JavaPairRDD<String, Integer> rdd1 = rdd.mapToPair((String s) -> new Tuple2<String, Integer>(s, 1));
        // Sum the counts for each word
        JavaPairRDD<String, Integer> rdd2 = rdd1.reduceByKey((Integer v1, Integer v2) -> v1 + v2);
        // Collect the results to the driver and print them
        List<Tuple2<String, Integer>> collect = rdd2.collect();
        for (Tuple2<String, Integer> res : collect) {
            System.out.println(res);
        }
        sc.stop();
    }
}
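Both the lambda and the commented-out anonymous class produce the same RDD; the lambda form requires Java 8, which matches the 1.8 source/target set in the Maven build below. The same word count is considerably shorter in Scala: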
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    // Configure the job to run locally with 4 threads and create the SparkContext
    val conf = new SparkConf().setMaster("local[4]").setAppName("test")
    val sc = new SparkContext(conf)
    // Read the file, split lines into words, map each word to (word, 1) and sum per key
    val res = sc.textFile("./data/data")
    val res1 = res.flatMap(_.split(" "))
    val res2 = res1.map(_ -> 1)
    val res3 = res2.reduceByKey(_ + _)
    // Print each (word, count) pair (visible in the driver console in local mode)
    res3.foreach(println)
    sc.stop()
  }
}
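Both versions hard-code a local master and a local input path, which is convenient for testing in an IDE. As a minimal sketch (not part of the original examples), the same Scala job can be adapted for cluster submission by taking the input and output paths from the command line and writing the counts out instead of printing them; the object name and argument convention here are assumptions for illustration:

import org.apache.spark.{SparkConf, SparkContext}

object WordCountSubmit {
  def main(args: Array[String]): Unit = {
    // Assumed convention: args(0) = input path, args(1) = output directory
    val Array(input, output) = args
    // No setMaster here: the master URL is supplied by spark-submit
    val conf = new SparkConf().setAppName("wordCount")
    val sc = new SparkContext(conf)

    sc.textFile(input)
      .flatMap(_.split(" "))
      .map(_ -> 1)
      .reduceByKey(_ + _)
      .saveAsTextFile(output) // one part-* file per partition

    sc.stop()
  }
}

The Maven configuration below pins the Scala and Spark versions and packages the job into a runnable (shaded) jar: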
<properties>
<scala.version>2.11.8</scala.version>
<spark.version>2.2.0</spark.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.5</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
<!-- <verbose>true</verbose> -->
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.1.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass></mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
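With this build, mvn package produces a shaded (fat) jar under target/, with the META-INF signature files excluded so the merged jar is not rejected as tampered. To run it on a cluster, set <mainClass> to the entry point (or pass it with --class) and submit the jar with something like spark-submit --class WordCount --master <master-url> target/<artifact>.jar <input> <output>; the exact jar name and arguments depend on your project settings.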