spark2.2.0 wordcount JAVA版

    1. 本地运行代码

     注意点:从 Spark 2.0 起(本文使用 2.2.0),FlatMapFunction 中 call 方法的返回类型由 Iterable 变成了 Iterator

package cn.spark.study.core;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class WordCountLocal {
	public static void main(String[] args) {
		SparkConf conf =new SparkConf().setAppName("WordCountLocal").setMaster("local");
		JavaSparkContext sc =new JavaSparkContext(conf);
		JavaRDD<String> lines=sc.textFile("spark.txt");
		JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String,String>(){

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			public Iterator<String> call(String line) throws Exception {
				
				return Arrays.asList(line.split(" ")).iterator();
			}
			
		});
		JavaPairRDD<String,Integer> pairs =words.mapToPair(new PairFunction<String,String,Integer>(){

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			public Tuple2<String, Integer> call(String word) throws Exception {
				
		
				return new Tuple2<String,Integer>(word,1);
			}
			
		});
		JavaPairRDD<String,Integer> wordCounts =pairs.reduceByKey(new Function2<Integer,Integer,Integer>(){

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			public Integer call(Integer v1, Integer v2) throws Exception {
				
				return v1+v2;
			}
			
		});
		wordCounts.foreach(new VoidFunction<Tuple2<String,Integer>>(){

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			public void call(Tuple2<String, Integer> wordcount) throws Exception {
				System.out.println(wordcount._1+" appeared "+wordcount._2+"times");
				
			}
			
		});
		sc.close();
		
	}
}

2.通过spark-submit 在Cluster运行

(1)上传文件到Hadoop集群上: hadoop fs -put spark.txt (注意:代码中 textFile 读取的 HDFS 路径必须与实际上传的文件路径一致)

  (2) 编写集群版本的代码(去掉 setMaster,输入改为 HDFS 路径)

package cn.spark.study.core;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class WordCountCluster {
	public static void main(String[] args) {
		SparkConf conf =new SparkConf()
				.setAppName("WordCountCluster");
		JavaSparkContext sc =new JavaSparkContext(conf);
		JavaRDD<String> lines=sc.textFile("hdfs://172.16.2.235:9000/user/root/README.md");
		JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String,String>(){

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			public Iterator<String> call(String line) throws Exception {
				
				return Arrays.asList(line.split(" ")).iterator();
			}
			
		});
		JavaPairRDD<String,Integer> pairs =words.mapToPair(new PairFunction<String,String,Integer>(){

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			public Tuple2<String, Integer> call(String word) throws Exception {
				
		
				return new Tuple2<String,Integer>(word,1);
			}
			
		});
		JavaPairRDD<String,Integer> wordCounts =pairs.reduceByKey(new Function2<Integer,Integer,Integer>(){

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			public Integer call(Integer v1, Integer v2) throws Exception {
				
				return v1+v2;
			}
			
		});
		wordCounts.foreach(new VoidFunction<Tuple2<String,Integer>>(){

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			public void call(Tuple2<String, Integer> wordcount) throws Exception {
				System.out.println(wordcount._1+" appeared "+wordcount._2+"times");
				
			}
			
		});
		sc.close();
		
	}
}
 (3) 使用maven插件打包

右键项目->RunAs->Run Configuration->MavenBuild ->右键->New

Name :spark-study-java

Base directory:项目根目录

Goals :clean package

Apply-> Run

(4)将target 目录下的spark-study-java-0.0.1-SNAPSHOT-jar-with-dependencies.jar传到集群上

(5)写一个脚本word.sh

/opt/spark/bin/spark-submit \
--class cn.spark.study.core.WordCountCluster \
--num-executors 3 \
--driver-memory 1G \
--executor-memory 1G \
--executor-cores 1 \
/root/SparkJava/spark-study-java-0.0.1-SNAPSHOT-jar-with-dependencies.jar
# --class must be the fully-qualified name of the main class, i.e. the package
# declared in WordCountCluster.java (cn.spark.study.core) plus the class name.
# The jar path is the last argument, so it must not end with a continuation backslash.
# NOTE(review): no --master is passed here; unless spark.master is configured for
# this installation the job may run locally instead of on the cluster - confirm.

第一行: spark-submit命令的位置

第二行:WordCountCluster 的全类名

第三行:num-executors,启动的 executor(执行器)个数

第四行:driver 内存

第五行:每个 executor 的内存

第六行:每个 executor 的核数

第七行:打包文件(jar)的位置

执行 ./word.sh 运行;若提示没有执行权限,先执行 chmod +x word.sh 赋予可执行权限

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章