1. 本地運行代碼
注意點:spark2.2.0-FlatMapFunction中call返回類型從Iterable變成了Iterator
package cn.spark.study.core;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
public class WordCountLocal {
public static void main(String[] args) {
SparkConf conf =new SparkConf().setAppName("WordCountLocal").setMaster("local");
JavaSparkContext sc =new JavaSparkContext(conf);
JavaRDD<String> lines=sc.textFile("spark.txt");
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String,String>(){
/**
*
*/
private static final long serialVersionUID = 1L;
public Iterator<String> call(String line) throws Exception {
return Arrays.asList(line.split(" ")).iterator();
}
});
JavaPairRDD<String,Integer> pairs =words.mapToPair(new PairFunction<String,String,Integer>(){
/**
*
*/
private static final long serialVersionUID = 1L;
public Tuple2<String, Integer> call(String word) throws Exception {
return new Tuple2<String,Integer>(word,1);
}
});
JavaPairRDD<String,Integer> wordCounts =pairs.reduceByKey(new Function2<Integer,Integer,Integer>(){
/**
*
*/
private static final long serialVersionUID = 1L;
public Integer call(Integer v1, Integer v2) throws Exception {
return v1+v2;
}
});
wordCounts.foreach(new VoidFunction<Tuple2<String,Integer>>(){
/**
*
*/
private static final long serialVersionUID = 1L;
public void call(Tuple2<String, Integer> wordcount) throws Exception {
System.out.println(wordcount._1+" appeared "+wordcount._2+"times");
}
});
sc.close();
}
}
2.通過spark-submit 在Cluster運行
(1)上傳文件到Hadoop集羣上 hadoop fs -put spark.txt
(2)
package cn.spark.study.core;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
public class WordCountCluster {
public static void main(String[] args) {
SparkConf conf =new SparkConf()
.setAppName("WordCountCluster");
JavaSparkContext sc =new JavaSparkContext(conf);
JavaRDD<String> lines=sc.textFile("hdfs://172.16.2.235:9000/user/root/README.md");
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String,String>(){
/**
*
*/
private static final long serialVersionUID = 1L;
public Iterator<String> call(String line) throws Exception {
return Arrays.asList(line.split(" ")).iterator();
}
});
JavaPairRDD<String,Integer> pairs =words.mapToPair(new PairFunction<String,String,Integer>(){
/**
*
*/
private static final long serialVersionUID = 1L;
public Tuple2<String, Integer> call(String word) throws Exception {
return new Tuple2<String,Integer>(word,1);
}
});
JavaPairRDD<String,Integer> wordCounts =pairs.reduceByKey(new Function2<Integer,Integer,Integer>(){
/**
*
*/
private static final long serialVersionUID = 1L;
public Integer call(Integer v1, Integer v2) throws Exception {
return v1+v2;
}
});
wordCounts.foreach(new VoidFunction<Tuple2<String,Integer>>(){
/**
*
*/
private static final long serialVersionUID = 1L;
public void call(Tuple2<String, Integer> wordcount) throws Exception {
System.out.println(wordcount._1+" appeared "+wordcount._2+"times");
}
});
sc.close();
}
}
(3) 使用maven插件打包
右鍵項目->RunAs->Run Configuration->MavenBuild ->右鍵->New
Name :spark-study-java
Base directory:項目根目錄
Goals :clean package
Apply-> Run
(4)將target 目錄下的spark-study-java-0.0.1-SNAPSHOT-jar-with-dependencies.jar傳到集羣上
(5)寫一個腳本word.sh
# Submit the word-count job to the cluster.
# Fixed two defects vs. the original:
#   1. --class must match the actual package (cn.spark.study.core, not
#      cn.spark.sparktest.core) or submission fails with ClassNotFoundException.
#   2. The jar path is the LAST line and must not end with '\', otherwise
#      the shell folds the following line into the command.
/opt/spark/bin/spark-submit \
  --class cn.spark.study.core.WordCountCluster \
  --num-executors 3 \
  --driver-memory 1G \
  --executor-memory 1G \
  --executor-cores 1 \
  /root/SparkJava/spark-study-java-0.0.1-SNAPSHOT-jar-with-dependencies.jar
第一行: spark-submit命令的位置
第二行:WordCountCluster 的全類名
第三行 :num-executors 啓動的 executor 個數(注意:是 executor 個數,不是集羣節點個數)
第四行: driver 進程的內存
第五行: 每個 executor 的內存
第六行: 每個 executor 使用的核數
第七行: 打包好的 jar 文件的位置
./word.sh 運行;若提示沒有執行權限,先執行 chmod +x word.sh 賦予執行權限