應用程序開發
1.將spark的jar加入到項目的lib中,並加入到項目的classpath中
依賴spark-core
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>1.0.2</version>
</dependency>
如果操作HDFS的話,還需要依賴hadoop-client
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.2.0</version>
</dependency>
或者
將/usr/local/myspark/spark/spark-1.0.2-bin-hadoop2/lib下的jar加入到項目的lib中,並加入到項目的classpath中
2.代碼實例
package org.test.myspark;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Word-count example for Spark's Java API.
 *
 * <p>Reads a text file from HDFS, splits every line on single spaces, counts
 * how often each resulting token occurs, and prints the totals from
 * {@code main} after collecting them.
 */
public class SparkWordCount {
    /**
     * Runs the word-count job.
     *
     * <p>The master ("yarn-cluster") and the input path are hard-coded below.
     * The Spark context is always stopped before this method returns, even
     * when the job fails.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("spark_wordcount").setMaster("yarn-cluster");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            // Read the file as an RDD with one String element per line.
            JavaRDD<String> lines = jsc.textFile("hdfs://master:9000/wordcount_input/file2");

            // Split every line into individual words.
            // Note: splitting on a single space means consecutive spaces
            // produce empty-string tokens, which are counted like any word.
            JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                    // Debug trace of the line and its tokens; emitted from
                    // inside the flatMap function, not from main.
                    System.out.println("line=" + s);
                    String[] linewords = s.split(" ");
                    for (String lw : linewords) {
                        System.out.println("words=" + lw);
                    }
                    return Arrays.asList(linewords);
                }
            });

            // Pair every word with an initial count of 1.
            JavaPairRDD<String, Integer> wordonepairs =
                    words.mapToPair(new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String s) throws Exception {
                            return new Tuple2<String, Integer>(s, 1);
                        }
                    });

            // Sum the counts per word. reduceByKey is a (lazy) transformation;
            // the job is actually triggered by the collect() action below.
            JavaPairRDD<String, Integer> wordcounts =
                    wordonepairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer a, Integer b) throws Exception {
                            return a + b;
                        }
                    });

            // Action: bring all (word, count) pairs back and print them.
            List<Tuple2<String, Integer>> results = wordcounts.collect();
            for (Tuple2<String, Integer> tuple : results) {
                System.out.println(tuple._1 + ":" + tuple._2);
            }
        } finally {
            // Release the Spark context whether the job succeeded or failed.
            jsc.stop();
        }
    }
}
3.打包
將項目的classes拷貝到C:\Users\dingzhf\Desktop\logs
cmd
>cd C:\Users\dingzhf\Desktop\logs\classes
>jar -cvf wordcount.jar .
將wordcount.jar拷貝到10.41.2.82的/opt目錄下
4.運行
在10.41.2.82上運行以下命令:
/usr/local/myspark/spark/spark-1.0.2-bin-hadoop2/bin/spark-submit --class org.test.myspark.SparkWordCount --master yarn-cluster --num-executors 3 --driver-memory 4g --executor-memory 2g --executor-cores 1 /opt/wordcount.jar
【
/usr/local/myspark/spark/spark-1.0.2-bin-hadoop2/bin/spark-submit --class org.test.myspark.SparkWordCount --master yarn-cluster --num-executors 3 --driver-memory 4g --executor-memory 2g --executor-cores 1 /opt/wordcount.jar
】
查看結果:
http://master:8088/proxy/application_1409622175934_0004/A
點擊logs: