Using Spark

  1. Install the Maven plugin for Eclipse: http://blog.csdn.net/qjyong/article/details/9098213
  2. Create a Maven project in Eclipse

File -> New -> Project -> Maven -> Maven Project -> quickstart archetype

  3. Add dependencies

   Search https://mvnrepository.com/ for the dependency coordinates.

 

  <dependency> <!-- Spark dependency -->
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.2.0</version>
  </dependency>

  <dependency> <!-- Hadoop dependency -->
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.0</version>
  </dependency>

  <dependency> <!-- Spark MLlib dependency -->
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_2.10</artifactId>
      <version>1.0.0</version>
      <scope>provided</scope>
  </dependency>

  <dependency> <!-- Spark SQL dependency -->
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>1.5.2</version>
  </dependency>

Note: the Spark artifacts (spark-core / spark-mllib / spark-sql) should normally share one and the same version; mixing 1.2.0, 1.0.0 and 1.5.2 can cause conflicts at runtime. The HiveContext used later also needs the spark-hive_2.10 artifact of the matching version.

 

  4. Write the program

4.1 Import the data into HDFS/Hive with Sqoop

sqoop import \
  --connect jdbc:mysql://49.123.21.100:3306/bbk \
  --username root \
  --password root \
  --table C \
  --fields-terminated-by '\t' -m 1

 

4.2 Write the Spark program

SparkConf sparkConf = new SparkConf().setAppName("TestKMeans");
JavaSparkContext sc = new JavaSparkContext(sparkConf);

// path of the table imported by Sqoop on HDFS
String filePath = "hdfs:///user/kpnm/C";

JavaRDD<String> lines = sc.textFile(filePath);
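
As a quick sanity check (a sketch; it assumes the Sqoop import above succeeded and that `sc` and `lines` are the objects defined just above), run a small action to count the rows and print a few of them:

long total = lines.count();             // action: runs a job and counts the rows of table C
for (String row : lines.take(5)) {      // action: brings the first 5 rows back to the driver
    System.out.println(row);
}
System.out.println("rows in C: " + total);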

 

 

4.3 Functional programming

    Interface          Method to implement     Used with
    Function<T,R>      R call(T)               map(), filter()

 

Example 1: filter

JavaRDD<String> dongtang = lines.filter(new Containsdt());

class Containsdt implements Function<String, Boolean> {
    public Boolean call(String x) {
        // keep only the rows for the 東塘店 shop
        return x.contains("東塘店");
    }
}
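
The same predicate can also be written inline as an anonymous class, which is the style used in the later examples (a sketch equivalent to Containsdt above):

JavaRDD<String> dongtang = lines.filter(new Function<String, Boolean>() {
    public Boolean call(String x) {
        return x.contains("東塘店");
    }
});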

  

Example 2: map

SparkConf sparkConf = new SparkConf().setAppName("sql").setMaster("local");
JavaSparkContext ctx = new JavaSparkContext(sparkConf);

JavaRDD<Integer> data = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));

// input RDD:          1 2 3 4 5 6
// after map(x + 1):   2 3 4 5 6 7
JavaRDD<Integer> result = data.map(new Function<Integer, Integer>() {
    public Integer call(Integer x) {
        return x + 1;
    }
});

 

    Interface             Method to implement     Used with
    Function2<T1,T2,R>    R call(T1, T2)          aggregate(), fold()

 

Example 3: aggregate

List<Integer> data = Arrays.asList(5, 1, 1, 4, 4, 2, 2);
JavaRDD<Integer> javaRDD = ctx.parallelize(data, 3);

Integer aggregateRDD = javaRDD.aggregate(2, new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer v1, Integer v2) throws Exception {
        return v1 + v2;   // seqOp: sum the values within one partition
    }
}, new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer v1, Integer v2) throws Exception {
        return v1 + v2;   // combOp: sum the partition results
    }
});
System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + aggregateRDD);
// output: 27 (the data sums to 19; the zero value 2 is applied once per partition,
// 3 partitions, and once more when combining: 19 + 4 * 2 = 27)

 

Example 4: fold

List<Integer> data = Arrays.asList(5, 1, 1, 4, 4, 2, 2);
JavaRDD<Integer> javaRDD = ctx.parallelize(data, 3);

System.out.println(javaRDD.fold(1, new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer x, Integer y) {
        return x + y;
    }
}));
// output: 23 (19 + 4 * 1: the zero value 1 is applied once per partition and once when combining)

 

 

    Interface               Method to implement     Used with
    FlatMapFunction<T,R>    Iterable<R> call(T)     flatMap()

 

Example 5: flatMap

JavaRDD<String> sjdd = ctx.parallelize(Arrays.asList(
        "hello world how are you", "i am fine", "thanks for you"));

sjdd.foreach(new VoidFunction<String>() {
    public void call(String x) {
        System.out.println(x);
    }
});

sjdd = sjdd.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String x) {
        return Arrays.asList(x.split(" "));
    }
});

System.out.println("Data after flatMap:");

sjdd.foreach(new VoidFunction<String>() {
    public void call(String x) {
        System.out.println(x);
    }
});

 

Output before flatMap:

hello world how are you
i am fine
thanks for you

Output after flatMap:

hello
world
how
are
you
i
am
fine
thanks
for
you

 

Functions that produce a JavaPairRDD or a JavaDoubleRDD likewise require their own function interfaces (PairFunction, DoubleFunction, and so on), as in the sketch below.
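
For instance, a minimal sketch (assuming the `lines` RDD of tab-separated rows from section 4.2; treating the first field as the shop name is an assumption about table C's layout): mapToPair with a PairFunction builds a JavaPairRDD keyed by that field, and reduceByKey with a Function2 counts the rows per key.

// key each row by its first tab-separated field (hypothetical column layout)
JavaPairRDD<String, Integer> pairs = lines.mapToPair(
    new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String line) {
            return new Tuple2<String, Integer>(line.split("\t")[0], 1);
        }
    });

// add up the 1s per key -> number of rows for each shop
JavaPairRDD<String, Integer> counts = pairs.reduceByKey(
    new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer a, Integer b) {
            return a + b;
        }
    });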

5. Submit the job

Package with Maven

Run mvn install to build the jar. Note that a plain jar does not include the dependencies; to bundle them into one runnable jar you normally configure the maven-shade-plugin or maven-assembly-plugin.

 

sudo spark-submit \
 --class com.mycompany.app.yang.App \
 --executor-memory 5G \
 --total-executor-cores 5 \
 --driver-class-path /home/kpnm/mysql-connector-java-5.1.41-bin.jar \
 /home/kpnm/yxs/yang-0.0.1-SNAPSHOT.jar

The --class option and the path to the application jar are required; the remaining options tune executor resources and the driver classpath.

 

Trend analysis:

 

package com.mycompany.app.yang;

 

import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

 

 

public class App
{
    private static final Pattern SPACE = Pattern.compile(" ");

    public static void main(String[] args) throws Exception {

        SparkConf sparkConf = new SparkConf().setAppName("sql").setMaster("local")
                .set("spark.kryoserializer.buffer.max", "128");

        JavaSparkContext ctx = new JavaSparkContext(sparkConf);

        SQLContext sqlContext = new SQLContext(ctx);

        HiveContext hiveCtx = new HiveContext(ctx);

        // one row per (SHOP, CATE, ACCTURE, PERIOD, ACCOUNT) group;
        // column 5 holds the collected VALUEs of that group
        DataFrame rdd = hiveCtx.sql(
            "select SHOP,CATE,ACCTURE,PERIOD,ACCOUNT,collect_set(VALUE) from C "
            + "group by SHOP,CATE,ACCTURE,PERIOD,ACCOUNT");

        JavaRDD<Row> jdd = rdd.toJavaRDD();
        JavaRDD<List<Double>> slopes = jdd.map(new linearegression());

        slopes.saveAsTextFile("/home/kpnm/yxs/result.txt");

        ctx.stop();
    }
}

 

class linearegression implements Function<Row, List<Double>> {
    public List<Double> call(Row s) {
        // column 5 is the collect_set(VALUE) result; its toString() looks like
        // "WrappedArray(v1, v2, ...)", so strip the "WrappedArray(" prefix (13 chars)
        // and the trailing ")"
        String str = s.get(5).toString();
        String datas = str.substring(13, str.length() - 1);

        String[] data = datas.split(",");

        // central-difference slope between every pair of values two steps apart
        List<Double> list = new ArrayList<Double>();
        double k;
        for (int i = 0; i < data.length - 2; i++) {
            k = (Double.valueOf(data[i + 2]) - Double.valueOf(data[i])) / 2;
            list.add(Double.valueOf(k));
        }

        return list;
    }
}
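
As a rough illustration of what the mapper does with one group (a sketch: the row is built by hand with RowFactory and all column values are made up), the collected values "10.0, 12.0, 11.0, 15.0" yield two central-difference slopes:

// hypothetical row: columns 0-4 are the group keys, column 5 is the collected values
Row r = org.apache.spark.sql.RowFactory.create(
        "shopA", "cateB", "acct", "2016", "acc",
        "WrappedArray(10.0, 12.0, 11.0, 15.0)");

List<Double> slopes = new linearegression().call(r);
System.out.println(slopes);   // [0.5, 1.5]  ->  (11.0 - 10.0)/2 and (15.0 - 12.0)/2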

 

Check the job's execution status (for example in the Spark web UI).

 
