Implementing the same SQL operations in Spark with RDD and Dataset

The data to analyze comes from nginx access logs that have already been preprocessed into a plain txt file recording only the client IP and the requested URL, one space-separated pair per line.
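A hypothetical sample line in that format (the URL is invented for illustration; the IP is the one used in the queries below):

    115.217.254.106 /index.html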

Reading the txt file with an RDD:

    public static void rdd(){
        SparkConf conf = new SparkConf().setAppName("name_rdd").setMaster("local[4]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // textFile is a class-level field holding the path to the input file
        JavaRDD<String> logRdd = sc.textFile(textFile);

        // Parse each "ip url" line into a LogInfo bean
        JavaRDD<LogInfo> infoRdd = logRdd.map(new Function<String, LogInfo>() {
            public LogInfo call(String line) throws Exception {
                String[] strs = line.split(" ");
                LogInfo log = new LogInfo();
                if (strs.length == 2) {
                    log.setIp(strs[0]);
                    log.setUrl(strs[1]);
                } else {
                    // malformed line: keep an empty record instead of failing
                    log.setIp("");
                    log.setUrl("");
                }
                return log;
            }
        });

        // Uncomment the query you want to run:
//        rdd1(infoRdd);
//        rdd2(infoRdd);
//        rdd3(infoRdd);
    }

Reading the txt file with a Dataset:

    public static void dataset(){
        // spark is a class-level SparkSession field so the query methods can reuse it
        spark = SparkSession
                .builder()
                .appName("name_dataset")
                .master("local[4]")
                .getOrCreate();
        Dataset<String> logDataset = spark.read().textFile(textFile);
        // Parse each "ip url" line into a LogInfo bean, using a bean encoder
        Dataset<LogInfo> mapDataset = logDataset.map(new MapFunction<String, LogInfo>() {
            public LogInfo call(String line) throws Exception {
                String[] strs = line.split(" ");
                LogInfo log = new LogInfo();
                if (strs.length == 2) {
                    log.setIp(strs[0]);
                    log.setUrl(strs[1]);
                } else {
                    // malformed line: keep an empty record, matching the RDD version
                    log.setIp("");
                    log.setUrl("");
                }
                return log;
            }
        }, Encoders.bean(LogInfo.class));

        // Uncomment the query you want to run:
//        dataset1(mapDataset);
//        dataset2(mapDataset);
//        dataset3(mapDataset);
    }
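Both read methods assume some class-level scaffolding that the post never shows: a textFile field holding the input path, a spark field holding the SparkSession, and the LogInfo bean itself. Encoders.bean requires a public class with a no-arg constructor and getter/setter pairs, and the RDD closures require it to be Serializable. A minimal sketch, with a placeholder path:

    // Assumed class-level fields (the path is a placeholder, not from the post)
    private static String textFile = "/tmp/access_log.txt";
    private static SparkSession spark;

    // LogInfo as a plain JavaBean, as required by Encoders.bean(LogInfo.class)
    public static class LogInfo implements java.io.Serializable {
        private String ip;
        private String url;
        private Long count;

        public String getIp() { return ip; }
        public void setIp(String ip) { this.ip = ip; }
        public String getUrl() { return url; }
        public void setUrl(String url) { this.url = url; }
        public Long getCount() { return count; }
        public void setCount(Long count) { this.count = count; }
    }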

1. SQL query: select * from visit_info where ip = '115.217.254.106'

RDD implementation

    public static void rdd1(JavaRDD<LogInfo> infoRdd){
        System.out.println("infoRdd.count()==================" + infoRdd.count());
        // Keep only records with the matching ip; calling equals on the
        // constant makes the explicit null check unnecessary
        JavaRDD<LogInfo> filterRdd = infoRdd.filter(new Function<LogInfo, Boolean>(){
            public Boolean call(LogInfo log) throws Exception {
                return "115.217.254.106".equals(log.getIp());
            }
        });
        System.out.println("filterRdd.count()==================" + filterRdd.count());
    }

Dataset implementation

    public static void dataset1(Dataset<LogInfo> mapDataset){
        // Approach 1: a typed FilterFunction
        Dataset<LogInfo> filter1Dataset = mapDataset.filter(new FilterFunction<LogInfo>(){
            public boolean call(LogInfo log) throws Exception {
                return "115.217.254.106".equals(log.getIp());
            }
        });
        filter1Dataset.show();
        // Approach 2: a SQL condition string passed to filter
        Dataset<LogInfo> filter2Dataset = mapDataset.filter(" ip = '115.217.254.106' ");
        filter2Dataset.show();
        // Approach 3: where, an alias of filter
        Dataset<LogInfo> selectDataset = mapDataset.where(" ip = '115.217.254.106' ");
        selectDataset.show();
        // Approach 4: register a temp view and run the SQL statement directly
        mapDataset.createOrReplaceTempView("visit_info");
        Dataset<Row> sqlDataset = spark.sql("select * from visit_info where ip = '115.217.254.106' ");
        sqlDataset.show();
    }
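For completeness, the untyped Column API offers a fifth equivalent form, not shown in the original post; a minimal sketch, assuming the usual static import of org.apache.spark.sql.functions.col:

    // Approach 5 (sketch): filter with a Column expression
    // import static org.apache.spark.sql.functions.col;
    mapDataset.filter(col("ip").equalTo("115.217.254.106")).show();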

2. SQL query: select url, count(1) as co from visit_info group by url order by co desc limit 0,10 (note that the MySQL-style limit 0,10 is not valid Spark SQL, where it is written as limit 10)

RDD implementation

    public static void rdd2(JavaRDD<LogInfo> infoRdd){
        // Map each record to a (url, 1) pair
        JavaPairRDD<String, Long> pairRDD = infoRdd.mapToPair(new PairFunction<LogInfo, String, Long>(){
            public Tuple2<String, Long> call(LogInfo t) throws Exception {
                return new Tuple2<String, Long>(t.getUrl(), 1L);
            }
        });

        // Sum the counts per url (the group by / count step)
        JavaPairRDD<String, Long> reduceRDD = pairRDD.reduceByKey(new Function2<Long, Long, Long>(){
            public Long call(Long v1, Long v2) throws Exception {
                return v1 + v2;
            }
        });

        // Move the counts back into LogInfo beans
        JavaRDD<LogInfo> countRdd = reduceRDD.map(t -> {
            LogInfo log = new LogInfo();
            log.setUrl(t._1);
            log.setCount(t._2);
            return log;
        });

        // order by co desc, then take the first 10 (the limit step)
        JavaRDD<LogInfo> sortRdd = countRdd.sortBy(log -> log.getCount(), false, countRdd.getNumPartitions());
        sortRdd.take(10).forEach(t -> {
            System.out.println("sortRdd==================" + t.getUrl() + " " + t.getCount());
        });
    }
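The same pipeline can be written more compactly with Java 8 lambdas and method references; the sketch below (not from the original post) sorts by swapping each pair so the count becomes the key:

    public static void rdd2Compact(JavaRDD<LogInfo> infoRdd){
        infoRdd.mapToPair(log -> new Tuple2<>(log.getUrl(), 1L)) // (url, 1)
                .reduceByKey(Long::sum)                          // (url, count)
                .mapToPair(Tuple2::swap)                         // (count, url)
                .sortByKey(false)                                // order by count desc
                .take(10)                                        // limit 10
                .forEach(t -> System.out.println(t._2 + " " + t._1));
    }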

Dataset implementation

    public static void dataset2(Dataset<LogInfo> mapDataset){
        // Approach 1: the typed aggregation API (count is the static import
        // org.apache.spark.sql.functions.count); limit(10) matches the SQL limit
        mapDataset.groupBy("url").agg(count("url").as("co")).orderBy(functions.desc("co")).limit(10).show();
        // Approach 2: register a temp view and run the SQL statement directly
        mapDataset.createOrReplaceTempView("visit_info");
        Dataset<Row> sqlDataset = spark.sql(" select url, count(1) as co from visit_info group by url order by co desc limit 10 ");
        sqlDataset.show();
    }

3. SQL query: select distinct url from visit_info

RDD implementation

    public static void rdd3(JavaRDD<LogInfo> infoRdd){
        // Map to (url, 1) pairs; since the value is always the constant 1L,
        // distinct() on the pairs deduplicates by url. (Mapping straight to
        // the url and calling distinct() on that would be even simpler.)
        JavaPairRDD<String, Long> pairRDD = infoRdd.mapToPair(new PairFunction<LogInfo, String, Long>(){
            public Tuple2<String, Long> call(LogInfo t) throws Exception {
                return new Tuple2<String, Long>(t.getUrl(), 1L);
            }
        });
        pairRDD.distinct().collect().forEach(t -> {
            System.out.println("distinct==================" + t._1 + " " + t._2);
        });
    }

Dataset implementation

    public static void dataset3(Dataset<LogInfo> mapDataset){
        // Approach 1: distinct() on the projected column
        mapDataset.select("url").distinct().show();
        // Approach 2: register a temp view and run the SQL statement directly
        mapDataset.createOrReplaceTempView("visit_info");
        Dataset<Row> sqlDataset = spark.sql(" select distinct url from visit_info ");
        sqlDataset.show();
    }
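A related option not covered above is dropDuplicates, which deduplicates on a column while keeping the full rows; a minimal sketch (in Java the column names are passed as a String array):

    // Sketch: keep one full LogInfo row per distinct url
    mapDataset.dropDuplicates(new String[]{"url"}).show();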

 
