【hadoop】MapReduce數據清洗及導入數據到hive倉庫

搜狗用戶查詢日誌分析綜合案例

一.MapReduce數據清洗

1.數據清洗要求

（1）解決亂碼問題
（2）過濾少於6個字段的行
（3）統一字段之間的分隔符（統一用逗號）
（4）在每行前添加年，月，日字段。
清洗前的數據

清洗後的數據

2.準備原始數據

說明：1.該數據來自搜狗實驗室數據下載地址
2.年月日三個字段在文件名中
3.該數據已經上傳到HDFS

3.代碼詳解

這個案例只需要Mapper類和Driver類即可
Mapper類

package xiexianyouSogouLogClean;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.tracing.SpanReceiverInfo;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Map-side cleaner for Sogou query-log records.
 *
 * <p>For every input line the mapper:
 * <ol>
 *   <li>decodes the raw record bytes as GBK,</li>
 *   <li>splits the line on runs of whitespace and drops it unless it has exactly
 *       {@link #EXPECTED_FIELDS} fields,</li>
 *   <li>takes the first run of digits in the input file name, inserts a comma after the
 *       4th and 6th digit (e.g. {@code 20060804 -> 2006,08,04}) and prepends the result
 *       to the first field,</li>
 *   <li>emits the fields joined with commas as the key, with a {@link NullWritable} value.</li>
 * </ol>
 *
 * <p>If the file name contains no digits the record is still emitted, just without the
 * date prefix.
 */
public class SogouCleanMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** A record is kept only when it splits into exactly this many fields. */
    private static final int EXPECTED_FIELDS = 6;

    /** Charset name of the raw log files. */
    private static final String SOURCE_ENCODING = "gbk";

    /** Field separator of the raw records: one or more whitespace characters. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Matches the first run of digits in the input file name. */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /**
     * Cleans one log line and writes it to the context.
     *
     * @param key     offset of the line in the input (unused)
     * @param value   raw bytes of one input line
     * @param context task context; supplies the input split and receives the output
     * @throws IOException          if decoding or writing the record fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Text.getBytes() exposes the backing buffer, which is only valid up to getLength().
        // Decoding the whole array could append stale bytes left over from a longer record.
        String line = new String(value.getBytes(), 0, value.getLength(), SOURCE_ENCODING);

        String[] words = WHITESPACE.split(line);
        if (words.length != EXPECTED_FIELDS) {
            // Malformed record: skip it.
            return;
        }

        // The date is carried by the name of the file this split belongs to.
        String filename = ((FileSplit) context.getInputSplit()).getPath().getName();
        Matcher matcher = DIGITS.matcher(filename);
        if (matcher.find()) {
            // Prepend "yyyy,MM,dd" to the first field so the joined line starts with the date.
            words[0] = insertDateSeparators(matcher.group()) + "," + words[0];
        }

        context.write(new Text(String.join(",", words)), NullWritable.get());
    }

    /**
     * Inserts a comma after the 4th and after the 6th character of {@code digits}.
     * Shorter inputs simply receive fewer commas.
     */
    private static String insertDateSeparators(String digits) {
        StringBuilder dated = new StringBuilder(digits.length() + 2);
        for (int i = 0; i < digits.length(); i++) {
            dated.append(digits.charAt(i));
            if (i == 3 || i == 5) {
                dated.append(',');
            }
        }
        return dated.toString();
    }
}

Driver類

package xiexianyouSogouLogClean;

import hiveMapReduce.hiveDriver;
import hiveMapReduce.hiveMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * Entry point that configures and submits the Sogou log-cleaning job.
 *
 * <p>Usage: {@code SogouCleanDriver <input path> <output path>}. An existing output path
 * is deleted recursively before the job is submitted.
 */
public class SogouCleanDriver {
    /**
     * Builds the job, runs it and waits for completion.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: SogouCleanDriver <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Lets Hadoop locate the jar that contains this class.
        job.setJarByClass(SogouCleanDriver.class);
        job.setMapperClass(SogouCleanMapper.class);
        // No reducer class or reduce-task count is set, so Hadoop's defaults apply.

        // Plain-text input and output.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Types of the emitted key/value pairs.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // The job refuses to start if the output directory exists, so remove it first.
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = outputPath.getFileSystem(conf);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outputPath);

        // Submit and block until the job finishes.
        boolean result = job.waitForCompletion(true);
        if (result) {
            System.out.print("牛逼");
        } else {
            // Report failure to the caller (shell, scheduler) through the exit status.
            System.exit(1);
        }
    }
}

4.打jar包上傳到集羣運行

（如何打jar包和上傳jar包這裏不再贅述）

5.運行jar包

6.查看結果文件

4個文件中的數據都聚合在了這一個文件裏（Driver中沒有設置Reducer數量，默認只有1個Reducer，所以只生成一個結果文件）

二.導入數據到hive倉庫

1.創建存放數據的臨時表

-- Staging table: one column per comma-separated field of the cleaned output.
CREATE TABLE sogou2_test (
    year    INT,
    month   INT,
    day     INT,
    userid  STRING,
    keyword STRING,
    rank    INT,
    clickid INT,
    url     STRING,
    detail  STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

2.創建動態分區表

year ,month ,day 爲分區字段

-- Target table, partitioned by year / month / day.
CREATE TABLE sogou2 (
    userid  STRING,
    keyword STRING,
    rank    INT,
    clickid INT,
    url     STRING,
    detail  STRING
)
PARTITIONED BY (year INT, month INT, day INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

3.將臨時表sogou2_test數據導入sogou2

導入語句
注意：字段不能用 * 代替，分區字段應該寫最後。

-- All three partition columns are dynamic. Hive's default "strict" mode requires at least
-- one static partition, so switch this session to nonstrict mode before inserting.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

-- The partition columns must come last in the select list, in the declared partition order.
insert into sogou2 partition(year,month,day)
select userid,keyword,rank,clickid,url,detail,year,month,day from sogou2_test;

4.查看導入情況

數據已經成功地分區
文件夾大小顯示 0 是正常現象：HDFS 的目錄本身不佔用存儲空間，所以目錄的大小顯示爲 0，裏面的文件中的的確確有數據。

5.查看分區中的數據

三.總結

1.本案例總體上比較簡單
2.難點在於如何獲取到文件名中的日期
3.本題解法不唯一，如果有更好的解法，歡迎大佬到評論區討論

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

【hadoop】MapReduce數據清洗及導入數據到hive倉庫

搜狗用戶查詢日誌分析綜合案例

一.MapReduce數據清洗

1.數據清洗要求

2.準備原始數據

3.代碼詳解

4.打jar包上傳到集羣運行

5.運行jar包

6.查看結果文件

二.導入數據到hive倉庫

1.創建存放數據的臨時表

2.創建動態分區表

3.將臨時表sogou2_test數據導入sogou2

4.查看導入情況

5.查看分區中的數據

三.總結

【python&爬蟲】快速入門Scrapy框架

【python&爬蟲】快速入門JSON和JSONPath

【spark學習】SparkStreaming將採集結果存儲MySQL數據庫

【python&爬蟲】selenium爬取淘寶商品圖片

【hadoop】MapReduce數據清洗及導入數據到hive倉庫

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結