SpringBoot Integration with Hadoop, Part 2 ---- MapReduce Word Frequency Count

        Building on the project set up in Part 1 of this series (SpringBoot Integration with Hadoop ---- File Operations on HDFS), this part uses MapReduce to run some statistics over data files. We implement a classic example: word frequency counting.

  Code:

package com.hadoop.reduce.mapper;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Word count mapper: tokenizes each line with the IK Analyzer and emits <word, 1> pairs
 * @author linhaiy
 * @date 2019.05.18
 */
public class WordCountMap extends Mapper<Object, Text, Text, IntWritable> {

	private final static IntWritable one = new IntWritable(1);
	private Text word = new Text();

	/**
	 * Reads sgyy.txt or dpcq.txt; the file content is plain novel text
	 * @param key
	 *            by default, the byte offset at which the current line starts in the input file
	 * @param value
	 *            by default, the content of the current line; Hadoop's serialized type is Text
	 * @param context
	 *            used to emit the result of the user-defined logic; the output key here is a word
	 * @throws IOException
	 * @throws InterruptedException
	 */
	@Override
	public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
		// Decode the bytes explicitly as UTF-8 to avoid garbled Chinese text
		String line = new String(value.getBytes(), 0, value.getLength(), "UTF-8").trim();
		if (StringUtils.isNotEmpty(line)) {
			// Use the IK tokenizer to split the line into common words/phrases; e.g. 我們 may be emitted as <我,1>, <們,1>, <我們,1>
			byte[] btValue = line.getBytes();
			InputStream inputStream = new ByteArrayInputStream(btValue);
			Reader reader = new InputStreamReader(inputStream);
			IKSegmenter ikSegmenter = new IKSegmenter(reader, true);
			Lexeme lexeme;
			while ((lexeme = ikSegmenter.next()) != null) {
				word.set(lexeme.getLexemeText());
				context.write(word, one);
			}
		}
	}
}
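
To see what the mapper actually emits, the IK tokenizer can be exercised on its own. The snippet below is only a sketch for experimentation (the class name IkSegmenterDemo and the sample sentence are made up for illustration); it uses the same IKSegmenter calls as WordCountMap above:

import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IkSegmenterDemo {
	public static void main(String[] args) throws Exception {
		// true enables smart segmentation, the same mode used in WordCountMap
		IKSegmenter segmenter = new IKSegmenter(new StringReader("我們都喜歡讀三國演義"), true);
		Lexeme lexeme;
		while ((lexeme = segmenter.next()) != null) {
			// Each printed token would become one <word, 1> pair in the mapper
			System.out.println(lexeme.getLexemeText());
		}
	}
}

Every token printed by the demo corresponds to one key that later reaches the reducer.
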
package com.hadoop.reduce.reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Word count reducer: sums the counts for each word emitted by the mapper
 * @author linhaiy
 * @date 2019.05.18
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
	private IntWritable result = new IntWritable();
	private String text = "孫權";
	private int textSum = 0;
	private List<String> textList = null;

	public WordCountReduce() {
		textList = new ArrayList<>();
		textList.add("曹操");
		textList.add("孫權");
	}

	/**
	 * @param key
	 *            the word itself, as passed in from the Mapper
	 * @param values
	 *            the IntWritable counts for this word computed by the MapReduce framework, e.g. "hello" occurred 11 times
	 * @param context
	 *            used to write the word and its total count to the output file
	 * @throws IOException
	 * @throws InterruptedException
	 */
	@Override
	public void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int sum = 0;
		for (IntWritable val : values) {
			sum += val.get();
		}
		result.set(sum);
		context.write(key, result);

		String keyStr = key.toString();
		// Without a tokenizer we would have to scan the text passed from the map phase and accumulate matches ourselves:
		// boolean isHas = keyStr.contains(text);
		// if (isHas) {
		//     textSum++;
		//     System.out.println("============ " + text + " 統計分詞爲: " + textSum + " ============");
		// }

		// With the tokenizer the counts are already aggregated, so we just print the words we care about
		if (textList.contains(keyStr)) {
			System.out.println("============ " + keyStr + " 統計分詞爲: " + sum + " ============");
		}
	}
}
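
Note that ReduceJobsUtils below registers this same class as the Combiner, so the debug println may also fire during combine passes on the map side (the summed totals stay correct because addition is associative). If that is a concern, a side-effect-free combiner along the following lines could be registered instead; this class is only a sketch and is not part of the original project:

package com.hadoop.reduce.reducer;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Hypothetical combiner that only sums partial counts, with no logging side effects.
 */
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
	private final IntWritable result = new IntWritable();

	@Override
	public void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int sum = 0;
		for (IntWritable val : values) {
			sum += val.get();
		}
		result.set(sum);
		context.write(key, result);
	}
}

It would replace WordCountReduce in the job.setCombinerClass(...) call shown later.
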
package com.hadoop.reduce.service;

import java.io.IOException;

import javax.annotation.PostConstruct;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import com.hadoop.reduce.mapper.WordCountMap;
import com.hadoop.reduce.reducer.WordCountReduce;

/**
 * MapReduce utility class
 * @author linhaiy
 * @date 2019.05.18
 */
@Component
public class ReduceJobsUtils {

	@Value("${hdfs.path}")
	private String path;

	private static String hdfsPath;

	/**
	 * Build the Hadoop configuration
	 * 
	 * @return
	 */
	public static Configuration getConfiguration() {
		Configuration configuration = new Configuration();
		configuration.set("fs.defaultFS", hdfsPath);
		configuration.set("mapred.job.tracker", hdfsPath);
		// Uncomment to run the job on a YARN cluster instead of the local runner
		// configuration.set("mapreduce.framework.name", "yarn");
		// This setting points the client at the node that hosts the ResourceManager
		// configuration.set("yarn.resourcemanager.hostname", "node1");
		return configuration;
	}

	/**
	 * Build and submit the word count job
	 * 
	 * @param jobName
	 * @param inputPath
	 * @param outputPath
	 * @throws IOException
	 * @throws ClassNotFoundException
	 * @throws InterruptedException
	 */
	public static void getWordCountJobsConf(String jobName, String inputPath, String outputPath)
			throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = getConfiguration();
		Job job = Job.getInstance(conf, jobName);

		job.setMapperClass(WordCountMap.class);
		job.setCombinerClass(WordCountReduce.class);
		job.setReducerClass(WordCountReduce.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// Combine small input files into larger splits
		job.setInputFormatClass(CombineTextInputFormat.class);
		// Maximum split size (4 MB)
		CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);
		// Minimum split size (2 MB)
		CombineTextInputFormat.setMinInputSplitSize(job, 2 * 1024 * 1024);

		FileInputFormat.addInputPath(job, new Path(inputPath));
		FileOutputFormat.setOutputPath(job, new Path(outputPath));
		job.waitForCompletion(true);
	}
	
	@PostConstruct
	public void getPath() {
		hdfsPath = this.path;
	}

	public static String getHdfsPath() {
		return hdfsPath;
	}
}
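
For quick local testing outside of Spring Boot, the same mapper and reducer can also be driven from a plain main method. The driver below is a sketch using the standard Hadoop job API; the class name WordCountDriver and the command-line argument paths are placeholders, not part of the original project:

package com.hadoop.reduce.service;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.hadoop.reduce.mapper.WordCountMap;
import com.hadoop.reduce.reducer.WordCountReduce;

/**
 * Hypothetical standalone driver for running the word count job locally.
 */
public class WordCountDriver {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "wordCount");
		// Needed when the job is packaged as a jar and submitted to a cluster
		job.setJarByClass(WordCountDriver.class);

		job.setMapperClass(WordCountMap.class);
		job.setReducerClass(WordCountReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// args[0] = input path, args[1] = output path (must not exist yet)
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
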
package com.hadoop.reduce.service;

import org.apache.commons.lang.StringUtils;
import org.springframework.stereotype.Service;

import com.hadoop.hdfs.service.HdfsService;

/**
 * Word count service
 * @author linhaiy
 * @date 2019.05.18
 */
@Service
public class MapReduceService {

	// Default reduce output directory
	private static final String OUTPUT_PATH = "/output";

	/**
	 * Word count: counts how many times each word occurs in the input
	 * @param jobName
	 * @param inputPath
	 * @throws Exception
	 */
	public void wordCount(String jobName, String inputPath) throws Exception {
		if (StringUtils.isEmpty(jobName) || StringUtils.isEmpty(inputPath)) {
			return;
		}
		// Output directory = /output/<current job>; delete it first if it exists so each run starts fresh
		String outputPath = OUTPUT_PATH + "/" + jobName;
		if (HdfsService.existFile(outputPath)) {
			HdfsService.deleteFile(outputPath);
		}
		ReduceJobsUtils.getWordCountJobsConf(jobName, inputPath, outputPath);
	}
}
package com.hadoop.reduce.controller;

import org.apache.commons.lang.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.RestController;
import com.hadoop.reduce.service.MapReduceService;
import com.hadoop.util.Result;

/**
 * MapReduce controller layer
 * @author linhaiy
 * @date 2019.05.18
 */
@RestController
@RequestMapping("/hadoop/reduce")
public class MapReduceAction {

	@Autowired
	MapReduceService mapReduceService;

	/**
	 * Word count (counts how often the specified key words appear)
	 * @param jobName
	 * @param inputPath
	 * @return
	 * @throws Exception
	 */
	@RequestMapping(value = "wordCount", method = RequestMethod.POST)
	@ResponseBody
	public Result wordCount(@RequestParam("jobName") String jobName, @RequestParam("inputPath") String inputPath)
			throws Exception {
		if (StringUtils.isEmpty(jobName) || StringUtils.isEmpty(inputPath)) {
			return new Result(Result.FAILURE, "請求參數爲空");
		}
		mapReduceService.wordCount(jobName, inputPath);
		return new Result(Result.SUCCESS, "單詞統計成功");
	}
}
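
Once the application is running, the job is triggered by sending a POST request to /hadoop/reduce/wordCount with the form parameters jobName and inputPath, for example jobName=wordCount and inputPath=/input/sgyy.txt (the /input directory is only an assumed location; the file must already have been uploaded to HDFS as described in Part 1). The word counts are then written to /output/<jobName> as tab-separated "word count" lines, which is the default TextOutputFormat layout.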

 
