- 易於編程
- 擴展性良好
- 高容錯性
- 簡單的數據統計,比如網站的 PV、UV
- 搜索引擎建立索引
- 在搜索引擎中統計最流行的搜索詞
- 統計搜索詞出現的頻率
- DAG 計算:當多個應用程序存在依賴關係,且後一個應用的輸入來自前一個的輸出時,這種情況不適合用 MapReduce
上面我們總結出了大致的執行過程,接下來看看具體的執行過程如何
5. 數據接下來傳給Reduce進行處理,它處理完後,生成key3和value3
public class WordCountMapper extends Mapper<LongWritable,Text,Text,IntWritable>{

    // Reusable output objects: context.write() serializes them immediately,
    // so reusing them avoids allocating two objects for every token.
    private final Text word = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Splits each input line on single spaces and emits a (token, 1) pair
     * for every non-empty token.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   one line of input text
     * @param context Hadoop context used to emit (word, 1) pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String token : value.toString().split(" ")) {
            // split(" ") produces empty strings for consecutive spaces;
            // skip them so we never count an empty "word".
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, ONE);
        }
    }
}
public class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

    /**
     * Sums the partial counts emitted for one word and writes the total.
     *
     * @param key     the word being aggregated
     * @param values  all partial counts (each a 1 from the mapper) for this word
     * @param context Hadoop context used to emit (word, total) pairs
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate every partial count grouped under this key.
        int total = 0;
        for (IntWritable count : values) {
            total += count.get();
        }
        context.write(key, new IntWritable(total));
    }
}
public class MRRunJob {

    /**
     * Configures and submits the word-count MapReduce job: reads text files
     * from /usr/input/data/wc/ on HDFS and writes (word, count) pairs to
     * /usr/output/data/wc/.
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // NameNode entry point.
        conf.set("fs.defaultFS","hdfs://192.168.2.4:8020");

        Job job;
        try {
            job = Job.getInstance(conf,"mywc");
        } catch (IOException e) {
            // Without a Job instance nothing below can run; bail out instead of
            // falling through into a NullPointerException (the original bug).
            e.printStackTrace();
            return;
        }

        // Class whose jar will be shipped to the cluster.
        job.setJarByClass(MRRunJob.class);
        // Mapper class.
        job.setMapperClass(WordCountMapper.class);
        // Reducer class.
        job.setReducerClass(WordCountReducer.class);
        // Output key type (also used as the map output key type, since no
        // separate setMapOutputKeyClass is configured).
        job.setOutputKeyClass(Text.class);
        // Output value type (likewise doubles as the map output value type).
        job.setOutputValueClass(IntWritable.class);

        try {
            job.setWorkingDirectory(new Path("/"));
            System.out.println(job.getWorkingDirectory());
            // Directory the input files are read from.
            FileInputFormat.addInputPath(job,new Path("/usr/input/data/wc/"));
            // Directory the results are written to.
            // NOTE: the job fails if this output directory already exists.
            FileOutputFormat.setOutputPath(job,new Path("/usr/output/data/wc/"));
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
public class ETLMapper extends Mapper<LongWritable,Text,Text,NullWritable> {

    // Index of the epoch-seconds timestamp field in each CSV record.
    private static final int TIMESTAMP_FIELD = 3;

    // One formatter per mapper instance. SimpleDateFormat is not thread-safe,
    // but a Hadoop map task calls map() from a single thread, so hoisting it
    // here is safe and avoids allocating a formatter for every record.
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    /**
     * Rewrites each comma-separated record, converting field 3 from epoch
     * seconds into a "yyyy-MM-dd HH:mm:ss" timestamp; all other fields pass
     * through unchanged. The rebuilt record is emitted as the key with a
     * NullWritable value.
     *
     * NOTE: like the original, every emitted record keeps a trailing comma,
     * so the downstream output format is unchanged.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   one CSV record
     * @param context Hadoop context used to emit the transformed record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split(",");
        // StringBuilder replaces the original O(n^2) string concatenation.
        StringBuilder record = new StringBuilder();
        for (int i = 0; i < fields.length; i++) {
            if (i == TIMESTAMP_FIELD) {
                // Epoch seconds -> milliseconds arithmetically, instead of the
                // original string-concatenation trick of appending "000".
                long millis = Long.parseLong(fields[i]) * 1000L;
                record.append(dateFormat.format(millis)).append(',');
            } else {
                record.append(fields[i]).append(',');
            }
        }
        context.write(new Text(record.toString()), NullWritable.get());
    }
}
public class ETLReducer extends Reducer<Text,NullWritable,NullWritable,Text> {

    /**
     * Writes each distinct record received from the mapper to the output.
     * Because identical records share a key, duplicates collapse into a
     * single reduce call and are emitted once.
     *
     * @param key     the full transformed record text
     * @param values  NullWritable placeholders, one per occurrence (unused)
     * @param context Hadoop context used to emit the record
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Emit with a NullWritable key so only the record text reaches the file.
        Text record = key;
        context.write(NullWritable.get(), record);
    }
}
public class MRRunJob {

    /**
     * Configures and submits the ETL MapReduce job: uploads a local input file
     * to HDFS, clears any stale output directory, then runs the
     * ETLMapper/ETLReducer pipeline.
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // NameNode entry point.
        conf.set("fs.defaultFS","hdfs://192.168.2.4:8020");

        FileSystem fs;
        Job job;
        try {
            fs = FileSystem.get(conf);
            job = Job.getInstance(conf,"mywc");
        } catch (IOException e) {
            // Without a filesystem handle and a Job nothing below can run; the
            // original swallowed this and then hit a NullPointerException.
            e.printStackTrace();
            return;
        }

        // Class whose jar will be shipped to the cluster.
        job.setJarByClass(MRRunJob.class);
        // Mapper class.
        job.setMapperClass(ETLMapper.class);
        // Reducer class.
        job.setReducerClass(ETLReducer.class);
        // Output key/value types; with no setMapOutputKeyClass/ValueClass set,
        // these also describe the map output (Text key, NullWritable value).
        // NOTE(review): the reducer itself emits (NullWritable, Text) — verify
        // this configuration against the chosen OutputFormat.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        try {
            // HDFS directory the input is read from; create it if missing.
            Path inputPath = new Path("/usr/input/data/etl/");
            if (!fs.exists(inputPath)) {
                fs.mkdirs(inputPath);
            }
            // Upload the local source file to HDFS.
            // NOTE(review): hard-coded Windows path — consider passing it via args.
            Path src = new Path("D:\\IdeaProjects\\MapReduceTest\\ETLDemo\\etl01.txt");
            fs.copyFromLocalFile(src, inputPath);
            FileInputFormat.addInputPath(job, inputPath);

            // Output directory; delete a stale one so the job can be re-run
            // (Hadoop fails fast if the output directory already exists).
            Path outputPath = new Path("/usr/output/data/etl01/");
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
            FileOutputFormat.setOutputPath(job, outputPath);

            // Report the result instead of discarding it (original stored it
            // in an unused local variable).
            boolean succeeded = job.waitForCompletion(true);
            System.out.println("ETL job " + (succeeded ? "succeeded" : "failed"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}