Lab assignment:
Write a MapReduce program that analyzes a weather data set (the 2018 records from monitoring stations in China, under ftp://ftp.ncdc.noaa.gov/pub/data/noaa) and, for each month of 2018, reports the temperature, humidity, latitude, longitude, atmospheric pressure, and other details of the station that recorded the highest temperature. Requirements:
1. Output format:
201801 320,330,52130,122520,10264
201802 350,360,12330,543270,10463
2. Wrap the temperature, humidity, latitude, longitude, atmospheric pressure, and related fields in a custom class.
3. Use a combine (Combiner) step in the shuffle phase.
4. Split the results across 2-4 output files (via a Partitioner).
Weather data format (column ranges are 1-based):
1-4     0169
5-10    501360   # USAF weather station identifier
11-15   99999    # WBAN weather station identifier
16-23   20170101 # observation date
24-27   0000     # observation time
28      4
29-34   +52130   # latitude (degrees x 1000)
35-41   +122520  # longitude (degrees x 1000)
42-46   FM-12
47-51   +0433    # elevation (meters)
52-56   99999
57-60   V020
61-63   220      # wind direction
64      1        # quality code
65      N
66-69   0010
70      1
71-75   02600    # cloud height (meters)
76      1
77      9
78      9
79-84   003700   # visibility (meters)
85      1
86      9
87      9
88-92   -0327    # air temperature (degrees Celsius x 10)
93      1
94-98   -0363    # dew point temperature (degrees Celsius x 10)
99      1
100-104 10264    # atmospheric pressure
105     1
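Since every field occupies a fixed 1-based column range while Java's String.substring is 0-based and end-exclusive, extraction is pure index arithmetic. A small standalone demo of that arithmetic (ExtractDemo is a scratch class, not part of the job; the sample line is the record from the table above joined into one 105-character string):

public class ExtractDemo {
    public static void main(String[] args) {
        String line = "0169501360999992017010100004+52130+122520FM-12"
                + "+043399999V0202201N0010102600199003700199-03271-03631102641";
        System.out.println(line.substring(15, 21));                 // 201701 (yyyyMM)
        System.out.println(line.substring(87, 92));                 // -0327  (air temperature x 10)
        System.out.println(line.substring(28, 34).substring(1, 6)); // 52130  (latitude, sign stripped)
        System.out.println(line.substring(99, 104));                // 10264  (atmospheric pressure)
    }
}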
Approach:
Every field we need (temperature, humidity, latitude/longitude, and so on) sits at a fixed column range in each line of text, so it can be pulled out with substring operations; that is the map task's job. The map phase yields a series of <month, bean> key/value pairs, for example <"201801", MyBean1>. Keying on the month rather than the full date lets reduce group an entire month's records together (and keeps the combiner from changing keys). Before the pairs reach reduce, a custom Partitioner splits them by month into two partitions (January-June and July-December; the number of partitions must match the number of reducers). Each reduce call then scans one month's partitioned records, finds the one with the highest temperature, and writes it out. The split rule is sketched right below.
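As a plain-Java illustration of that split rule (MonthSplitSketch is just a scratch class; the real logic lives in MyPartioner further down):

public class MonthSplitSketch {
    // Maps a yyyyMM key to one of the two partitions described above.
    static int partitionForMonth(String yyyyMM) {
        int month = Integer.parseInt(yyyyMM.substring(4, 6)); // "201803" -> 3
        return month <= 6 ? 0 : 1;
    }
    public static void main(String[] args) {
        System.out.println(partitionForMonth("201803")); // 0: January-June
        System.out.println(partitionForMonth("201811")); // 1: July-December
    }
}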
Project structure:
Implementation:
MyDriver.java
One thing to note here: the number of reduce tasks must be set explicitly, and it has to match the number of partitions defined in a moment.
package com.jxufe.xzy.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://Master:9000");
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");

        Job job = Job.getInstance(conf);
        job.setJarByClass(MyDriver.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // The reducer doubles as the combiner: its input and output types both
        // match the map output (Text, MyBean), and taking a maximum is safe to
        // apply once per spill and again at the reducer.
        job.setCombinerClass(MyReducer.class);
        job.setPartitionerClass(MyPartioner.class);
        // Must equal the number of partitions MyPartioner produces
        // (two here: Jan-Jun and Jul-Dec).
        job.setNumReduceTasks(2);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(MyBean.class);

        FileInputFormat.setInputPaths(job, new Path("/input/cndcdata.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
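To run it, compile the classes, package them into a jar (the name is arbitrary, say weather.jar), upload the data file to /input on HDFS, and submit the job with something like hadoop jar weather.jar com.jxufe.xzy.hadoop.MyDriver. Keep in mind that /output must not already exist, or FileOutputFormat will abort the job.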
Mapper class: MyMapper.java
package com.jxufe.xzy.hadoop;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class MyMapper extends Mapper<LongWritable, Text, Text, MyBean> {
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String strValue = value.toString();
        // Key on the month (yyyyMM, columns 16-21) so that reduce groups a
        // whole month's records together and the combiner never changes keys.
        String dateStr = strValue.substring(15, 21);
        // Extract each field by its fixed column range (1-based columns,
        // 0-based substring), dropping the sign/leading characters.
        String temprature = strValue.substring(87, 92).substring(2, 5); // air temperature, columns 88-92
        String humidity = strValue.substring(93, 98).substring(2, 5);   // dew point (columns 94-98), used as the humidity field
        String latitude = strValue.substring(28, 34).substring(1, 6);   // columns 29-34
        String longitude = strValue.substring(34, 41).substring(1, 7);  // columns 35-41
        String pressure = strValue.substring(99, 104);                  // columns 100-104
        MyBean mb = new MyBean(temprature, humidity, latitude, longitude, pressure);
        Text text = new Text();
        text.set(dateStr);
        context.write(text, mb);
    }
}
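One caveat inherited from the required output format: substring(2, 5) drops the sign, so a sub-zero reading such as -0327 is stored as 327 and can outrank a genuinely warmer record in the reducer's max comparison. If signed comparison matters, keep the sign here and strip it only when formatting the output.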
Custom entity class: MyBean.java
Note that the field order used in serialization must match the order used in deserialization, one to one.
package com.jxufe.xzy.hadoop;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class MyBean implements Writable {
    private String temprature;
    private String humidity;
    private String latitude;
    private String longitude;
    private String pressure;

    // A no-argument constructor is required so Hadoop can instantiate
    // the bean reflectively before calling readFields().
    public MyBean() {
    }

    public MyBean(String temprature, String humidity, String latitude, String longitude, String pressure) {
        this.temprature = temprature;
        this.humidity = humidity;
        this.latitude = latitude;
        this.longitude = longitude;
        this.pressure = pressure;
    }

    public String getTemprature() { return temprature; }
    public void setTemprature(String temprature) { this.temprature = temprature; }
    public String getHumidity() { return humidity; }
    public void setHumidity(String humidity) { this.humidity = humidity; }
    public String getLatitude() { return latitude; }
    public void setLatitude(String latitude) { this.latitude = latitude; }
    public String getLongitude() { return longitude; }
    public void setLongitude(String longitude) { this.longitude = longitude; }
    public String getPressure() { return pressure; }
    public void setPressure(String pressure) { this.pressure = pressure; }

    @Override
    public String toString() {
        return temprature + "," + humidity + "," + latitude + "," + longitude + "," + pressure;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // The field order here must match readFields() exactly.
        dataOutput.writeUTF(temprature);
        dataOutput.writeUTF(latitude);
        dataOutput.writeUTF(longitude);
        dataOutput.writeUTF(pressure);
        dataOutput.writeUTF(humidity);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.temprature = dataInput.readUTF();
        this.latitude = dataInput.readUTF();
        this.longitude = dataInput.readUTF();
        this.pressure = dataInput.readUTF();
        this.humidity = dataInput.readUTF();
    }
}
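Because the field-order contract is easy to break, a quick local round trip through a byte stream is a cheap sanity check. This is a throwaway sketch (MyBeanRoundTrip is not part of the job), but it exercises exactly the write()/readFields() pair above:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class MyBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        MyBean in = new MyBean("327", "363", "52130", "122520", "10264");
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf));
        MyBean out = new MyBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(out); // expect: 327,363,52130,122520,10264
    }
}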
Custom partitioner: MyPartioner.java
package com.jxufe.xzy.hadoop;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartioner extends Partitioner<Text, MyBean> {
    @Override
    public int getPartition(Text key, MyBean value, int reducerNum) {
        // The key is a yyyyMM string, so characters 4-6 hold the month (01-12).
        int month = Integer.parseInt(key.toString().substring(4, 6));
        // Two partitions: January-June go to reducer 0, July-December to reducer 1.
        if (month <= 6) {
            return 0;
        } else {
            return 1;
        }
    }
}
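The assignment allows anywhere from 2 to 4 output files. For four files, a quarter-based variant of the same idea would look like the sketch below (QuarterPartioner is a hypothetical name; remember to also change job.setNumReduceTasks(2) to 4 in the driver):

package com.jxufe.xzy.hadoop;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class QuarterPartioner extends Partitioner<Text, MyBean> {
    @Override
    public int getPartition(Text key, MyBean value, int reducerNum) {
        int month = Integer.parseInt(key.toString().substring(4, 6)); // 1..12
        return (month - 1) / 3; // quarters map to partitions 0..3
    }
}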
Reducer class: MyReducer.java
package com.jxufe.xzy.hadoop;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyReducer extends Reducer<Text, MyBean, Text, MyBean> {
    @Override
    public void reduce(Text key, Iterable<MyBean> values, Context context) throws IOException, InterruptedException {
        // Find the record with the highest temperature in this month's group.
        int max = Integer.MIN_VALUE;
        Text keyOut = new Text(key.toString().substring(0, 6));
        MyBean mb = new MyBean();
        for (MyBean value : values) {
            int t = Integer.parseInt(value.getTemprature());
            if (t > max) {
                max = t;
                // Hadoop reuses the same bean object across iterations, so the
                // fields must be copied instead of keeping a reference to `value`.
                mb = new MyBean(value.getTemprature(), value.getHumidity(),
                        value.getLatitude(), value.getLongitude(), value.getPressure());
            }
        }
        context.write(keyOut, mb);
    }
}
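A note on why MyReducer can double as the combiner: its input and output types both match the map output types (Text, MyBean), it leaves the key unchanged (the key is already yyyyMM when it leaves the mapper, so substring(0, 6) is a no-op), and taking a maximum gives the same answer whether it runs once at the reducer or first per map spill and then again at the reducer.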
Run result: