3.1.8 NLineInputFormat使用案例
1.需求
- 對每個單詞進行個數統計,要求根據每個輸入文件的行數來規定輸出多少個切片。此案例要求每三行放入一個切片中。
(1)輸入數據
banzhang ni hao
xihuan hadoop banzhang
banzhang ni hao
xihuan hadoop banzhang
banzhang ni hao
xihuan hadoop banzhang
banzhang ni hao
xihuan hadoop banzhang
banzhang ni hao
xihuan hadoop banzhang banzhang ni hao
xihuan hadoop banzhang
(2)期望輸出數據
Number of splits:4
2.需求分析
- 使用本地的Hadoop 3.1.2對輸入數據進行測試,得到輸出數據。輸入數據共11行,每3行劃分為一個切片,因此共產生4個切片(3+3+3+2)。
3.代碼實現
(1)編寫Mapper類
/**
 * Mapper for the NLineInputFormat word-count example: splits each input line
 * on spaces and emits {@code (word, 1)} for every non-empty word.
 *
 * <p>The input key is not used; the input value is the text of one line.
 *
 * @author zhangyong
 * @version 1.0 (2020/3/6 9:17)
 */
public class NLineMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    /** Output key, reused across records and overwritten for each word. */
    private final Text k = new Text();
    /** Output value: the constant count 1 written with every word. */
    private final LongWritable v = new LongWritable(1);

    /**
     * Emits one {@code (word, 1)} pair per word of the given line.
     *
     * @param key     input key (unused)
     * @param value   the line to tokenize
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read the current line.
        String line = value.toString();
        // 2. Split on single spaces.
        String[] words = line.split(" ");
        // 3. Emit each word with a count of one.
        for (String word : words) {
            // A blank line, a leading space, or consecutive spaces make split(" ")
            // produce empty strings; those are not words and must not be counted.
            if (word.isEmpty()) {
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}
(2)編寫Reducer類
/**
 * Reducer for the NLineInputFormat word-count example: adds up all counts
 * received for a word and writes {@code (word, total)}.
 *
 * @author zhangyong
 * @version 1.0 (2020/3/6 9:18)
 */
public class NLineReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    /** Output value, reused across keys and overwritten with each total. */
    LongWritable v = new LongWritable();

    /**
     * Sums the counts for one key and writes the total.
     *
     * @param key     the word being reduced
     * @param values  the counts collected for that word
     * @param context sink for the {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Accumulate every count for this key.
        long total = 0L;
        for (LongWritable count : values) {
            total = total + count.get();
        }

        // 2. Publish the total through the reusable output value.
        v.set(total);
        context.write(key, v);
    }
}
(3)編寫Driver類
/**
 * Driver for the NLineInputFormat word-count example. Configures a job that
 * uses {@code NLineInputFormat} with three lines per input split, wires in
 * {@code NLineMapper} and {@code NLineReducer}, and runs it.
 *
 * @author zhangyong
 * @version 1.0 (2020/3/6 9:18)
 */
public class NLineDriver {
    /** Input directory used when no paths are passed on the command line. */
    private static final String DEFAULT_INPUT_PATH = "src/main/resources/nlinei/";
    /** Output directory used when no paths are passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "src/main/resources/nlineo";
    /** Number of input lines placed into each split. */
    private static final int LINES_PER_SPLIT = 3;

    /**
     * Runs the job.
     *
     * @param args optional {@code [inputPath, outputPath]}; when fewer than two
     *             arguments are given, the built-in default paths are used
     * @throws IOException            if file-system access or job submission fails
     * @throws URISyntaxException     declared for signature compatibility; not thrown by this body
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        // Use paths from the command line when both are supplied; otherwise fall
        // back to the project-local defaults instead of always overwriting args.
        final boolean hasPaths = args != null && args.length >= 2;
        final String inputPath = hasPaths ? args[0] : DEFAULT_INPUT_PATH;
        final String outputPath = hasPaths ? args[1] : DEFAULT_OUTPUT_PATH;

        // Run with the local job runner against the local file system.
        Configuration cfg = new Configuration();
        cfg.set("mapreduce.framework.name", "local");
        cfg.set("fs.defaultFS", "file:///");

        // Remove a stale output directory so the job can be re-run.
        // NOTE(review): the FileSystem is deliberately not closed here;
        // FileSystem.get may return a cached instance that the job also uses.
        final FileSystem filesystem = FileSystem.get(cfg);
        final Path output = new Path(outputPath);
        if (filesystem.exists(output)) {
            filesystem.delete(output, true);
        }

        // 1. Create the job.
        Job job = Job.getInstance(cfg);

        // 2. Put LINES_PER_SPLIT lines into each InputSplit and read input
        //    through NLineInputFormat.
        NLineInputFormat.setNumLinesPerSplit(job, LINES_PER_SPLIT);
        job.setInputFormatClass(NLineInputFormat.class);

        // 3. Set the jar location and associate the mapper and reducer.
        job.setJarByClass(NLineDriver.class);
        job.setMapperClass(NLineMapper.class);
        job.setReducerClass(NLineReducer.class);

        // 4. Set the map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 5. Set the final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 6. Set the input and output paths.
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, output);

        // 7. Submit the job and wait; report a failed job through the exit
        //    status instead of silently discarding the result.
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            System.exit(1);
        }
    }
}