MapReduce: Reading Multiple Files at Once
Goal: find the name of the file each line of data comes from
Learning objective: obtain the name of the file a record belongs to
The main change is in the map method.
In map(), call getInputSplit() on the context, cast the returned object to FileSplit, and then get the file name via getPath().getName().
Here is the code:
package com.damo01;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * @author 嗨皮騷
 * @version v 1.0
 * @date 2019/11/18
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // getInputSplit() returns an InputSplit; for file-based input formats the concrete type is FileSplit,
        // so we cast it to get access to the file path
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        // Get the path of the file this split belongs to
        Path path = fileSplit.getPath();
        // Extract the file name from the path
        String name = path.getName();
        context.write(new Text(name), value);
        // This sends the file name together with each line's content to the reducer
    }
}
The code that follows is the same as before; there is nothing special about it.
package com.damo01;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author 嗨皮騷
 * @version v 1.0
 * @date 2019/11/18
 */
public class MyReduce extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Iterate over the values grouped under the same key (the file name) and write them straight out
        for (Text value : values) {
            context.write(new Text(key.toString()), value);
        }
    }
}
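To make the result concrete: assuming, purely as an example, that the input directory contains two files named a.txt and b.txt, each output line pairs the source file name with the original line (TextOutputFormat separates key and value with a tab), roughly like this:
a.txt	hello world
a.txt	hello hadoop
b.txt	hello mapreduce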
The Driver code is also the same as before; nothing out of the ordinary.
package com.damo01;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * @author 嗨皮騷
 * @version v 1.0
 * @date 2019/11/18
 */
public class MyDriver extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        // Create a Job instance (takes a Configuration object and an arbitrary job name)
        Job job = Job.getInstance(new Configuration(), "demo01");
        // Set the input: a single file, or every file inside a directory; a local path is used here
        // (addInputPath may be called more than once to add several inputs)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("E:\\教學\\學習資料3\\11.18\\day24"));
        // Needed so the jar containing this class can be located and run on a cluster
        job.setJarByClass(MyDriver.class);
        // Configure the mapper
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Configure the reducer
        job.setReducerClass(MyReduce.class);
        // Configure the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the output directory for the results
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("E:\\教學\\學習資料3\\11.18\\day24\\結果"));
        // Submit the job, print progress information, and return a status code
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Run the MapReduce job through ToolRunner
        int run = ToolRunner.run(new MyDriver(), args);
        System.out.println(run == 0 ? "success" : "failure");
    }
}
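Because main() goes through ToolRunner, the job can be launched straight from the IDE with the local paths above, or packaged and submitted to a cluster. A sketch of a cluster submission, with a hypothetical jar name (the local E:\ paths would then need to be replaced with HDFS paths):
hadoop jar demo01-1.0-SNAPSHOT.jar com.damo01.MyDriver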
Hadoop POM dependencies:
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/junit/junit -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.testng</groupId>
        <artifactId>testng</artifactId>
        <version>RELEASE</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
                <!-- <verbal>true</verbal> -->
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <minimizeJar>true</minimizeJar>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
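With the maven-shade-plugin bound to the package phase as configured above, a single Maven command is enough to produce the runnable (shaded) jar:
mvn clean package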