Environment Setup
- JDK 1.8.0_212 (it must be 1.8, this is important!!!)
- Hadoop 3.1.2
- Windows 10
https://github.com/TheLastArcher/HDFS-Configuration
First download this archive and overwrite the bin and etc folders under hadoop-3.1.2 with the two folders inside it.
My hadoop-3.1.2 sits at D:/hadoop-3.1.2 (the location mainly matters for the environment-variable path configuration; it has little to do with bin/etc themselves).
Open cmd.
Type d: to switch to the D: drive.
Then run start-all.
You should get four terminal windows that stay open (NameNode, DataNode, ResourceManager, and NodeManager).
Run jps to confirm the four daemons are up.
Test it with the two web UIs: the NameNode UI at http://localhost:9870 and the YARN ResourceManager UI at http://localhost:8088 (the Hadoop 3.x defaults).
If anything goes wrong, just leave a comment below.
Next comes writing the Java code as a Maven project.
Download IDEA; it works quite well.
In IDEA create a new Maven project (File -> New -> Project -> Maven), click Next, fill in a project name, and finish.
With that, the project is created.
Open the project's pom.xml and add the Hadoop dependencies (typically hadoop-client and the related artifacts, all at version 3.1.2), then click Import Changes.
IDEA will then download all of the dependency files automatically; if it is the first time, this takes several minutes.
Once downloaded, later builds no longer need to fetch them again and they get indexed very quickly.
After that, the dependencies are all in place.
Next, take the two configuration files under D:\hadoop-3.1.2\etc\hadoop (core-site.xml and hdfs-site.xml) and put them into the project's src\main\resources directory.
A straight copy and paste is enough.
That is everything: the environment is ready and you can start writing Java code.
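Before writing the MapReduce job itself, it is worth confirming that the project can actually reach HDFS. Here is a minimal standalone check I am adding for that purpose (the class name HdfsCheck is my own; the address hdfs://localhost:9000 is taken from the configuration used throughout this post):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.net.URI;

public class HdfsCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Connect to the local pseudo-distributed HDFS started by start-all
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), conf);
        // List whatever is under the HDFS root to prove the connection works
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath());
        }
        fs.close();
    }
}

If this prints the HDFS root entries without an exception, the dependencies and the site configuration files are wired up correctly.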
Attached below is the code together with a tiny bit of the data. Note: the real data is a GB-scale txt file, so the sample is for reference only.
Data:
10001,2007-02-20 00:02:27,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:05:36,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:08:45,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:11:55,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:15:04,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:21:22,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:24:31,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:37:08,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:40:17,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:43:26,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:46:35,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:49:44,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:52:53,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:56:02,121.423167,31.165233, 7,116,3
10001,2007-02-20 00:59:11,121.423167,31.165233, 7,116,3
10001,2007-02-20 01:05:29,121.423167,31.165233, 7,116,3
10001,2007-02-20 01:08:38,121.423167,31.165233, 7,116,3
10001,2007-02-20 01:11:47,121.423167,31.165233, 7,116,3
10001,2007-02-20 01:14:57,121.423167,31.165233, 7,116,3
10001,2007-02-20 01:18:06,121.423167,31.165233, 7,116,3
10001,2007-02-20 01:21:15,121.423167,31.165233, 7,116,3
10001,2007-02-20 01:24:24,121.423167,31.165233, 7,116,3
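Each record above has exactly seven comma-separated fields, and the cleaning job below keeps only the first four (what appear to be an id, a timestamp, a longitude, and a latitude). A tiny illustration of that parsing on one of the sample lines (my own sketch, not from the original post):

public class ParseDemo {
    public static void main(String[] args) {
        String line = "10001,2007-02-20 00:02:27,121.423167,31.165233, 7,116,3";
        String[] split = line.split(",");
        // A valid record has exactly 7 fields; anything else is dropped by the mapper
        System.out.println("fields = " + split.length);   // fields = 7
        System.out.println("id = " + split[0]);            // id = 10001
        System.out.println("kept = " + split[1] + "," + split[2] + "," + split[3]);
    }
}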
In the Java code, both the input path and the output path must be written as full HDFS URIs like these:
hdfs://localhost:9000/data/input/
hdfs://localhost:9000/clear2/output2
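The job reads from /data/input, so the txt file has to be uploaded into HDFS first, for example with hdfs dfs -put, or from Java as in the rough sketch below (the local path D:/data/taxi.txt is just a placeholder for wherever your file really lives):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.net.URI;

public class UploadData {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), conf);
        // Create the input directory the job expects
        fs.mkdirs(new Path("/data/input"));
        // Copy the local GB-scale txt file into HDFS (local path is a placeholder)
        fs.copyFromLocalFile(new Path("D:/data/taxi.txt"), new Path("/data/input/"));
        fs.close();
    }
}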
Code:
package mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class DataClear {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Create the job and set its name
        Job job = Job.getInstance(conf, "Rec_Clear");
        job.setJarByClass(DataClear.class);
        // Mapper class
        job.setMapperClass(ClearMapper.class);
        // Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Reducer class and final output key/value types
        job.setReducerClass(ClearReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Input path
        FileInputFormat.addInputPath(job, new Path("hdfs://localhost:9000/data/input/"));
        // Output path (must not exist yet)
        String outputPath = "hdfs://localhost:9000/clear2/output2";
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        // Submit the job and exit with 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class ClearMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = line.split(",");
            // Keep only well-formed records with exactly 7 fields
            if (split.length == 7) {
                Text k = new Text(split[0]);
                // Emit id -> "timestamp,longitude,latitude"
                context.write(k, new Text(split[1] + "," + split[2] + "," + split[3]));
            }
        }
    }

    public static class ClearReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Concatenate every record of the same id, one per line
            StringBuilder str = new StringBuilder();
            for (Text x : values) {
                str.append(x.toString()).append("\n");
            }
            context.write(key, new Text(str.toString()));
        }
    }
}
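One practical note: FileOutputFormat refuses to start a job whose output directory already exists, so re-running DataClear will fail until hdfs://localhost:9000/clear2/output2 is removed. A minimal sketch of one way to clear it (the class name CleanOutput is my own; the same few lines could also go into main() before setOutputPath, or you can simply run hdfs dfs -rm -r /clear2/output2):

package mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.net.URI;

public class CleanOutput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), conf);
        Path out = new Path("/clear2/output2");
        // Recursively delete the old output directory if it exists,
        // so DataClear can be submitted again without failing
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        fs.close();
    }
}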
See you.