pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build file for the MongoDB <-> Hadoop MapReduce sample project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>MongMapReduce</groupId>
<artifactId>MongMapReduce</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Hadoop client API: InputFormat/OutputFormat/Job classes used below. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.4</version>
</dependency>
<!-- Official MongoDB Java driver: MongoClient, MongoCollection, Document. -->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.2.2</version>
</dependency>
<!-- Casbah (Scala driver wrapper); not referenced by the Java code shown here. -->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>casbah-core_2.11</artifactId>
<version>3.1.1</version>
</dependency>
</dependencies>
</project>
自定义的 InputFormat（Custom InputFormat — reads documents from MongoDB and partitions them into splits）:
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.*;
import org.bson.Document;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class MongoDBInputFormat extends InputFormat <LongWritable, Document>{
public MongoDBInputFormat(){
}
public static class MongoDBInputSplit extends InputSplit implements Writable{
private long start;
private long end;
public MongoDBInputSplit(){
}
public MongoDBInputSplit(long start, long end) {
this.start = start;
this.end = end;
}
public long getStart() {
return start;
}
public void setStart(int start) {
this.start = start;
}
public long getEnd() {
return end;
}
public void setEnd(int end) {
this.end = end;
}
public long getLength() throws IOException, InterruptedException {
return this.end-this.start;
}
public String[] getLocations() throws IOException, InterruptedException {
return new String[0];
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(start);
dataOutput.writeLong(end);
}
public void readFields(DataInput dataInput) throws IOException {
this.start=dataInput.readLong();
this.end=dataInput.readLong();
}
}
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
String uri = context.getConfiguration().get("input");
String[] datas = uri.split("://");
String dbsName = datas[1].split("\\.")[0];
String tableName = datas[1].split("\\.")[1];
List<InputSplit> list = new ArrayList<InputSplit>();
MongoClient client = new MongoClient(datas[0],27017);
MongoDatabase database = client.getDatabase(dbsName);
MongoCollection<Document> collection = database.getCollection(tableName);
long count = collection.count();
long chunk = 2;
long chunksize = (count/2);
//判断是否数据足够。
if(chunksize==0){
if(count!=0){
MongoDBInputSplit mi = new MongoDBInputSplit(0,count);
list.add(mi);
}else{
new Exception("没有数据");
}
}else{
//将数据进行切片,也就是一个map里面有一个切片,一个切片有上面定义的chunk = 2 条数据。
for(int i = 0;i<chunksize;i++){
MongoDBInputSplit mi = null;
if(i+1==chunksize){
mi = new MongoDBInputSplit(i*chunk,count);
list.add(mi);
}else{
mi = new MongoDBInputSplit(i*chunk,i*chunk+chunk);
list.add(mi);
}
}
}
//切片集合。
return list;
}
public static class MongoDBRecordReader extends RecordReader<LongWritable, Document>{
private MongoDBInputSplit split;
//从MongDb中查出来的结果集
private MongoCursor<Document> dbcursor;
//定义索引,每次都会被初始化成0,也就是只能读取自己切片中的 k,v
private int index;
private LongWritable k; //偏移量,再下面会自动封装成切片数据的开始,就会知道读多少行 ,对应map泛型的第一个值。
private Document v; //每次读到的结果,会通过返回出去,对应 map泛型的第二个。
//数据库的一些信息。
String ip;
String dbsName;
String tableName;
public MongoDBRecordReader(InputSplit split,TaskAttemptContext context) throws IOException, InterruptedException{
super();
initialize(split,context);
String uri = context.getConfiguration().get("input");
String[] datas = uri.split("://");
this.ip = datas[0];
this.dbsName = datas[1].split("\\.")[0];
this.tableName = datas[1].split("\\.")[1];
}
public MongoDBRecordReader(){
}
//初始化,将一些对象new出来,并把得到的切片(1个)强转。
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
this.split = (MongoDBInputSplit)split;
this.k = new LongWritable();
v = new Document();
}
//读取数据,并把数据封装到当前MongoDBRecordReader的k v中。
public boolean nextKeyValue() throws IOException, InterruptedException {
//判断dbcursor是否为null
if(this.dbcursor == null){
//获取dbcursor的值
// 获取集合
MongoClient client = new MongoClient(ip,27017);
MongoDatabase database = client.getDatabase(dbsName);
MongoCollection<Document> collection = database.getCollection(tableName);
//获取游标
dbcursor = collection.find().skip((int) this.split.start).limit((int) this.split.getLength()).iterator();
}
//操作游标
boolean hasNext = this.dbcursor.hasNext();
if(hasNext){
//获取游标的下一个值
Document next = this.dbcursor.next();
//下一个的key
this.k.set(this.split.start+index);
index ++;
//下一个value
this.v = next;
}
return hasNext;
}
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return this.k;
}
public Document getCurrentValue() throws IOException, InterruptedException {
return this.v;
}
public float getProgress() throws IOException, InterruptedException {
return 0;
}
public void close() throws IOException {
}
}
public RecordReader<LongWritable, Document> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
return new MongoDBRecordReader(split,context);
}
}
自定义的 OutputFormat（Custom OutputFormat — writes the reduce output back into MongoDB）:
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.bson.Document;
import java.io.IOException;
public class MongoDBOutputFormat <K,V> extends OutputFormat<K,V> {
public static class MongoDBRecordWriter<K,V> extends RecordWriter<K, V>{
public MongoCollection<Document> collection = null;
public MongoDBRecordWriter(){
}
public MongoDBRecordWriter(TaskAttemptContext context){
//获取mongodb的连接
String uri = context.getConfiguration().get("output");
String[] datas = uri.split("://");
String ip = datas[0];
String dbsName = datas[1].split("\\.")[0];
String tableName = datas[1].split("\\.")[1];
MongoClient client = new MongoClient(ip,27017);
collection = client.getDatabase(dbsName).getCollection(tableName);
}
public void write(K key, V value) throws IOException, InterruptedException {
collection.insertOne(new Document(key.toString(),value.toString()));
}
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
}
}
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
return new MongoDBRecordWriter<K, V>(context);
}
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
}
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
return new FileOutputCommitter(null, context);
}
}
自定义的 Driver 类（Driver class — wires the mapper/reducer to the custom MongoDB formats and submits the job）:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.bson.Document;
import java.io.IOException;
/**
 * Word-count job over a MongoDB collection: reads documents through
 * MongoDBInputFormat, counts occurrences of each document's "str" field value,
 * and writes the totals back through MongoDBOutputFormat.
 */
public class WordCount {

    public static class MyMapper extends Mapper<LongWritable, Document, Text, IntWritable> {
        // Reused output objects to avoid per-record allocation.
        IntWritable iw = new IntWritable(1);
        Text text = new Text();

        @Override
        protected void map(LongWritable key, Document value, Context context) throws IOException, InterruptedException {
            // Each input document is expected to carry the word under "str".
            Object field = value.get("str");
            if (field == null) {
                // FIX: documents without "str" previously caused Text.set(null)
                // to throw an NPE and fail the whole task; skip them instead.
                return;
            }
            // FIX: toString() instead of a blind (String) cast, so non-string
            // field values no longer throw ClassCastException.
            text.set(field.toString());
            context.write(text, iw);
        }
    }

    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable iw = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the per-occurrence counts emitted by the mapper.
            int num = 0;
            for (IntWritable value : values) {
                num += value.get();
            }
            iw.set(num);
            context.write(key, iw);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Source/sink addresses in the custom "host://db.collection" form
        // parsed by MongoDBInputFormat / MongoDBOutputFormat.
        conf.set("input", "localhost://db1.in1");
        conf.set("output", "localhost://db1.out10");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        // Mapper / reducer implementations for this job.
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Use the custom MongoDB formats instead of HDFS files.
        job.setInputFormatClass(MongoDBInputFormat.class);
        job.setOutputFormatClass(MongoDBOutputFormat.class);
        // Submit and wait; exit 0 on success, 1 on failure.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}