MapReduce編程案例
- 用
mapreduce
解決問題的關鍵是確定key
,只有key
相同的結果纔會到同一個reduce
中進行處理 - 默認分區使用
HashPartitioner
,hashCode%reduceNum
所以分區個數與reduce
個數有關,但是可以自定義Partitioner
- 沒有
reduce
就沒有shuffle
過程了,數據進入mapper
處理後會直接輸出,不再進行分區及之後的操作
reduce端join算法實現
- 需求:
訂單數據表t_order
id | date | pid | amount |
---|---|---|---|
1001 | 20191210 | P0001 | 2 |
1002 | 20191210 | P0001 | 3 |
1002 | 20191210 | P0002 | 3 |
商品信息表t_product
id | pname | category_id | price |
---|---|---|---|
1001 | 小米8 | 2000 | 2 |
1002 | apple X | 8000 | 3 |
假如數據量巨大,兩表的數據是以文件的形式存儲在HDFS中,需要用mapreduce
程序來實現以下SQL
查詢運算:
select a.id,a.date,b.name,b.category_id,b.price from t_order a join t_product b on a.pid = b.id
- 實現機制:
通過將關聯的條件作爲map
輸出的key
,將兩表滿足join
條件的數據並攜帶數據所來源的文件信息,發往同一個reduce task
,在reduce
中進行數據的串聯
關聯後結果集
/**
* 關聯後結果集
* @author lxf
* @version v1.0
* @date 2018/4/9 10:56
*/
public class JoinBean implements Writable {
//order
private int orderId;
private String date;
private String pid;
private int amount;
//product
private String productId;
private String name;
private String category_id;
private float price;
public JoinBean() {
}
public void set(int id, String date, String pid, int amount, String productId,String name, String category_id, float price) {
this.orderId = id;
this.date = date;
this.pid = pid;
this.amount = amount;
this.productId = productId;
this.name = name;
this.category_id = category_id;
this.price = price;
}
public int getOrderId() {
return orderId;
}
public void setOrderId(int orderId) {
this.orderId = orderId;
}
public String getDate() {
return date;
}
public void setDate(String date) {
this.date = date;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getCategory_id() {
return category_id;
}
public void setCategory_id(String category_id) {
this.category_id = category_id;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public String getProductId() {
return productId;
}
public void setProductId(String productId) {
this.productId = productId;
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(orderId);
dataOutput.writeUTF(date);
dataOutput.writeUTF(pid);
dataOutput.writeInt(amount);
dataOutput.writeUTF(productId);
dataOutput.writeUTF(name);
dataOutput.writeUTF(category_id);
dataOutput.writeFloat(price);
}
public void readFields(DataInput dataInput) throws IOException {
//order
orderId = dataInput.readInt();
date = dataInput.readUTF();
pid = dataInput.readUTF();
amount = dataInput.readInt();
//product
productId = dataInput.readUTF();
name = dataInput.readUTF();
category_id = dataInput.readUTF();
price = dataInput.readFloat();
}
@Override
public String toString() {
return
"orderId=" + orderId +
", date='" + date + '\'' +
", pid='" + pid + '\'' +
", amount=" + amount +
", productId=" + productId +
", name='" + name + '\'' +
", category_id='" + category_id + '\'' +
", price=" + price;
}
}
訂單Bean
/**
 * One row of the order table (id, date, pid, amount). Used only as a map
 * output value, never as a key, so plain Writable is enough — no
 * WritableComparable needed.
 *
 * @author lxf
 * @version v1.0
 * @date 2018/4/9 10:07
 */
public class OrderBean implements Writable {
    private int id;
    private String date;
    private String pid;
    private int amount;

    /** Required by the Writable deserialization machinery. */
    public OrderBean() {
    }

    /** Fills all fields in one call so one instance can be recycled per record. */
    public void set(int id, String date, String pid, int amount) {
        this.id = id;
        this.date = date;
        this.pid = pid;
        this.amount = amount;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(id);
        dataOutput.writeUTF(date);
        dataOutput.writeUTF(pid);
        dataOutput.writeInt(amount);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Must mirror write() exactly, field for field.
        id = dataInput.readInt();
        date = dataInput.readUTF();
        pid = dataInput.readUTF();
        amount = dataInput.readInt();
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("id=").append(id);
        sb.append(", date='").append(date).append('\'');
        sb.append(", pid='").append(pid).append('\'');
        sb.append(", amount=").append(amount);
        return sb.toString();
    }
}
產品Bean
/**
* 產品Bean
* @author lxf
* @version v1.0
* @date 2018/4/9 10:07
*/
public class ProductBean implements Writable{
private String id;
private String name;
private String category_id;
private float price;
public ProductBean() {
}
public void set(String id, String name, String category_id, float price) {
this.id = id;
this.name = name;
this.category_id = category_id;
this.price = price;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getCategory_id() {
return category_id;
}
public void setCategory_id(String category_id) {
this.category_id = category_id;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(id);
dataOutput.writeUTF(name);
dataOutput.writeUTF(category_id);
dataOutput.writeFloat(price);
}
public void readFields(DataInput dataInput) throws IOException {
id = dataInput.readUTF();
name = dataInput.readUTF();
category_id = dataInput.readUTF();
price = dataInput.readFloat();
}
@Override
public String toString() {
return "id=" + id +
", name='" + name + '\'' +
", category_id='" + category_id + '\'' +
", price=" + price;
}
}
Mapper
/**
 * Reduce-side join mapper: decides by input file name whether a line is an
 * order or a product record, wraps it in an ObjectWritable tagged with its
 * type, and emits it keyed by product id — so matching orders and products
 * meet in the same reducer.
 *
 * @author lxf
 * @version v1.0
 * @date 2018/4/9 10:06
 */
public class RJoinMapper extends Mapper<LongWritable, Text, Text, ObjectWritable> {
    // Reused across map() calls to avoid per-record allocation.
    private ProductBean productBean = new ProductBean();
    private OrderBean orderBean = new OrderBean();
    private ObjectWritable objectWritable = new ObjectWritable();
    private Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // The input split knows which file this record came from.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();
        String[] fields = line.split(",");
        String pid;
        // Classify the record by its source file name.
        if (name.startsWith("product")) {
            // product line: id,pname,category_id,price — join key is column 0
            pid = fields[0];
            productBean.set(pid, fields[1], fields[2], Float.parseFloat(fields[3]));
            objectWritable.set(productBean);
        } else {
            // order line: id,date,pid,amount — join key is column 2
            pid = fields[2];
            orderBean.set(Integer.parseInt(fields[0]), fields[1], pid, Integer.parseInt(fields[3]));
            objectWritable.set(orderBean);
        }
        k.set(pid);
        context.write(k, objectWritable);
    }
}
Reducer
/**
 * Reduce-side join: the join key (pid) brings every order and the single
 * product record for that pid into one reduce() call, where they are stitched
 * into JoinBean output records.
 *
 * NOTE: if one product has far more orders than the others, its reducer
 * becomes a hot spot while the rest sit idle (data skew). The remedy is a
 * map-side join: preload the product table into memory and join in map(),
 * skipping the shuffle entirely.
 *
 * @author lxf
 * @version v1.0
 * @date 2018/4/9 10:06
 */
public class RJoinReducer extends Reducer<Text,ObjectWritable,JoinBean,NullWritable>{
// Reused across reduce() calls; overwritten for each key group.
ProductBean productBean = new ProductBean();
JoinBean joinBean = new JoinBean();
@Override
protected void reduce(Text key, Iterable<ObjectWritable> values, Context context) throws IOException, InterruptedException {
ArrayList<OrderBean> orderBeans = new ArrayList<OrderBean>();
// Separate the orders from the single product record in this group.
for (ObjectWritable value : values) {
try {
Object obj = value.get();
if (obj instanceof ProductBean) {
// Copy fields out into our own bean (the iterator's value object
// may be reused by the framework between iterations — hence the copy).
BeanUtils.copyProperties(productBean, obj);
} else {
// Orders must each be copied into a fresh bean before being collected.
OrderBean orderBean = new OrderBean();
BeanUtils.copyProperties(orderBean,obj);
orderBeans.add(orderBean);
}
} catch (Exception e) {
e.printStackTrace();
}
}
// Join step: emit one combined record per order, enriched with product fields.
for (OrderBean orderBean : orderBeans) {
joinBean.set(orderBean.getId(),orderBean.getDate(),orderBean.getPid(),orderBean.getAmount(),
productBean.getId(),productBean.getName(),productBean.getCategory_id(),productBean.getPrice());
context.write(joinBean,NullWritable.get());
}
}
}
Driver
/**
 * Driver for the reduce-side join job: orders and products are tagged in the
 * mapper, grouped by product id, and stitched together in the reducer.
 *
 * @author lxf
 * @version v1.0
 * @date 2018/4/9 10:04
 */
public class RJoinDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Locate the job jar from the driver class rather than a hard-coded
        // local path, so the job can be submitted from any machine
        // (consistent with the other drivers in this project).
        job.setJarByClass(RJoinDriver.class);
        // Mapper/Reducer classes for this job.
        job.setMapperClass(RJoinMapper.class);
        job.setReducerClass(RJoinReducer.class);
        // Mapper output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObjectWritable.class);
        // Final (reducer) output key/value types.
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Input directory with the raw order/product files.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output directory for the joined result.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and block until completion; exit code reflects job status.
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
- 缺點:這種方式中,
join
的操作是在reduce
階段完成,reduce
端的處理壓力太大,map
節點的運算負載則很低,資源利用率不高,且在reduce
階段極易產生數據傾斜 - 解決方案:
map
端join
實現方式
map端join算法實現
- 原理闡述
適用於關聯表中有小表的情形;
可以將小表分發到所有的map
節點,這樣,map
節點就可以在本地對自己所讀到的大表數據進行join
並輸出最終結果,可以大大提高join
操作的併發度,加快處理速度 - 實現示例
–先在mapper
類中預先定義好小表,並用distributedcache
機制將小表的數據分發到每一個maptask
執行節點,從而每一個maptask
節點可以從本地加載到小表的數據,進而在本地即可實現join
進行join
–引入實際場景中的解決方案:一次加載數據庫或者用distributedcache
distributedcache
分佈式緩存
hadoop
提供distributedcache
分佈式緩存,可以方便的把所需要的數據加載到分佈式計算框架中
有了分佈式緩存,程序如果依賴第三方jar
有兩種解決方法
- 法1. 把所依賴的包打到一個包中,但此時文件會比較大
- 法2. 使用分佈式緩存
job.addArchiveToClassPath(Path archive);
指定所依賴jar包的路徑(本地或hdfs)
job.addArchiveToClassPath(archive); // 緩存jar包到task運行節點的classpath中,archive爲jar包路徑,可以爲本地路徑,也可以是hdfs路徑
job.addFileToClassPath(file); // 緩存普通文件到task運行節點的classpath中
job.addCacheArchive(uri); // 緩存壓縮包文件到task運行節點的工作目錄
job.addCacheFile(uri) // 緩存普通文件到task運行節點的工作目錄
修改後的Mapper
/**
 * Map-side join to avoid reduce-side data skew: the small product table is
 * shipped to every map task via the distributed cache and loaded fully into
 * memory, so each order line is joined directly in map() — no shuffle needed.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/10 11:20
 */
public class MapSideJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** In-memory copy of the product table: pid -> product name. */
    private HashMap<String, String> productInfoMap = new HashMap<String, String>();
    Text k = new Text();

    /**
     * setup() runs once per map task before any map() call — used here to
     * load the cached "product.txt" from the task's working directory.
     *
     * Fix: read until EOF (readLine() == null). The original loop used
     * StringUtils.isNotEmpty and therefore stopped at the first empty line,
     * silently dropping the rest of the table.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("product.txt")));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue; // tolerate blank lines instead of aborting the load
                }
                String[] fields = line.split(",");
                // line format: pid,pname
                productInfoMap.put(fields[0], fields[1]);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Joins each order line against the in-memory product table.
     * Note: an unknown pid appends the literal "null" as the product name,
     * matching the original behavior.
     *
     * @param key     byte offset (unused)
     * @param value   one order line: id,date,pid,amount
     * @param context output collector
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String orderLine = value.toString();
        String[] orderFields = orderLine.split(",");
        String productName = productInfoMap.get(orderFields[2]);
        k.set(orderLine + "," + productName);
        context.write(k, NullWritable.get());
    }
}
驅動程序
public class MapSideJoinDriver {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MapSideJoinDriver.class);
// job.setJar("D:/SoftDatas/IdeaProjects/hadoop_itcast/out/artifacts/mapSideJoin_jar/mapSideJoin.jar");
job.setMapperClass(MapSideJoinMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job,new Path("D:/localHadoop/data/mapSideJoin/input"));
FileOutputFormat.setOutputPath(job,new Path("D:/localHadoop/data/mapSideJoin/output"));
// 指定需要緩存一個文件到所有的maptask運行節點工作目錄
/* job.addArchiveToClassPath(archive); */ // 緩存jar包到task運行節點的classpath中
/* job.addFileToClassPath(file); */ // 緩存普通文件到task運行節點的classpath中
/* job.addCacheArchive(uri); */ // 緩存壓縮包文件到task運行節點的工作目錄
/* job.addCacheFile(uri) */ // 緩存普通文件到task運行節點的工作目錄
job.addCacheFile(new URI("file:///D:/localHadoop/data/mapSideJoin/cache/product.txt"));
//map端join的邏輯不需要reduce階段,設置reducetask數量爲0,否則默認有一個reducer
job.setNumReduceTasks(0);
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
倒排索引建立
需求:有大量的文本(文檔、網頁),需要建立搜索索引,計算每個單詞在每個文件中的數量
a.txt
hello tom
hello jerry
hello tom
b.txt
hello jerry
hello jerry
tom jerry
c.txt
hello jerry
hello tom
rs.txt結果表
hello a.txt-->3 b.txt-->2 c.txt-->2
jerry a.txt-->1 b.txt-->3 c.txt-->1
tom a.txt-->2 b.txt-->1 c.txt-->1
分析:
數據不能通過一個mapreduce
解決,可以通過多個mapreduce
串連計算
- 先以
單詞-文件名
爲key,計算出每個單詞在每個文件中出現的個數<單詞-文件名,num>
- 再以單詞爲
key
計算每個單詞在各個文件中出現的次數hello a.txt-->3 b.txt-->2 c.txt-->2
實現
第一步:程序代碼
/**
 * Inverted-index step one: for every word occurrence, emits
 * ("word--filename", 1), so the reducer can count per-file frequencies.
 */
public class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The input split identifies the source file, which becomes part of the key.
        String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
        for (String word : value.toString().split(" ")) {
            outKey.set(word + "--" + fileName);
            context.write(outKey, one);
        }
    }
}
/**
 * Inverted-index step one reducer: sums the 1s for each "word--filename"
 * key, producing the per-file occurrence count of each word.
 */
public class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private final IntWritable total = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable partial : values) {
            sum += partial.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}
/**
 * Driver for step one of the inverted-index build.
 *
 * Produces intermediate output of the form (word--file TAB count):
 *   hello--a.txt  3
 *   hello--b.txt  2
 *   hello--c.txt  2
 *   jerry--a.txt  1
 *   ...
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/10 14:55
 */
public class IndexStepOneDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(IndexStepOneDriver.class);
        job.setMapperClass(IndexStepOneMapper.class);
        job.setReducerClass(IndexStepOneReducer.class);
        // Mapper and reducer share the same output types, so one pair of
        // declarations covers both.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:/localHadoop/data/indexStepOne/input"));
        FileOutputFormat.setOutputPath(job, new Path("D:/localHadoop/data/indexStepOne/output"));
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
第二步:程序代碼
/**
 * Inverted-index step two mapper. Input lines are step-one output, e.g.
 *   hello--a.txt  3
 * Splits on "--" and re-keys by the bare word, keeping "file TAB count"
 * as the value so the reducer can concatenate per-file counts.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/10 15:20
 */
public class IndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Text word = new Text();
    private final Text fileAndCount = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // parts[0] = word, parts[1] = "filename\tcount"
        String[] parts = value.toString().split("--");
        word.set(parts[0]);
        fileAndCount.set(parts[1]);
        context.write(word, fileAndCount);
    }
}
/**
 * Inverted-index step two reducer: concatenates every "file count" entry of
 * a word into one space-separated line, e.g.
 *   hello  a.txt 3 b.txt 2 c.txt 2
 */
public class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {

    private final Text joined = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text entry : values) {
            sb.append(entry).append(" ");
        }
        joined.set(sb.toString());
        context.write(key, joined);
    }
}
/**
 * Driver for step two of the inverted-index build: turns the step-one
 * intermediate output into the final index, e.g.
 *   hello  a.txt 3 b.txt 2 c.txt 2
 *
 * Note the input directory is step one's output directory.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/10 15:18
 */
public class IndexStepTwoDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(IndexStepTwoDriver.class);
        job.setMapperClass(IndexStepTwoMapper.class);
        job.setReducerClass(IndexStepTwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Chained job: consume the output of IndexStepOneDriver.
        FileInputFormat.setInputPaths(job, new Path("D:/localHadoop/data/indexStepOne/output"));
        FileOutputFormat.setOutputPath(job, new Path("D:/localHadoop/data/indexStepTwo/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
社交粉絲數據分析
以下是qq的好友列表數據,冒號前是一個用戶,冒號後是該用戶的所有好友(數據中的好友關係是單向的),求出哪些人兩兩之間有共同好友,及他倆的共同好友都有誰?
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
分析:
- 先求出A/B/C…都是那些人的共同好友
A:B C D
B:A
C:A B
D:A C
- 將好友所屬的人兩兩合併,即可得兩兩共同好友
B C: A
B D: A
C D: A
僞代碼分析
第一步
map
讀一行 A:B,C,D,F,E,O
輸出 <B,A><C,A><D,A><F,A><E,A><O,A>
再讀一行 B:A,C,E,K
輸出 <A,B><C,B><E,B><K,B>
reduce
拿到的數據比如<C,A><C,B><C,E><C,F><C,G>......
輸出:
<A-B,C>
<A-E,C>
<A-F,C>
<A-G,C>
<B-E,C>
<B-F,C>....
第二步
map
讀入一行<A-B,C>
直接輸出<A-B,C>
reduce
讀入數據 <A-B,C><A-B,F><A-B,G>.......
輸出: A-B C,F,G,.....
StepOne
/**
 * Step one mapper: inverts the friendship list. For an input line
 * "A:B,C,D" it emits (B,A), (C,A), (D,A) — key = friend, value = the
 * person whose list contains that friend.
 */
public class ShareFriendsStepOneMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Text friendKey = new Text();
    private final Text personValue = new Text();

    /**
     * @param key     byte offset (unused)
     * @param value   one line such as "A:B,C,D,F,E,O"
     * @param context output collector
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] parts = value.toString().split(":");
        personValue.set(parts[0]);
        for (String friend : parts[1].split(",")) {
            friendKey.set(friend);
            // emit <friend, person>
            context.write(friendKey, personValue);
        }
    }
}
/**
 * Step one reducer: collects everyone who has a given friend, producing
 * lines of the form <friend TAB person,person,person,>.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/10 16:28
 */
public class ShareFriendsStepOneReducer extends Reducer<Text, Text, Text, Text> {

    private final Text outKey = new Text();
    private final Text outValue = new Text();

    @Override
    protected void reduce(Text friend, Iterable<Text> persons, Context context) throws IOException, InterruptedException {
        StringBuilder owners = new StringBuilder();
        for (Text person : persons) {
            owners.append(person).append(",");
        }
        outKey.set(friend);
        outValue.set(owners.toString());
        context.write(outKey, outValue);
    }
}
/**
 * Driver for step one of the common-friends analysis:
 * produces <friend TAB person,person,...> lines.
 */
public class ShareFriendsStepOneDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(ShareFriendsStepOneDriver.class);
        job.setMapperClass(ShareFriendsStepOneMapper.class);
        job.setReducerClass(ShareFriendsStepOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("D:/localHadoop/data/shareFriendsStepOne/input"));
        FileOutputFormat.setOutputPath(job, new Path("D:/localHadoop/data/shareFriendsStepOne/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
StepTwo
/**
 * Step two mapper. Input is step-one output (tab separated):
 *   friend \t person,person,person,
 * e.g. "A\tI,K,C,B,G,F,H,O,D,"
 *
 * Emits <person1-person2, friend> for every unordered pair of persons who
 * share that friend, so each pair's common friends all reach one reducer.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/10 16:29
 */
public class ShareFriendsStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {
    Text k = new Text();
    Text v = new Text();

    /**
     * @param key     byte offset (unused)
     * @param value   one "friend \t person,person,..." line
     * @param context output collector
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] friendPersons = line.split("\t");
        String friend = friendPersons[0];
        String[] persons = friendPersons[1].split(",");
        // Sort so the pair (A,B) is always emitted as "A-B" and never "B-A";
        // otherwise the same pair would be split across different reducers.
        Arrays.sort(persons);
        v.set(friend);
        // Fix: the original bounds (i < length - 2, j < length - 1) skipped
        // every pair involving the last person in the sorted list.
        for (int i = 0; i < persons.length - 1; i++) {
            for (int j = i + 1; j < persons.length; j++) {
                // emit <person-person, friend>
                k.set(persons[i] + "-" + persons[j]);
                context.write(k, v);
            }
        }
    }
}
/**
 * Step two reducer: concatenates all common friends of each person pair.
 * Input:  <"A-B", friend> ...
 * Output: "A-B" -> "C,F,G,..."
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/10 16:29
 */
public class ShareFriendsStepTwoReducer extends Reducer<Text, Text, Text, Text> {

    private final Text outKey = new Text();
    private final Text outValue = new Text();

    @Override
    protected void reduce(Text personPerson, Iterable<Text> friends, Context context) throws IOException, InterruptedException {
        StringBuilder commonFriends = new StringBuilder();
        for (Text friend : friends) {
            commonFriends.append(friend).append(",");
        }
        outKey.set(personPerson);
        outValue.set(commonFriends.toString());
        context.write(outKey, outValue);
    }
}
/**
 * Driver for step two of the common-friends analysis: reads step one's
 * output and produces "person-person TAB friend,friend,..." lines.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/10 16:28
 */
public class ShareFriendsStepTwoDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(ShareFriendsStepTwoDriver.class);
        job.setMapperClass(ShareFriendsStepTwoMapper.class);
        job.setReducerClass(ShareFriendsStepTwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Chained job: consume the output of ShareFriendsStepOneDriver.
        FileInputFormat.setInputPaths(job, new Path("D:/localHadoop/data/shareFriendsStepOne/output"));
        FileOutputFormat.setOutputPath(job, new Path("D:/localHadoop/data/shareFriendsStepTwo/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
web日誌預處理
# ip timestamp method refferr stateCode bytes os
194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
101.226.68.137 - - [18/Sep/2013:06:49:45 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
60.208.6.156 - - [18/Sep/2013:06:49:48 +0000] "GET /wp-content/uploads/2013/07/rcassandra.png HTTP/1.0" 200 185524 "http://cos.name/category/software/packages/" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/A00n" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
- 需求:
對web訪問日誌中的各字段識別切分
去除日誌中不合法的記錄
根據KPI統計需求,生成各類訪問請求過濾數據 - 實現代碼:
a) 定義一個bean,用來記錄日誌數據中的各數據字段
/**
 * One parsed web-access-log record. All fields are kept as the raw strings
 * produced by WebLogParser; toString() re-serializes the record with '\001'
 * separators for downstream consumption.
 */
public class WebLogBean {
private String remote_addr;// client IP address
private String remote_user;// client user name; "-" when absent
private String time_local;// access time and time zone
private String request;// requested URL and HTTP protocol
private String status;// response status code; 200 means success
private String body_bytes_sent;// size of the response body sent to the client
private String http_referer;// page the request was linked from
private String http_user_agent;// client browser information
private boolean valid = true;// whether this record is well-formed and non-error
public String getRemote_addr() {
return remote_addr;
}
public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}
public String getRemote_user() {
return remote_user;
}
public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}
public String getTime_local() {
return time_local;
}
public void setTime_local(String time_local) {
this.time_local = time_local;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getBody_bytes_sent() {
return body_bytes_sent;
}
public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}
public String getHttp_referer() {
return http_referer;
}
public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}
public String getHttp_user_agent() {
return http_user_agent;
}
public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}
public boolean isValid() {
return valid;
}
public void setValid(boolean valid) {
this.valid = valid;
}
/**
 * Serializes the record using the '\001' control character as the field
 * separator, which cannot collide with characters in the field content.
 * Field order: valid, remote_addr, remote_user, time_local, request,
 * status, body_bytes_sent, http_referer, http_user_agent.
 */
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(this.valid);
sb.append("\001").append(this.remote_addr);
sb.append("\001").append(this.remote_user);
sb.append("\001").append(this.time_local);
sb.append("\001").append(this.request);
sb.append("\001").append(this.status);
sb.append("\001").append(this.body_bytes_sent);
sb.append("\001").append(this.http_referer);
sb.append("\001").append(this.http_user_agent);
return sb.toString();
}
}
b)定義一個parser用來解析過濾web訪問日誌原始記錄
/**
 * Parses one raw access-log line into a WebLogBean, flagging records that
 * are malformed (too few fields) or HTTP errors (status >= 400) as invalid.
 */
public class WebLogParser {

    /**
     * @param line one raw, space-separated access-log line
     * @return a populated WebLogBean; {@code isValid()} is false for
     *         malformed or error records
     */
    public static WebLogBean parser(String line) {
        WebLogBean webLogBean = new WebLogBean();
        String[] arr = line.split(" ");
        if (arr.length > 11) {
            webLogBean.setRemote_addr(arr[0]);
            webLogBean.setRemote_user(arr[1]);
            // strip the leading '[' from "[18/Sep/2013:06:49:18"
            webLogBean.setTime_local(arr[3].substring(1));
            webLogBean.setRequest(arr[6]);
            webLogBean.setStatus(arr[8]);
            webLogBean.setBody_bytes_sent(arr[9]);
            webLogBean.setHttp_referer(arr[10]);
            // the user-agent field may itself contain a space; keep up to two tokens
            if (arr.length > 12) {
                webLogBean.setHttp_user_agent(arr[11] + " " + arr[12]);
            } else {
                webLogBean.setHttp_user_agent(arr[11]);
            }
            if (Integer.parseInt(webLogBean.getStatus()) >= 400) {
                // status >= 400 is an HTTP error
                webLogBean.setValid(false);
            }
        } else {
            webLogBean.setValid(false);
        }
        return webLogBean;
    }

    /**
     * Normalizes "18/Sep/2013" style dates to "18-Sep-2013".
     * Fix: String.replace returns a new string — the original code discarded
     * the result and returned the input unchanged.
     */
    public static String parserTime(String time) {
        return time.replace("/", "-");
    }
}
c) mapreduce程序
/**
 * Pre-processes raw web access logs: parses each line with WebLogParser,
 * drops invalid records, and writes the cleaned, '\001'-delimited records.
 * Map-only output (no reducer class configured).
 */
public class WeblogPreProcess {

    /** Parses each line and emits only valid records (as their serialized form). */
    static class WeblogPreProcessMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        Text k = new Text();
        NullWritable v = NullWritable.get();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            WebLogBean webLogBean = WebLogParser.parser(line);
            // Skip records the parser flagged as invalid (malformed or HTTP >= 400).
            if (!webLogBean.isValid())
                return;
            k.set(webLogBean.toString());
            context.write(k, v);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WeblogPreProcess.class);
        job.setMapperClass(WeblogPreProcessMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Fix: propagate job success/failure through the exit code instead of
        // discarding the waitForCompletion result (consistent with the other
        // drivers in this project).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
自定義inputFormat
無論hdfs
還是mapreduce
,對於小文件都有損效率,實踐中,又難免面臨處理大量小文件的場景,此時,就需要有相應解決方案
小文件的優化無非以下幾種方式:
- 在數據採集的時候,就將小文件或小批數據合成大文件再上傳
HDFS
- 在業務處理之前,在
HDFS
上使用mapreduce
程序(集羣方式)對小文件進行合併 - 在
mapreduce
處理時,可採用combineInputFormat
提高效率
實現
本節實現的是上述第二種方式
程序的核心機制:
- 自定義一個
InputFormat
,設置切片規則,使每個小文件不可分片,避免每個文件使用一個mapper
,這樣可以使多個文件使用同一個mapper
去合併數據 - 改寫
RecordReader
,實現一次讀取一個完整文件封裝爲KV
(文件名,文件內容),可以確保合併後的文件有序,並且可以方便找到原始文件內容 - 在輸出時使用
SequenceFileOutPutFormat
輸出合併文件
代碼如下:
自定義InputFromat
/**
 * InputFormat that treats every file as a single indivisible record, so each
 * small file becomes exactly one key-value pair for the mapper.
 */
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    /** Never split: one file -> one split -> one record. */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException,
            InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(split, context);
        return reader;
    }
}
自定義RecordReader
/**
 * RecordReader that delivers one record per file:
 * key = NullWritable, value = the file's full contents as BytesWritable.
 *
 * Core contract: nextKeyValue() builds the next pair; getCurrentKey() and
 * getCurrentValue() return what was built.
 *
 * @author lxf
 */
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;
    private Configuration conf;
    private BytesWritable value = new BytesWritable();
    // true once this split's single record has been produced
    private boolean processed = false;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // One split is exactly one file, hence exactly one record.
        if (processed) {
            return false;
        }
        Path file = fileSplit.getPath();
        byte[] contents = new byte[(int) fileSplit.getLength()];
        FSDataInputStream in = null;
        try {
            in = file.getFileSystem(conf).open(file);
            IOUtils.readFully(in, contents, 0, contents.length);
            value.set(contents, 0, contents.length);
        } finally {
            IOUtils.closeStream(in);
        }
        processed = true;
        return true;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException,
            InterruptedException {
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException,
            InterruptedException {
        return value;
    }

    /** Progress is all-or-nothing: 0.0 before the record, 1.0 after. */
    @Override
    public float getProgress() throws IOException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // nothing to release — the stream is closed inside nextKeyValue()
    }
}
定義mapreduce
處理流程
/**
 * Emits (source file path, file bytes) for every whole-file record, feeding
 * the SequenceFile output that merges many small files into one.
 */
public class SequenceFileMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

    private Text filenameKey;

    @Override
    protected void setup(Context context) throws IOException,
            InterruptedException {
        // One split == one file here, so the key can be computed once per task.
        Path path = ((FileSplit) context.getInputSplit()).getPath();
        filenameKey = new Text(path.toString());
    }

    @Override
    protected void map(NullWritable key, BytesWritable value,
                       Context context) throws IOException, InterruptedException {
        context.write(filenameKey, value);
    }
}
/**
 * Packs many small files into one SequenceFile: key = original file path,
 * value = file contents. WholeFileInputFormat makes each file one record;
 * the single default reducer merges all records into one output file.
 */
public class SmallFilesToSequenceFileConverterDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "combine small files to sequencefile");
        job.setJarByClass(SmallFilesToSequenceFileConverterDriver.class);
        // Each input file becomes exactly one (NullWritable, BytesWritable) record.
        job.setInputFormatClass(WholeFileInputFormat.class);
        // Write a binary SequenceFile instead of plain text.
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        // No reducer class set: the one default reducer funnels every record
        // into a single output file.
        job.setMapperClass(SequenceFileMapper.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * ToolRunner entry point.
     * Fix: fall back to the local test paths only when no paths were given,
     * instead of unconditionally discarding the command-line arguments.
     *
     * @param args input directory, output directory
     * @throws Exception on job-submission failure
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            args = new String[]{"D:/localHadoop/data/combineFile/input", "D:/localHadoop/data/combineFile/output"};
        }
        int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverterDriver(),
                args);
        System.exit(exitCode);
    }
}
自定義outputFormat
現有一些原始日誌需要做增強解析處理,流程:
- 從原始日誌文件中讀取數據
- 根據日誌中的一個
URL
字段到外部知識庫中獲取信息增強到原始日誌 - 如果成功增強,則輸出到增強結果目錄;如果增強失敗,則抽取原始數據中
URL
字段輸出到待爬清單目錄
程序的關鍵點是要在一個mapreduce
程序中根據數據的不同輸出兩類結果到不同目錄,這類靈活的輸出需求可以通過自定義outputformat
來實現
實現
實現要點:
- 在
mapreduce
中訪問外部資源 - 自定義
outputformat
,改寫其中的recordwriter
,改寫具體輸出數據的方法write()
代碼實現如下:
數據庫獲取數據的工具
/**
 * Loads the url -> content enhancement rules from a local MySQL database.
 */
public class DBLoader {

    /**
     * Fills {@code ruleMap} with (url, content) pairs from table urlcontent.
     * Best-effort: any error is printed and swallowed, leaving the map as-is
     * (matching the original behavior). Uses try-with-resources so the
     * ResultSet, Statement and Connection are always closed, replacing the
     * original hand-written nested cleanup.
     *
     * @param ruleMap destination map for the rules
     */
    public static void dbLoader(HashMap<String, String> ruleMap) {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            try (Connection conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/urldb", "root", "root");
                 Statement st = conn.createStatement();
                 ResultSet res = st.executeQuery("select url,content from urlcontent")) {
                while (res.next()) {
                    ruleMap.put(res.getString(1), res.getString(2));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Smoke test: load the rules and print how many were found. */
    public static void main(String[] args) {
        HashMap<String, String> map = new HashMap<String, String>();
        dbLoader(map);
        System.out.println(map.size());
    }
}
自定義一個outputformat
/**
 * Custom OutputFormat that routes job output to two destinations: enhanced
 * log lines to one file, to-crawl URLs to another.
 *
 * The framework obtains a RecordWriter via getRecordWriter() once per task
 * and then calls its write(k, v) for every output record.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/13 14:18
 */
public class WebLogEnhanceOutputFormat extends FileOutputFormat<Text, NullWritable> {

    /**
     * Builds the dual-stream RecordWriter.
     *
     * @param taskAttemptContext task context supplying the configuration
     * @return writer that splits records between the two output files
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(taskAttemptContext.getConfiguration());
        // Fixed output locations (local test paths; an HDFS path such as
        // "hdfs://master:9000/flow/enhancelog/enhanced.log" works the same way).
        Path enhancePath = new Path("D:/localHadoop/data/weblogenhance/enhance/log.dat");
        Path tocrawlPath = new Path("D:/localHadoop/data/weblogenhance/crw/url.dat");
        FSDataOutputStream enhancedOs = fs.create(enhancePath);
        FSDataOutputStream tocrawlOs = fs.create(tocrawlPath);
        return new WebLogEnhanceRecordWriter(enhancedOs, tocrawlOs);
    }
}
/**
 * Custom RecordWriter that routes each output line to one of two streams:
 * lines tagged "toCrawl" go to the to-crawl url list, everything else to
 * the enhanced-log file.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/13 14:20
 */
public class WebLogEnhanceRecordWriter extends RecordWriter<Text,NullWritable> {

    private final FSDataOutputStream enhancedOs;
    private final FSDataOutputStream tocrawlOs;

    public WebLogEnhanceRecordWriter(FSDataOutputStream enhancedOs, FSDataOutputStream tocrawlOs) {
        super();
        this.enhancedOs = enhancedOs;
        this.tocrawlOs = tocrawlOs;
    }

    @Override
    public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
        String result = text.toString();
        // Encode explicitly as UTF-8: the previous bare getBytes() used the
        // platform default charset (e.g. GBK on a Chinese Windows machine),
        // which would corrupt non-ASCII log content.
        if (result.contains("toCrawl")) {
            // To-crawl url -> to-crawl list file (e.g. /logenhance/tocrawl/url.dat)
            tocrawlOs.write(result.getBytes("UTF-8"));
        } else {
            // Enhanced log line -> enhanced log file (e.g. /logenhance/enhancedlog/log.dat)
            enhancedOs.write(result.getBytes("UTF-8"));
        }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        if (tocrawlOs != null) {
            tocrawlOs.close();
        }
        if (enhancedOs != null) {
            enhancedOs.close();
        }
    }
}
開發mapreduce
處理流程
/**
 * Enhances each hourly web-log line by appending the analysis result of the
 * URL it refers to (looked up from an external knowledge base loaded once in
 * setup()). URLs with no known result are emitted to the to-crawl list.
 *
 * @author
 */
public class WebLogEnhancerMapper extends Mapper<LongWritable,Text,Text,NullWritable> {

    Map<String, String> ruleMap = new HashMap<String, String>();
    Text k = new Text();
    NullWritable v = NullWritable.get();

    /**
     * Loads the url -> content rules from the database into ruleMap,
     * once per task before any map() call.
     *
     * @param context task context (unused)
     * @throws IOException never thrown here; required by the contract
     * @throws InterruptedException never thrown here; required by the contract
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        DBLoader.dbLoader(ruleMap);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Global counter for malformed lines: group "malformed", counter "malformedLine".
        Counter counter = context.getCounter("malformed", "malformedLine");
        String line = value.toString();
        String[] fields = StringUtils.split(line,"\t");
        // Guard against short lines: the unchecked fields[26] access used to
        // throw ArrayIndexOutOfBoundsException and kill the task (the old
        // try/catch only caught — and silently swallowed — IOException and
        // InterruptedException from context.write; those now propagate to the
        // framework as they should).
        if (fields == null || fields.length <= 26) {
            counter.increment(1);
            return;
        }
        String url = fields[26];
        String content_tag = ruleMap.get(url);
        // If the knowledge base has no entry, emit the url to the to-crawl
        // list (or count it as invalid); otherwise emit the enhanced line.
        if (content_tag == null) {
            if (url.startsWith("http://")) {
                k.set(url + "\t" + "toCrawl" + "\n");
            } else {
                k.set(url + "\t" + "inValid" + "\n");
                counter.increment(1);
            }
            context.write(k, v);
        } else {
            // Enhanced output: original line + tab + knowledge-base content.
            k.set(line + "\t" + content_tag + "\n");
            context.write(k, v);
        }
    }
}
/**
 * Driver for the web-log enhancement job. Processing flow:
 * 1. Read records from the raw log files.
 * 2. Use a URL field of each record to fetch enrichment data from an
 *    external knowledge base and append it to the record.
 * 3. Successfully enhanced records go to the enhanced-output directory;
 *    failures have their URL extracted to the to-crawl directory.
 *
 * The key point is writing two kinds of results to different directories
 * from a single MapReduce program, achieved with a custom OutputFormat.
 *
 * Implementation notes:
 * 1. External resources are accessed from within MapReduce (setup()).
 * 2. A custom OutputFormat overrides the RecordWriter and its write() method.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/12 9:15
 */
public class WebLogEnhancerDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WebLogEnhancerDriver.class);
        job.setMapperClass(WebLogEnhancerMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Route different content to different target paths via the custom output format.
        job.setOutputFormatClass(WebLogEnhanceOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("D:/localHadoop/data/weblogenhance/input"));
        // Although a custom output format is used, it extends FileOutputFormat,
        // which must write a _SUCCESS marker, so an output path is still required.
        FileOutputFormat.setOutputPath(job, new Path("D:/localHadoop/data/weblogenhance/output"));
        // Map-only job: no reducer means no shuffle at all.
        job.setNumReduceTasks(0);
        // Exit code now reflects job success; it was previously always 0
        // regardless of whether the job failed.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
自定義GroupingComparator
有如下訂單數據,現在需要求出每一個訂單中成交金額最大的一筆交易
訂單id | 商品id | 成交金額 |
---|---|---|
Order_0000001 | Pdt_01 | 222.8 |
Order_0000001 | Pdt_05 | 25.8 |
Order_0000002 | Pdt_03 | 522.8 |
Order_0000002 | Pdt_04 | 122.4 |
Order_0000002 | Pdt_05 | 722.4 |
Order_0000003 | Pdt_01 | 222.8 |
分析
方法1. 訂單id作爲key 其它數據作爲value,同一訂單發往同一reducer,reducer自定義排序,找出最大一筆交易
缺點:需要緩存所有訂單數據,資源消耗大,效率不高
方法2. 使用mapreduce框架排序機制
- 利用“訂單id和成交金額”作爲
key
,可以將map
階段讀取到的所有訂單數據按照id分區,按照金額排序,發送到reduce
- 在
reduce
端利用groupingcomparator
將訂單id
相同的kv
聚合成組,然後取第一個即是最大值
代碼
定義訂單Bean
/**
 * Order record used as the MapReduce key.
 *
 * Hadoop writable wrapper types (Text / DoubleWritable) are kept as fields
 * so their compareTo() implementations can be reused for shuffle sorting.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/11 16:04
 */
public class OrderBean implements WritableComparable<OrderBean> {

    private Text orderId;
    private Text productName;
    private DoubleWritable amount;

    public OrderBean() {
    }

    /** Populates all three fields in a single call (object-reuse friendly). */
    public void set(Text orderId, Text productName, DoubleWritable amount) {
        this.orderId = orderId;
        this.productName = productName;
        this.amount = amount;
    }

    public Text getOrderId() {
        return orderId;
    }

    public void setOrderId(Text orderId) {
        this.orderId = orderId;
    }

    public Text getProductName() {
        return productName;
    }

    public void setProductName(Text productName) {
        this.productName = productName;
    }

    public DoubleWritable getAmount() {
        return amount;
    }

    public void setAmount(DoubleWritable amount) {
        this.amount = amount;
    }

    /**
     * Shuffle-time ordering within a partition: ascending by orderId, and
     * inside the same order descending by amount, so the largest sale of
     * each order arrives first at the reducer.
     *
     * @param other bean to compare against
     * @return negative/zero/positive per the Comparable contract
     */
    @Override
    public int compareTo(OrderBean other) {
        int byId = orderId.compareTo(other.getOrderId());
        if (byId != 0) {
            return byId;
        }
        // Operands reversed on purpose -> descending amount order.
        return other.getAmount().compareTo(this.amount);
    }

    @Override
    public void write(DataOutput output) throws IOException {
        // Serialize as plain UTF strings + a double; readFields mirrors this.
        output.writeUTF(orderId.toString());
        output.writeUTF(productName.toString());
        output.writeDouble(amount.get());
    }

    @Override
    public void readFields(DataInput input) throws IOException {
        // Must read fields in exactly the order write() emitted them.
        orderId = new Text(input.readUTF());
        productName = new Text(input.readUTF());
        amount = new DoubleWritable(input.readDouble());
    }

    @Override
    public String toString() {
        return "MaxOrder:" + "orderId=" + orderId + ", productName=" + productName + ", amount=" + amount;
    }
}
自定義GroupingComparator
用來對分區內數據進行分組
/**
 * Reduce-side GroupingComparator that makes a set of OrderBean keys look
 * like one single key: beans with equal orderId are fed to the same
 * reduce() invocation as one group.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/11 16:02
 */
public class OrderIdGroupingComparator extends WritableComparator{

    /**
     * Registers OrderBean as the key class and asks the framework to
     * instantiate key objects reflectively for comparison.
     */
    public OrderIdGroupingComparator() {
        super(OrderBean.class,true);
    }

    /** Groups solely by orderId; the other fields are ignored here. */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Text left = ((OrderBean) a).getOrderId();
        Text right = ((OrderBean) b).getOrderId();
        return left.compareTo(right);
    }
}
自定義分區方法OrderIdPartitioner
對數據進行分區
/**
 * Custom partitioner: routes every OrderBean with the same orderId to the
 * same partition (and therefore the same reduce task).
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/11 16:18
 */
public class OrderIdPartitioner extends Partitioner<OrderBean,NullWritable> {
    /**
     * Beans with the same orderId land in the same partition; the number of
     * partitions equals the configured number of reduce tasks.
     *
     * @param orderBean the map output key
     * @param nullWritable the map output value (unused)
     * @param numReduceTasks number of reduce tasks
     * @return partition index in [0, numReduceTasks)
     */
    @Override
    public int getPartition(OrderBean orderBean, NullWritable nullWritable, int numReduceTasks) {
        // Bug fix: the original expression
        //   hashCode() & Integer.MAX_VALUE % numReduceTasks
        // binds as hashCode() & (Integer.MAX_VALUE % numReduceTasks) because
        // '%' has higher precedence than '&'. For numReduceTasks > 2 that
        // collapses to a 1-bit mask and leaves partitions unused. The intent
        // is: clear the sign bit first, then take the modulus.
        return (orderBean.getOrderId().hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}
mapreduce
程序
/**
 * Parses one order line per call, e.g. (comma-separated):
 *   orderId,productId,amount
 *   Order_0000001,Pdt_01,222.8
 * and emits an OrderBean key (NullWritable value).
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/11 16:02
 */
public class MaximumProductMapper extends Mapper<LongWritable,Text,OrderBean,NullWritable>{

    OrderBean orderBean = new OrderBean();
    NullWritable v = NullWritable.get();
    // Reusable wrapper objects: the original allocated three new Writables
    // per input record. Reuse is safe because context.write() serializes the
    // key immediately, before the next map() call mutates these objects.
    private final Text orderId = new Text();
    private final Text productName = new Text();
    private final DoubleWritable amount = new DoubleWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(",");
        orderId.set(fields[0]);
        productName.set(fields[1]);
        amount.set(Double.parseDouble(fields[2]));
        orderBean.set(orderId, productName, amount);
        context.write(orderBean, v);
    }
}
/**
 * Emits the first (i.e. largest-amount) record of each order group.
 *
 * @author: lxf
 * @version: v1.0
 * @date: 2018/4/11 16:02
 */
public class MaximumProductReducer extends Reducer<OrderBean,NullWritable,OrderBean,NullWritable>{
    /**
     * By the time reduce() runs, all beans with the same orderId form one
     * group, and (thanks to OrderBean.compareTo) the one with the largest
     * amount is sorted first — so writing the group key once yields the
     * maximum sale per order. The framework does NOT pre-scan the whole
     * group; it iterates the sorted partition lazily:
     * 1. a key is passed in, then hasNext() checks for a following record;
     * 2. if one exists, the GroupingComparator decides whether it belongs
     *    to the same group;
     * 3. same group: keep iterating from step 2;
     * 4. different group: a new reduce() call starts at step 1.
     *
     * NOTE(review): `key` is the bean of the FIRST record in the group,
     * which by the sort order is the maximum — writing it once is the whole
     * algorithm.
     *
     * @param key first (max-amount) bean of the current order group
     * @param values iterator over the group's NullWritable placeholders (unused)
     * @param context output sink
     * @throws IOException on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(key,NullWritable.get());
    }
}
/**
 * Driver: finds the largest single sale of every order by combining a
 * custom partitioner (same orderId -> same partition), the shuffle sort
 * (descending amount within an order) and a GroupingComparator (group by
 * orderId on the reduce side).
 */
public class MaximumProductDriver {

    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MaximumProductDriver.class);
        job.setMapperClass(MaximumProductMapper.class);
        job.setReducerClass(MaximumProductReducer.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:/localHadoop/data/groupComparator/input"));
        FileOutputFormat.setOutputPath(job, new Path("D:/localHadoop/data/groupComparator/output"));
        // Custom GroupingComparator: merges same-orderId keys into one reduce group.
        job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
        // Custom partitioner: same orderId -> same reduce task.
        job.setPartitionerClass(OrderIdPartitioner.class);
        job.setNumReduceTasks(2);
        // Propagate job success/failure via the process exit code; the
        // result of waitForCompletion was previously discarded.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
計數器應用
在實際生產代碼中,常常需要將數據處理過程中遇到的不合規數據行進行全局計數,類似這種需求可以藉助mapreduce
框架中提供的全局計數器來實現
public class MultiOutputs {
//通過枚舉形式定義自定義計數器
enum MyCounter{MALFORORMED,NORMAL}
static class CommaMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Reusable output wrappers: the original allocated a new Text and a new
    // LongWritable for every word. Reuse is safe because context.write()
    // serializes the pair before the next mutation.
    private final Text word = new Text();
    private final LongWritable one = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] words = value.toString().split(",");
        for (String w : words) {
            word.set(w);
            context.write(word, one);
        }
        // Increment the enum-defined custom counter once per input line.
        context.getCounter(MyCounter.MALFORORMED).increment(1);
        // Increment a dynamically named counter (group name, counter name).
        context.getCounter("counterGroupa", "countera").increment(1);
    }
}
多job串聯
一個稍複雜點的處理邏輯往往需要多個mapreduce
程序串聯處理,多job的串聯可以藉助mapreduce
框架的JobControl
實現
// Wrap each Job in a ControlledJob so dependencies between them can be declared.
ControlledJob cJob1 = new ControlledJob(job1.getConfiguration());
ControlledJob cJob2 = new ControlledJob(job2.getConfiguration());
ControlledJob cJob3 = new ControlledJob(job3.getConfiguration());
cJob1.setJob(job1);
cJob2.setJob(job2);
cJob3.setJob(job3);
// Declare the dependency chain: job1 -> job2 -> job3
// (a job only starts once all jobs it depends on have succeeded).
cJob2.addDependingJob(cJob1);
cJob3.addDependingJob(cJob2);
JobControl jobControl = new JobControl("RecommendationJob");
jobControl.addJob(cJob1);
jobControl.addJob(cJob2);
jobControl.addJob(cJob3);
// JobControl implements Runnable: run it on its own thread, then poll
// until every job in the control group has finished.
Thread jobControlThread = new Thread(jobControl);
jobControlThread.start();
while (!jobControl.allFinished()) {
    Thread.sleep(500);
}
jobControl.stop();
return 0;
方法2,使用shell
腳本自定義執行順序,方便組合