1.HDFS數據完整性
HDFS會對寫入的所有數據計算校驗和,並在讀取數據時驗證校驗和,默認的校驗方式是CRC-32。
不只是讀寫數據時會進行校驗,datanode也會在後臺線程中運行DataBlockScanner進行校驗,定期檢查數據的缺失情況。
客戶端讀寫數據時,發現數據損壞了,向namenode彙報,拋出ChecksumException,namenode將該datanode上的數據轉移到其他的datanode,最後刪除該損壞的數據塊。
在使用open()方法讀取文件之前,將false傳給FileSystem對象的setVerifyChecksum()方法,即可禁用校驗和驗證。如果你讀取的文件是損壞的,那麼在文件被刪除之前,你還可以恢復部分數據,以免該datanode轉移數據失敗後直接被刪除。
2.LocalFileSystem
LocalFileSystem繼承於ChecksumFileSystem,
ChecksumFileSystem繼承於FilterFileSystem,
FilterFileSystem繼承於FileSystem。
LocalFileSystem可以對文件進行校驗。
3.幾種壓縮方式
4.讀取壓縮數據
codecClassName加載壓縮格式。
5.對文件進行壓縮
6.壓縮解壓線程池
7.壓縮文件的處理
hdfs中每片數據128m,1GB的gzip壓縮文件會被分成8片,但只會被一個map任務處理,因爲gzip壓縮文件無法切片後對每一片單獨處理。
但是我們處理大文件時一定要確保數據格式可以被切片:
8.對mapreduce的結果文件進行壓縮
9.定製comparator和WritableComparable接口
下面的程序實現了:對每個id相同的商品中挑選出價格最貴的那個。
這裏map和reduce期間經過了兩個過程:
1.因爲實現了WritableComparable接口,故會對其排序
2.因爲繼承了WritableComparator類,會將相同的key放在一起
查找源碼可以發現,WritableComparator調用compare方法實質是調用WritableComparable接口的compareTo方法進行比較。
package com.qianliu.bigdata.mr.secondarysort;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* 利用reduce端的GroupingComparator來實現將一組bean看成相同的key,相當於自定義shuffle的分組規則
* @author
*
*/
/**
 * Reduce-side grouping comparator: treats every OrderBean sharing the same
 * itemid as one key, so a whole item group reaches a single reduce() call
 * even though the full sort key also carries the amount.
 */
public class ItemidGroupingComparator extends WritableComparator {

    /** Registers OrderBean and asks the framework to create instances via reflection. */
    protected ItemidGroupingComparator() {
        super(OrderBean.class, true);
    }

    /**
     * Compares only the itemid portion of the two beans; equal itemids mean
     * the keys belong to the same reduce group. Without this override the
     * default comparison would use the full itemid+amount key.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean left = (OrderBean) a;
        OrderBean right = (OrderBean) b;
        return left.getItemid().compareTo(right.getItemid());
    }
}
package com.qianliu.bigdata.mr.secondarysort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite MapReduce key: an item id plus an amount (price).
 *
 * Sort order: itemid ascending, then amount DESCENDING, so within one item
 * the highest-priced record is the first key delivered to the reducer.
 *
 * Fix: the original implemented compareTo but overrode neither equals nor
 * hashCode; both are now provided and kept consistent with compareTo, as the
 * Comparable contract recommends for objects used as keys.
 */
public class OrderBean implements WritableComparable<OrderBean> {

    private Text itemid;           // item (order) id
    private DoubleWritable amount; // item price

    /** No-arg constructor required by Hadoop serialization. */
    public OrderBean() {
    }

    public OrderBean(Text itemid, DoubleWritable amount) {
        set(itemid, amount);
    }

    /** Resets both fields; lets one bean instance be reused across map() calls. */
    public void set(Text itemid, DoubleWritable amount) {
        this.itemid = itemid;
        this.amount = amount;
    }

    public Text getItemid() {
        return itemid;
    }

    public DoubleWritable getAmount() {
        return amount;
    }

    /**
     * Orders by itemid ascending; ties are broken by amount descending.
     * The negation flips DoubleWritable's natural ascending order so larger
     * amounts sort first within the same item.
     */
    @Override
    public int compareTo(OrderBean o) {
        int cmp = this.itemid.compareTo(o.getItemid());
        if (cmp == 0) {
            cmp = -this.amount.compareTo(o.getAmount());
        }
        return cmp;
    }

    /** Serializes itemid as a UTF string followed by the amount as a double. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(itemid.toString());
        out.writeDouble(amount.get());
    }

    /** Deserializes in the same order as write(): itemid first, then amount. */
    @Override
    public void readFields(DataInput in) throws IOException {
        String readUTF = in.readUTF();
        double readDouble = in.readDouble();
        this.itemid = new Text(readUTF);
        this.amount = new DoubleWritable(readDouble);
    }

    /** Consistent with compareTo: equal iff itemid and amount both match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof OrderBean)) {
            return false;
        }
        return compareTo((OrderBean) obj) == 0;
    }

    @Override
    public int hashCode() {
        // Fully qualified to avoid touching this file's import block.
        return java.util.Objects.hash(itemid, amount);
    }

    @Override
    public String toString() {
        return itemid.toString() + "\t" + amount.get();
    }
}
package com.qianliu.bigdata.mr.secondarysort;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Routes every OrderBean carrying the same item id to the same reduce task;
 * the number of partitions produced matches the configured reduce task count.
 */
public class ItemIdPartitioner extends Partitioner<OrderBean, NullWritable> {

    @Override
    public int getPartition(OrderBean bean, NullWritable value, int numReduceTasks) {
        // Mask the sign bit so the hash is non-negative before bucketing.
        int nonNegativeHash = bean.getItemid().hashCode() & Integer.MAX_VALUE;
        return nonNegativeHash % numReduceTasks;
    }
}
package com.qianliu.bigdata.mr.secondarysort;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.sun.xml.bind.v2.schemagen.xmlschema.List;
/**
*
* @author [email protected]
*
*/
/**
 * Secondary-sort driver: for each item id, emits only the record with the
 * highest amount.
 *
 * Pipeline: the mapper emits OrderBean(itemid, amount) keys; OrderBean's
 * compareTo sorts by itemid asc / amount desc, ItemIdPartitioner sends equal
 * itemids to one reducer, and ItemidGroupingComparator collapses each itemid
 * into a single reduce group — so the first key of every group is the max.
 *
 * Fixes: input/output paths may now be passed as args[0]/args[1] (the
 * original hard-coded paths remain the defaults), and the previously ignored
 * waitForCompletion() result is propagated as the process exit code.
 */
public class SecondarySort {

    static class SecondarySortMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
        // Reused across map() calls; safe because the framework serializes the key on write.
        OrderBean bean = new OrderBean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = StringUtils.split(line, ",");
            // fields[0] = item id, fields[2] = amount.
            bean.set(new Text(fields[0]), new DoubleWritable(Double.parseDouble(fields[2])));
            context.write(bean, NullWritable.get());
        }
    }

    static class SecondarySortReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
        // Between map and reduce:
        // 1. Keys are sorted via OrderBean.compareTo (itemid asc, amount desc).
        // 2. ItemidGroupingComparator groups all beans sharing an itemid into one call.
        // Therefore the key seen here is the highest-amount bean of its group.
        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        // Paths may come from the command line; defaults preserve the original behavior.
        String input = args.length > 0 ? args[0]
                : "E:\\IDEA\\MapReduceLocalhost\\secondarysort\\input";
        String output = args.length > 1 ? args[1]
                : "E:\\IDEA\\MapReduceLocalhost\\secondarysort\\output";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SecondarySort.class);

        job.setMapperClass(SecondarySortMapper.class);
        job.setReducerClass(SecondarySortReducer.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Custom grouping comparator: reduce groups are formed by itemid only.
        job.setGroupingComparatorClass(ItemidGroupingComparator.class);
        // Custom partitioner: same itemid -> same partition.
        job.setPartitionerClass(ItemIdPartitioner.class);
        job.setNumReduceTasks(2);

        // Propagate job success/failure (the original discarded this result).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}