MapReduce I

MapReduce

MR: a programming model. A map phase transforms input records into intermediate key/value pairs, and a reduce phase aggregates all values that share a key.

WordCountMR

1. Write the Mapper

    package com.hadoop.mr;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import java.io.IOException;

    /**
     * WordCountMapper: emits (word, 1) for every word in each input line.
     */
    public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] arr = line.split(" ");

            Text keyOut = new Text();
            IntWritable valueOut = new IntWritable(1);
            for (String word : arr) {
                keyOut.set(word);
                context.write(keyOut, valueOut);
            }
        }
    }
2. Write the Reducer

    package com.hadoop.mr;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    import java.io.IOException;

    /**
     * WordCountReducer: sums the counts emitted for each word.
     */
    public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable iw : values) {
                count += iw.get();
            }
            context.write(key, new IntWritable(count));
        }
    }
3. Write the App (driver)

    package com.hadoop.mr;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    /**
     * App: configures and submits the WordCount job.
     */
    public class App {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);

            job.setJobName("WordCount");
            job.setJarByClass(App.class);

            job.setMapperClass(WordCountMapper.class);
            job.setReducerClass(WordCountReducer.class);

            // Add the input path
            FileInputFormat.addInputPath(job, new Path(args[0]));
            // Set the output path (must not already exist)
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            // Set the map output key/value types
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // Set the final (reduce) output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            job.setNumReduceTasks(2);

            job.waitForCompletion(true);
        }
    }
4. Run (locally)
    Run App directly from the IDE, passing the input file and the output directory as program arguments.

5. Export the jar and run it on the Hadoop cluster
    5.1) Export the jar
    5.2) Deploy it to CentOS
    5.3) Start the YARN cluster
        start-yarn.sh

    5.4) Check the YARN web UI
        http://s100:8088/

    5.5) Prepare the data

    5.6) Execute the job
        hadoop jar my-hadoop-day04.jar com.hadoop.mr.App /user/centos/1.txt /user/centos/out
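
    With two reduce tasks configured, the output directory should contain two part files (part-r-00000 and part-r-00001). A quick way to inspect one, assuming the paths above:

        hdfs dfs -cat /user/centos/out/part-r-00000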

Combiner

A combiner is a map-side reduce: it pre-aggregates the output of each map task, partition by partition, before the shuffle, which cuts network traffic. For WordCount the reducer itself can double as the combiner, as shown below.
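
Because summing counts is associative and commutative, registering the existing reducer as the combiner is safe; the one-liner would go in App:

    job.setCombinerClass(WordCountReducer.class);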

Mapper

The Mapper's run() template method drives the task lifecycle (paraphrasing the Hadoop source):

    public void run(Context context) throws IOException, InterruptedException {
        setup(context);
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
        cleanup(context);
    }

Reducer

Likewise, Reducer.run() calls reduce() once per key, handing it the key and an Iterable over that key's values:

    public void run(Context context) throws IOException, InterruptedException {
        setup(context);
        while (context.nextKey()) {
            reduce(context.getCurrentKey(), context.getValues(), context);
        }
        cleanup(context);
    }
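
Since setup() and cleanup() run once per task, they are the natural place for per-task initialization and teardown. A hypothetical override (same imports as WordCountMapper):

    public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text keyOut;

        @Override
        protected void setup(Context context) {
            // Runs once, before the first map() call.
            keyOut = new Text();
        }

        @Override
        protected void cleanup(Context context) {
            // Runs once, after the last map() call.
        }
    }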

In local mode, the job staging directory looks like:

    file:/tmp/hadoop-Administrator/mapred/staging/Administrator897294152/.staging

splitSize

The split size is derived from minSplitSize, maxSplitSize, and blockSize:

    splitSize = max(minSplitSize, min(maxSplitSize, blockSize))

Example: min = 7, max = 7, block = 32M, so splitSize = 7.
A 24-byte file then yields 24 / 7 ≈ 3.43, i.e. 4 splits (7 + 7 + 7 + 3 bytes; FileInputFormat only lets the final split run up to 1.1 × splitSize).
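
The rule above matches computeSplitSize() in FileInputFormat:

    protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }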

hello world
h#ello world

The '#' marks where a split boundary can fall in the middle of a line. This exercises the split computation rule: when a boundary cuts a line, the record reader of the earlier split reads through to the end of that line, and the next split skips its first, partial line, so each line is processed exactly once.

1. Modify the split min/max

    job.getConfiguration().set("mapreduce.input.fileinputformat.split.minsize", "14");
    job.getConfiguration().set("mapreduce.input.fileinputformat.split.maxsize", "14");

    // Equivalent helper methods:
    // FileInputFormat.setMinInputSplitSize(job, 7);
    // FileInputFormat.setMaxInputSplitSize(job, 7);

MultipleInputs

Multiple inputs: combine several input paths in one job, each with its own InputFormat and Mapper; see the sketch below.
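
A minimal sketch of wiring two inputs into a single job; the paths and the mapper classes (MapperA, MapperB) are hypothetical:

    import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

    MultipleInputs.addInputPath(job, new Path("/data/a"), TextInputFormat.class, MapperA.class);
    MultipleInputs.addInputPath(job, new Path("/data/b"), TextInputFormat.class, MapperB.class);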

DBInputFormat

Splits a database table by row ranges:

    count           // total number of records, e.g. 100
    int chunks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);        // number of map tasks, e.g. 3
    count / chunks  // records per split, e.g. 33 (the last split absorbs the remainder)

1. Implement a DBWritable class

    package com.hadoop.mr.input.db;

    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.mapreduce.lib.db.DBWritable;

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;

    /**
     * MyDBWritable: maps one row of the query result to fields.
     */
    public class MyDBWritable implements DBWritable, Writable {
        public int id;
        public String orderno;
        public float price;
        public int cid;

        // Only needed when writing back to the database; empty for input-only use.
        public void write(PreparedStatement statement) throws SQLException {
        }

        // Read one row from the query's result set.
        public void readFields(ResultSet rs) throws SQLException {
            this.id = rs.getInt("id");
            this.orderno = rs.getString("orderno");
            this.price = rs.getFloat("price");
            this.cid = rs.getInt("cid");
        }

        // Serialize
        public void write(DataOutput out) throws IOException {
            out.writeInt(id);
            out.writeUTF(orderno);
            out.writeFloat(price);
            out.writeInt(cid);
        }

        // Deserialize
        public void readFields(DataInput in) throws IOException {
            this.id = in.readInt();
            this.orderno = in.readUTF();
            this.price = in.readFloat();
            this.cid = in.readInt();
        }
    }
2. Customize DBInputFormat

    package com.hadoop.mr.input.db;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.MRJobConfig;
    import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;

    import java.io.IOException;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.sql.Statement;
    import java.util.ArrayList;
    import java.util.List;

    /**
     * MyDBInputFormat: overrides getSplits() to control the number of chunks.
     */
    public class MyDBInputFormat extends DBInputFormat<MyDBWritable> {
        public void setConf(Configuration conf) {
            super.setConf(conf);
        }

        public List<InputSplit> getSplits(JobContext job) throws IOException {

            ResultSet results = null;
            Statement statement = null;
            try {
                // The connection field is private in DBInputFormat; use the accessor.
                statement = getConnection().createStatement();

                results = statement.executeQuery(getCountQuery());
                results.next();

                long count = results.getLong(1);
                int chunks = 3; // job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
                long chunkSize = (count / chunks);

                results.close();
                statement.close();

                List<InputSplit> splits = new ArrayList<InputSplit>();

                // Split the rows into n-number of chunks and adjust the last chunk
                // accordingly
                for (int i = 0; i < chunks; i++) {
                    DBInputSplit split;

                    if ((i + 1) == chunks)
                        split = new DBInputSplit(i * chunkSize, count);
                    else
                        split = new DBInputSplit(i * chunkSize, (i * chunkSize) + chunkSize);

                    splits.add(split);
                }

                getConnection().commit();
                return splits;
            } catch (SQLException e) {
                throw new IOException("Got SQLException", e);
            } finally {
                try {
                    if (results != null) {
                        results.close();
                    }
                } catch (SQLException e1) {
                }
                try {
                    if (statement != null) {
                        statement.close();
                    }
                } catch (SQLException e1) {
                }

                closeConnection();
            }
        }
    }
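
Neither step shows how the job learns about the database. A minimal driver-side sketch (the JDBC driver, URL, credentials, table, and queries are assumptions for a local MySQL setup; DBConfiguration and DBInputFormat live in org.apache.hadoop.mapreduce.lib.db):

    // Hypothetical connection settings.
    DBConfiguration.configureDB(job.getConfiguration(),
            "com.mysql.jdbc.Driver",
            "jdbc:mysql://localhost:3306/mydb",
            "root", "root");

    // Registers DBInputFormat plus the data/count queries used by getSplits().
    DBInputFormat.setInput(job, MyDBWritable.class,
            "select id, orderno, price, cid from orders",
            "select count(*) from orders");

    // Override the format registered by setInput() with the custom subclass.
    job.setInputFormatClass(MyDBInputFormat.class);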

Output formats

1. TextOutputFormat
    the default format
2. SequenceFileOutputFormat
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

MR features

1. Counters
    A lightweight debugging aid (an alternative to remote debugging); see the usage sketch after this list.

2. Counter limit
    [mapred-site.xml]
    mapreduce.job.counters.limit=120

3. Counter name length limit (64 characters, hard-coded)
    e.g. a long diagnostic counter name:
    192.168.11.113:13932:pool-3-thread-1:DBReduce@546740333:reduce()
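
A counter usage sketch inside a task; the group and counter names are illustrative:

    // Inside map() or reduce(): create (or fetch) and increment a custom counter.
    context.getCounter("debug", "bad-records").increment(1);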