Hadoop编程

Hadoop思维导图下载链接

实例

/opt/sxt/hadoop-2.6.5/share/hadoop/mapreduce/

jar包

hadoop-mapreduce-examples-2.6.5.jar

准备

for i in `seq 100000`;do echo "hello sxt $i" >> test.txt;done
hdfs dfs -mkdir -p /user/root
hdfs dfs -ls -R /
hdfs dfs -D dfs.blocksize=1048576 -put ./test.txt /user/root

命令

hadoop jar hadoop-mapreduce-examples-2.6.5.jar wordcount /input /output
- wordcount为主程序
- *input:是hdfs文件系统中数据所在的目录
- *output:是hdfs中不存在的目录，mr程序运行的结果会输出到该目录（输出路径不允许放东西）

讲解

以下是输出目录的内容：
-rw-r--r-- 3 root supergroup 0 2017-07-02 02:49 /mr/test/output/_SUCCESS
- /_SUCCESS：是信号/标志文件
-rw-r--r-- 3 root supergroup 49 2017-07-02 02:49 /mr/test/output/part-r-00000
- /part-r-00000：是reduce输出的数据文件，r：reduce的意思，00000是对应的reduce
多个reduce会有多个数据文件

WordCount案例

启动

zkServer.sh start
start-dfs.sh
yarn-daemon.sh start resourcemanager
start-yarn.sh

WordCount

MyWC
- package com.sxt.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the word-count job: reads /user/root/test.txt and writes the
 * per-word totals to /output/wordcount.
 */
public class MyWC {

    /**
     * Configures the job, submits it and blocks until it finishes.
     *
     * @param args unused
     * @throws IOException            if the file system or job submission fails
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        // Create a new job; setJarByClass tells Hadoop which jar carries the job classes.
        Job job = Job.getInstance(conf);
        job.setJarByClass(MyWC.class);
        job.setJobName("myjob");

        // Input path.
        Path inPath = new Path("/user/root/test.txt");
        FileInputFormat.addInputPath(job, inPath);

        // Output path: must not exist when the job starts, so remove a stale one first.
        Path outPath = new Path("/output/wordcount");
        if (outPath.getFileSystem(conf).exists(outPath)) {
            outPath.getFileSystem(conf).delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Declare the map-side and the final (reduce-side) types explicitly instead of
        // relying on the map value type falling back to the job output value type.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Submit the job, then poll for progress until it is complete.
        // A failed job is reported through a non-zero exit status.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

}

MyMapper
- package com.sxt.mr.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Word-count mapper: emits (token, 1) for every whitespace-delimited token of
 * the input line.
 */
public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
    // Reused output objects.
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    /**
     * Splits one input line into tokens and writes each token with a count of one.
     *
     * @param key     input key (not used)
     * @param value   one line of text
     * @param context sink for the (token, 1) pairs
     */
    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        for (StringTokenizer tokens = new StringTokenizer(line); tokens.hasMoreTokens(); ) {
            String token = tokens.nextToken();
            word.set(token);
            context.write(word, one);
        }
    }

}

MyReducer
- package com.sxt.mr.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Word-count reducer: adds up all counts received for one word.
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value.
    private IntWritable result = new IntWritable();

    /**
     * Sums the values of one key and writes (key, total).
     *
     * @param key     the word
     * @param values  the counts emitted for that word
     * @param context sink for the total
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total = total + count.get();
        }

        result.set(total);
        context.write(key, result);
    }

}

源码分析

Mapreduce案例

案例一

MyTQ
- package com.bjsxt.tq;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/*
*1949-10-01 14:21:02 34c
*

*/
/**
 * Driver for the weather job: wires the mapper, the custom sort, partition and
 * grouping classes and the reducer together, reading /tq/input and writing /tq/output.
 */
public class MyTQ {

    /**
     * Configures the job, submits it and blocks until it finishes.
     *
     * @param args unused
     * @throws IOException            if the file system or job submission fails
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // 1. Configuration.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(MyTQ.class);
        job.setJobName("tq");

        // 2. Input and output paths; a stale output directory is removed first.
        Path inPath = new Path("/tq/input");
        FileInputFormat.addInputPath(job, inPath);
        Path outPath = new Path("/tq/output");
        if (outPath.getFileSystem(conf).exists(outPath)) {
            outPath.getFileSystem(conf).delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // 3. Mapper with the custom key type. The map output value type is declared
        //    explicitly rather than inherited from the job output value type.
        job.setMapperClass(Tmapper.class);
        job.setMapOutputKeyClass(Tq.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // 4. Custom sort comparator.
        job.setSortComparatorClass(TSortComparator.class);

        // 5. Custom partitioner.
        job.setPartitionerClass(TPartitoner.class);

        // 6. Custom grouping comparator.
        job.setGroupingComparatorClass(TGroupComparator.class);

        // 7. Number of reduce tasks.
        job.setNumReduceTasks(2);

        // 8. Reducer.
        job.setReducerClass(Treducer.class);

        // 9. Run with progress output; report failure through the exit status.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}

TGroupComparator
- package com.bjsxt.tq;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator for the weather job: two keys compare equal when they
 * share year and month.
 */
public class TGroupComparator extends WritableComparator {
    Tq t1 = null;
    Tq t2 = null;

    /** Registers {@code Tq} as the key class and asks the base class to create key instances. */
    public TGroupComparator() {
        super(Tq.class, true);
    }

    /**
     * Orders keys by year, then by month; day and temperature are ignored.
     *
     * @return negative, zero or positive as {@code a} sorts before, with or after {@code b}
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        t1 = (Tq) a;
        t2 = (Tq) b;

        int byYear = Integer.compare(t1.getYear(), t2.getYear());
        if (byYear != 0) {
            return byYear;
        }
        return Integer.compare(t1.getMonth(), t2.getMonth());
    }

}

Tmapper
- package com.bjsxt.tq;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

public class Tmapper extends Mapper<LongWritable, Text, Tq, IntWritable> {
Tq tkey = new Tq();
IntWritable tval = new IntWritable();

public void map(Object key, Text value, Context context) throws IOException, InterruptedException {

    //获得时间、温度数组
    String[] words = StringUtils.split(value.toString(), '\t');
    String pattern = "yyyy-MM-dd";
    SimpleDateFormat sdf = new SimpleDateFormat(pattern);
    try {
        Date date = sdf.parse(words[0]);
        Calendar cal = Calendar.getInstance();
        cal.setTime(date);

        tkey.setYear(cal.get(Calendar.YEAR));
        tkey.setMonth(cal.get(Calendar.MONTH) + 1);
        tkey.setDay(cal.get(Calendar.DAY_OF_MONTH));
        int wd = Integer.parseInt(words[1].substring(0, words[1].lastIndexOf("c")));
        tkey.setWd(wd);

        tval.set(wd);
        context.write(tkey, tval);

    } catch (ParseException e) {
        e.printStackTrace();
    }


}

}

TPartitoner
- package com.bjsxt.tq;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner for the weather job: routes a record by the year stored in its key.
 */
public class TPartitoner extends Partitioner<Tq, IntWritable> {

    /**
     * @param key           the composite weather key
     * @param value         the temperature (not used)
     * @param numPartitions number of partitions
     * @return the key's year modulo {@code numPartitions}
     */
    @Override
    public int getPartition(Tq key, IntWritable value, int numPartitions) {
        int year = key.getYear();
        return year % numPartitions;
    }

}

Tq
- package com.bjsxt.tq;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite key of the weather job: a date (year, month, day) plus the
 * temperature {@code wd} recorded on it.
 *
 * <p>The interface is parameterized with {@code Tq}; with the raw type,
 * {@code compareTo(Tq)} would not override {@code compareTo(Object)} and the
 * class would not compile.
 */
public class Tq implements WritableComparable<Tq> {

    private int year;
    private int month;
    private int day;
    private int wd;

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public int getDay() {
        return day;
    }

    public void setDay(int day) {
        this.day = day;
    }

    public int getWd() {
        return wd;
    }

    public void setWd(int wd) {
        this.wd = wd;
    }

    /** @return the date as {@code year-month-day}, without zero padding */
    @Override
    public String toString() {
        return year + "-" + month + "-" + day;
    }

    /** Writes year, month, day and temperature as four consecutive ints. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(this.getYear());
        dataOutput.writeInt(this.getMonth());
        dataOutput.writeInt(this.getDay());
        dataOutput.writeInt(this.getWd());
    }

    /** Reads the four ints in the order {@link #write(DataOutput)} produced them. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.setYear(dataInput.readInt());
        this.setMonth(dataInput.readInt());
        this.setDay(dataInput.readInt());
        this.setWd(dataInput.readInt());
    }

    /**
     * Natural order: by year, then month, then day. The temperature does not take part.
     */
    @Override
    public int compareTo(Tq o) {
        int c1 = Integer.compare(this.getYear(), o.getYear());

        if (c1 == 0) {
            int c2 = Integer.compare(this.getMonth(), o.getMonth());

            if (c2 == 0) {
                return Integer.compare(this.getDay(), o.getDay());
            }
            return c2;
        }
        return c1;
    }

}

Treducer
- package com.bjsxt.tq;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
1949-10-01 34
1949-10-02 34
1949-10-03 34
1949-10-05 34

*/
/**
 * Weather reducer: for each group (one year and month) it writes the first
 * record and the first following record that falls on a different day,
 * i.e. at most two lines per group.
 */
public class Treducer extends Reducer<Tq, IntWritable, Text, IntWritable> {

    // Reused output objects.
    Text tkey = new Text();
    IntWritable tval = new IntWritable();

    /**
     * Emits up to two (date, temperature) lines for one group.
     *
     * <p>The "first record seen" state lives outside the loop; declared inside it
     * would be reset on every value and every record would be written.
     * The loop reads {@code key.getDay()} on each iteration, which relies on the
     * framework updating {@code key} as the value iterator advances.
     *
     * @param key     composite key of the record currently under the iterator
     * @param values  temperatures of the group, in sort order
     * @param context sink for the selected records
     */
    @Override
    protected void reduce(Tq key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        boolean firstWritten = false;
        int firstDay = 0;

        for (IntWritable val : values) {
            if (!firstWritten) {
                // First record of the group.
                tkey.set(key.toString());
                tval.set(val.get());
                context.write(tkey, tval);
                firstWritten = true;
                firstDay = key.getDay();
            } else if (key.getDay() != firstDay) {
                // First record on another day: write it and stop.
                tkey.set(key.toString());
                tval.set(val.get());
                context.write(tkey, tval);
                return;
            }
        }
    }

}

TSortComparator
- package com.bjsxt.tq;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/*
*

实现天气年月正序，温度倒序
*/

/**
 * Sort comparator for the weather job: year and month ascending, temperature
 * descending, so the hottest record of a month comes first in its group.
 */
public class TSortComparator extends WritableComparator {

    Tq t1 = null;
    Tq t2 = null;

    /** Registers {@code Tq} as the key class and asks the base class to create key instances. */
    public TSortComparator() {
        super(Tq.class, true);
    }

    /**
     * Compares by year, then month (both ascending), then temperature (descending).
     *
     * @return negative, zero or positive as {@code a} sorts before, with or after {@code b}
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        t1 = (Tq) a;
        t2 = (Tq) b;

        int c1 = Integer.compare(t1.getYear(), t2.getYear());
        if (c1 == 0) {
            int c2 = Integer.compare(t1.getMonth(), t2.getMonth());
            if (c2 == 0) {
                // Operands swapped on purpose: higher temperature sorts first.
                return Integer.compare(t2.getWd(), t1.getWd());
            }
            return c2;
        }

        return c1;
    }

}

案例二

列表差集
思路
MyFD
- package com.bjsxt.fd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyFD {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(MyFD.class);
    job.setJobName("friend");

    Path inPath = new Path("/fd/input");
    FileInputFormat.addInputPath(job, inPath);
    Path outPath = new Path("/fd/output");
    if (outPath.getFileSystem(conf).exists(outPath)) {
        outPath.getFileSystem(conf).delete(outPath, true);
    }
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(FMapper.class);
    job.setReducerClass(FRudcer.class);


    job.waitForCompletion(true);

}

}

FMapper
- package com.bjsxt.fd;

import java.io.IOException;

/**
 * Friend-recommendation mapper. Each input line is a person followed by that
 * person's friends, separated by spaces. For every line it emits
 * (pair, 0) for each direct friendship and (pair, 1) for each pair of friends
 * that appear together in the list.
 */
public class FMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output objects.
    Text tkey = new Text();
    IntWritable tval = new IntWritable();

    /**
     * @param key     input key (not used)
     * @param value   one line, e.g. {@code world tom hello hadoop cat}
     * @param context sink for the (pair, flag) records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String[] words = StringUtils.split(value.toString(), ' ');

        for (int i = 1; i < words.length; i++) {

            // The line owner and words[i] are direct friends: flag 0.
            tkey.set(getFd(words[0], words[i]));
            tval.set(0);
            context.write(tkey, tval);

            for (int j = i + 1; j < words.length; j++) {

                // words[i] and words[j] share the line owner as a friend: flag 1.
                tkey.set(getFd(words[i], words[j]));
                tval.set(1);
                context.write(tkey, tval);
            }
        }
    }

    /**
     * Builds an order-independent key for a pair of names: the lexicographically
     * smaller name, a colon, then the larger one.
     */
    private String getFd(String a, String b) {
        // Parenthesized so that both branches yield a complete "x:y" key.
        return a.compareTo(b) > 0 ? (b + ":" + a) : (a + ":" + b);
    }

}

FReducer
- package com.bjsxt.fd;

import java.io.IOException;

/**
 * Friend-recommendation mapper. Each input line is a person followed by that
 * person's friends, separated by spaces. For every line it emits
 * (pair, 0) for each direct friendship and (pair, 1) for each pair of friends
 * that appear together in the list.
 *
 * NOTE(review): this listing sits under the "FReducer" heading but declares
 * FMapper again; the reducer that MyFD registers (FRudcer) is not shown here.
 */
public class FMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output objects.
    Text tkey = new Text();
    IntWritable tval = new IntWritable();

    /**
     * @param key     input key (not used)
     * @param value   one line, e.g. {@code world tom hello hadoop cat}
     * @param context sink for the (pair, flag) records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String[] words = StringUtils.split(value.toString(), ' ');

        for (int i = 1; i < words.length; i++) {

            // The line owner and words[i] are direct friends: flag 0.
            tkey.set(getFd(words[0], words[i]));
            tval.set(0);
            context.write(tkey, tval);

            for (int j = i + 1; j < words.length; j++) {

                // words[i] and words[j] share the line owner as a friend: flag 1.
                tkey.set(getFd(words[i], words[j]));
                tval.set(1);
                context.write(tkey, tval);
            }
        }
    }

    /**
     * Builds an order-independent key for a pair of names: the lexicographically
     * smaller name, a colon, then the larger one.
     */
    private String getFd(String a, String b) {
        // Parenthesized so that both branches yield a complete "x:y" key.
        return a.compareTo(b) > 0 ? (b + ":" + a) : (a + ":" + b);
    }

}

PageRank

什么是pagerank

PageRank是Google提出的算法，用于衡量特定网页相对于搜索引擎索引中的其他网页而言的重要程度。
是Google创始人拉里·佩奇和谢尔盖·布林于1997年创造的
PageRank实现了将链接价值概念作为排名因素

计算环境

Hadoop-2.5.2
四台主机
两台NN的HA
两台RM的HA
离线计算框架MapReduce

算法原理（1）

思考超链接在互联网中的作用？
入链 ====给？的投票
- PageRank让链接来“投票”，到一个页面的超链接相当于对该页投一票
入链数量
- 如果一个页面节点接收到的其他网页指向的入链数量越多，那么这个页面越重要
入链质量
- 指向页面A的入链质量不同，质量高的页面会通过链接向其他页面传递更多的权重。所以越是质量高的页面指向页面A，则页面A越重要

算法原理（2）

初始值
- Google的每个页面设置相同的页面价值，即PR值
- pagerank算法给每个页面的PR初始值为1。
迭代计算（收敛）
- Google不断的重复计算每个页面的PageRank。那么经过不断的重复计算，这些页面的PR值会趋向于稳定，也就是收敛的状态。
- 在具体企业应用中怎么样确定收敛标准？
  - 1、每个页面的PR值和上一次计算的PR相等
  - 2、设定一个差值指标（0.0001）。当所有页面和上一次计算的PR差值平均小于该标准时，则收敛。
  - 3、设定一个百分比（99%），当99%的页面和上一次计算的PR相等

算法原理（3）

站在互联网的角度：
- 只出，不入：PR会为0
- 只入，不出：PR会很高
- 直接访问网页
修正PageRank计算公式：增加阻尼系数
- 在简单公式的基础上增加了阻尼系数（damping factor）d
- 一般取值d=0.85。
完整PageRank计算公式：$PR(p_i) = \frac{1-d}{n} + d \sum_{p_j \in M(i)} \frac{PR(p_j)}{L(j)}$
- d：阻尼系数
- M(i)：指向i的页面集合
- L(j)：页面的出链数
- PR(pj)：j页面的PR值
- n：所有页面数

代码实现

RunJob
- package com.bjsxt.pg;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the iterative PageRank computation. Runs one MapReduce job per
 * iteration, feeding the output of iteration i-1 into iteration i, until the
 * averaged change reported through {@link MyCounter#my} drops below a threshold.
 */
public class RunJob {

    /** Counter through which the reducers report the accumulated PR change. */
    public static enum MyCounter {
        my
    }

    /**
     * Runs PageRank iterations until convergence. The loop also stops when an
     * iteration fails or throws, instead of retrying forever with a new index.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        Configuration conf = new Configuration(true);
        // Distributed runs need a jar; a client started outside the cluster
        // (not via "hadoop jar") must also be told where that jar is.
        conf.set("mapreduce.app-submission.cross-platform", "true");

        // Switches from distributed execution to the local single-process runner,
        // so no jar is needed.
        conf.set("mapreduce.framework.name", "local");

        // Convergence threshold for the averaged difference between two iterations.
        double d = 0.0000001;

        int i = 0;
        while (true) {
            i++;
            try {

                // Iteration number, read by the mapper.
                conf.setInt("runCount", i);

                FileSystem fs = FileSystem.get(conf);
                Job job = Job.getInstance(conf);
                //job.setJarByClass(RunJob.class);
                job.setJobName("pr" + i);
                job.setMapperClass(PageRankMapper.class);
                job.setReducerClass(PageRankReduceer.class);
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);

                // For a distributed run started from an arbitrary client location,
                // point the job at its jar here:
                //job.setJar("jar在哪儿，写在这里");
                // Input format that splits each line into key and value itself,
                // replacing the default byte-offset key.
                job.setInputFormatClass(KeyValueTextInputFormat.class);

                Path inputPath = new Path("/data/pagerank/input");

                // From the second iteration on, read the previous iteration's output.
                if (i > 1) {
                    inputPath = new Path("/data/pagerank/output/pr" + (i - 1));
                }
                FileInputFormat.addInputPath(job, inputPath);

                Path outputPath = new Path("/data/pagerank/output/pr" + i);
                if (fs.exists(outputPath)) {
                    fs.delete(outputPath, true);
                }
                FileOutputFormat.setOutputPath(job, outputPath);

                boolean f = job.waitForCompletion(true);
                if (!f) {
                    // The next iteration would read output that was never produced.
                    System.err.println("PageRank iteration " + i + " failed; stopping.");
                    break;
                }

                System.out.println("success.");
                long sum = job.getCounters().findCounter(MyCounter.my).getValue();

                System.out.println(sum);

                // The reducers scale each difference by 1000 and the page count is 4.
                double avgd = sum / 4000.0;
                if (avgd < d) {
                    break;
                }
            } catch (IOException | ClassNotFoundException e) {
                e.printStackTrace();
                break;
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                e.printStackTrace();
                break;
            }
        }
    }

}

Node
- package com.bjsxt.pg;

import org.apache.commons.lang.StringUtils;

import java.io.IOException;
import java.util.Arrays;

/**
 * One page of the PageRank graph: its current rank and the names of the pages
 * it links to. The text form is {@code <rank>\t<name>\t<name>...}.
 */
public class Node {

    // Initial rank of every page.
    private double pageRank = 1.0;
    // Names of the pages this page links to, e.g. B D.
    private String[] adjacentNodeNames;

    public static final char fieldSeparator = '\t';

    public double getPageRank() {
        return pageRank;
    }

    public Node setPageRank(double pageRank) {
        this.pageRank = pageRank;
        return this;
    }

    public String[] getAdjacentNodeNames() {
        return adjacentNodeNames;
    }

    public Node setAdjacentNodeNames(String[] adjacentNodeNames) {
        this.adjacentNodeNames = adjacentNodeNames;
        return this;
    }

    /** @return true when at least one outgoing link is present */
    public boolean containsAdjacentNodes() {
        return adjacentNodeNames != null && adjacentNodeNames.length > 0;
    }

    /**
     * Serializes the node as the rank followed by the tab-separated link names.
     * This is the format {@link #fromMR(String)} parses, so the result can be
     * written as a MapReduce value and read back.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(pageRank);
        if (getAdjacentNodeNames() != null) {
            sb.append(fieldSeparator).append(StringUtils.join(getAdjacentNodeNames(), fieldSeparator));
        }
        return sb.toString();
    }

    /**
     * Parses a value of the form {@code 1.0\tB\tD}.
     *
     * @param value rank, optionally followed by tab-separated link names
     * @return the parsed node
     * @throws IOException if the value contains no fields at all
     */
    public static Node fromMR(String value) throws IOException {
        String[] parts = StringUtils.splitPreserveAllTokens(value, fieldSeparator);

        if (parts.length < 1) {
            throw new IOException("Expected 1 or more parts but received" + parts.length);
        }
        Node node = new Node().setPageRank(Double.valueOf(parts[0]));
        if (parts.length > 1) {
            node.setAdjacentNodeNames(Arrays.copyOfRange(parts, 1, parts.length));
        }
        return node;
    }

    /**
     * Parses a rank and a tab-separated list of link names given as two strings.
     *
     * @param v1 the rank, e.g. {@code 1.0}
     * @param v2 the link names, e.g. {@code B\tD}
     */
    public static Node fromMR(String v1, String v2) throws IOException {
        return fromMR(v1 + fieldSeparator + v2);
    }

}

PageRankMapper
- package com.bjsxt.pg;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * PageRank mapper. For every page it forwards the page's own record (old rank
 * plus outgoing links) and sends an equal share of the rank to each linked page.
 */
public class PageRankMapper extends Mapper<Text, Text, Text, Text> {

    /**
     * @param key     page name, e.g. A
     * @param value   in the first iteration the link names ({@code B\tD}); afterwards
     *                the rank followed by the link names ({@code 0.3\tB\tD})
     * @param context sink for the structure record and the votes
     */
    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {

        // Iteration number set by the driver.
        int runCount = context.getConfiguration().getInt("runCount", 1);

        String page = key.toString();
        Node node = null;
        if (runCount == 1) {
            // First iteration: the input carries no rank yet, start from 1.0.
            node = Node.fromMR("1.0", value.toString());
        } else {
            node = Node.fromMR(value.toString());
        }
        // Pass on the old rank together with the page's links.
        context.write(new Text(page), new Text(node.toString()));

        if (node.containsAdjacentNodes()) {

            // Each linked page receives rank / number-of-links.
            double outValue = node.getPageRank() / node.getAdjacentNodeNames().length;
            for (int i = 0; i < node.getAdjacentNodeNames().length; i++) {
                String outPage = node.getAdjacentNodeNames()[i];
                // Key: the page voted for; value: the vote itself (not the page name),
                // so the reducer can parse it as a rank.
                context.write(new Text(outPage), new Text(outValue + ""));
            }
        }
    }

}

PageRankReducer
- package com.bjsxt.pg;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * PageRank reducer. A group holds two kinds of values for one page: the page's
 * own record (old rank plus outgoing links) and the votes sent by other pages.
 * The votes are summed into the new rank, which is written with the links.
 */
class PageRankReduceer extends Reducer<Text, Text, Text, Text> {

    /**
     * @param key      page name, e.g. B
     * @param iterable the page's own record and the votes it received
     * @param context  sink for the updated record; also carries the change counter
     */
    @Override
    protected void reduce(Text key, Iterable<Text> iterable, Context context) throws IOException, InterruptedException {

        double sum = 0;
        Node sourceNode = null;
        for (Text i : iterable) {
            Node node = Node.fromMR(i.toString());
            if (node.containsAdjacentNodes()) {
                // Record with links: the page itself.
                sourceNode = node;
            } else {
                // Bare rank: a vote.
                sum = sum + node.getPageRank();
            }
        }

        // The new rank is computed once, after every value of the group has been seen.
        if (sourceNode == null) {
            // NOTE(review): no record with outgoing links arrived for this page, so there
            // is nothing to update; the page is skipped rather than failing the task.
            return;
        }

        // 4.0 is the total number of pages, 0.85 the damping factor.
        double newPR = (0.15 / 4.0) + (0.85 * sum);
        System.out.println("*******new pageRank value is" + newPR);

        // Change against the previous rank, scaled by 1000 to fit an integer counter.
        double d = newPR - sourceNode.getPageRank();

        int j = (int) (d * 1000.0);
        j = Math.abs(j);
        System.out.println(j + "___________");
        context.getCounter(RunJob.MyCounter.my).increment(j);

        sourceNode.setPageRank(newPR);
        context.write(key, new Text(sourceNode.toString()));
    }

}

TF-IDF

概念

TF-IDF（term frequency–inverse document frequency）是一种用于资讯检索与资讯探勘的常用加权技术
TF-IDF是一种统计方法，用以评估一字词对于一个文件集或一个语料库中的其中一份文件的重要程度
- 字词的重要性随着它在文件中出现的次数成正比增加
- 但同时会随着它在语料库中出现的频率成反比下降
TF-IDF加权的各种形式常被搜寻引擎应用
- 作为文件与用户查询之间相关程度的度量或评级
- 除了TF-IDF以外，因特网上的搜寻引擎还会使用基于链接分析的评级方法，以确定文件在搜寻结果中出现的顺序：PR

大白话

TF

词频 (term frequency, TF) 指的是某一个给定的词语在一份给定的文件中出现的次数。这个数字通常会被归一化（分子一般小于分母区别于IDF），以防止它偏向长的文件。（同一个词语在长文件里可能会比短文件有更高的词频，而不管该词语重要与否。）
公式：$$tf_{i,j} = \frac{n_{i,j}}{\sum_k n_{k,j}}$$
公式中：
- $n_{i,j}$ 是该词在文件 $d_j$ 中的出现次数，而分母则是在文件 $d_j$ 中所有字词的出现次数之和。

逆向文件频率

逆向文件频率 (inverse document frequency, IDF) 是一个词语普遍重要性的度量。某一特定词语的IDF，可以由总文件数目除以包含该词语之文件的数目，再将得到的商取对数得到：
$$idf_i = \log \frac{|D|}{|\{j : t_i \in d_j\}|}$$
- $|D|$：语料库中的文件总数
- $|\{j : t_i \in d_j\}|$：包含词语 $t_i$ 的文件数目

TF-IDF：

某一特定文件内的高词语频率，以及该词语在整个文件集合中的低文件频率，可以产生出高权重的TF-IDF。因此，TF-IDF倾向于过滤掉常见的词语，保留重要的词语。
TFIDF的主要思想是：如果某个词或短语在一篇文章中出现的频率TF高，并且在其他文章中很少出现，则认为此词或者短语具有很好的类别区分能力，适合用来分类。

代码实现

分词器
- IKAnalyzer2012_FF.jar
FirstJob
- package com.sxt.mr.tfidf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver of the first TF-IDF stage: counts term occurrences per post and the
 * total number of posts, writing to /data/tfidf/output/weibo1.
 */
public class FirstJob {

    /**
     * Configures and runs the job; exceptions are printed, not rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // The property that selects the runner is "mapreduce.framework.name".
        conf.set("mapreduce.framework.name", "local");

        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            //job.setJarByClass(FirstJob.class);
            job.setJobName("weibo1");

            // Without an explicit mapper the identity mapper would run and emit
            // (LongWritable, Text), which does not match the declared types.
            job.setMapperClass(FirstMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setNumReduceTasks(4);
            job.setPartitionerClass(FirstPartiton.class);
            job.setCombinerClass(FirstReduce.class);
            job.setReducerClass(FirstReduce.class);

            FileInputFormat.addInputPath(job, new Path("/data/tfidf/output".replace("/output", "/input")));

            // The following stages read /data/tfidf/output/weibo1, so write there and
            // only clear that sub-directory, not the whole output tree.
            Path path = new Path("/data/tfidf/output/weibo1");
            if (fs.exists(path)) {
                fs.delete(path, true);
            }
            FileOutputFormat.setOutputPath(job, path);

            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("执行job成功");
            }

        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

}

FirstMapper
- package com.sxt.mr.tfidf;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

/*
第一个MR，计算TF和计算N（微博总数）
*/

/**
 * Mapper of the first TF-IDF stage. Input lines are {@code <post id>\t<content>}.
 * For every term the IK segmenter finds it emits ({@code term_id}, 1), and once per
 * post it emits ({@code count}, 1) so the reducer can total the number of posts.
 */
public class FirstMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * @param key     input key (not used)
     * @param value   one line: post id, tab, post text
     * @param context sink for the term and post counters
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String[] v = value.toString().trim().split("\t");

        if (v.length >= 2) {

            String id = v[0].trim();
            String content = v[1].trim();

            StringReader sr = new StringReader(content);

            // IK segmenter splits the post text into terms.
            IKSegmenter ikSegmenter = new IKSegmenter(sr, true);
            Lexeme word = null;
            while ((word = ikSegmenter.next()) != null) {
                String w = word.getLexemeText();
                // e.g. 今天_1648498435132 -> 1
                context.write(new Text(w + "_" + id), new IntWritable(1));
            }
            // One per post; the value must be 1, a default IntWritable holds 0.
            context.write(new Text("count"), new IntWritable(1));

        } else {
            System.out.println(value.toString() + "---------------");
        }
    }

}

FirstPartition
- package com.sxt.mr.tfidf;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.HashPartitioner;

/*
第一个MR自定义分区
*/

public class FirstPartiton extends HashPartitioner<Text, IntWritable> {

@Override
public int getPartition(Text key, IntWritable value, int reduceCount) {
    if (key.equals(new Text("count"))) {
        return 3;
    } else {
        return super.getPartition(key, value, reduceCount-1);
    }
}

}

FirstReduce
- package com.sxt.mr.tfidf;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
c1_001,2 c2_001,1 count,10000
*/

/**
 * Reducer (and combiner) of the first TF-IDF stage: adds up the counts of one key.
 */
public class FirstReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Writes (key, total). The total of the {@code count} key is also printed.
     *
     * @param key      {@code term_id} or {@code count}
     * @param iterable partial counts for the key
     * @param context  sink for the total
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> iterable, Context context) throws IOException, InterruptedException {

        int sum = 0;
        for (IntWritable i : iterable) {
            sum = sum + i.get();
        }
        if (key.equals(new Text("count"))) {
            System.out.println(key.toString() + "_____________" + sum);
        }
        // Emit the computed total; a default IntWritable would always write 0.
        context.write(key, new IntWritable(sum));
    }

}

LastJob
- package com.sxt.mr.tfidf;

import java.io.IOException;

/**
 * Driver of the last TF-IDF stage: reads the term counts of stage one, with the
 * post total and the document frequencies supplied as cache files, and writes
 * to /data/tfidf/output/weibo3.
 */
public class LastJob {

    /**
     * Configures and runs the job; exceptions are printed, not rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        //conf.set("mapreduce.jar","C:\\User\\root\\Desktop\\tfidf.jar");
        //conf.set("mapreduce.job.jar","C:\\User\\root\\Desktop\\tfidf.jar");

        conf.set("mapreduce.app-submission.cross-platform", "true");
        // The property that selects the runner is "mapreduce.framework.name".
        conf.set("mapreduce.framework.name", "local");

        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            //job.setJarByClass(LastJob.class);
            job.setJobName("weibo3");
            //job.setJar("C:\\\\User\\\\root\\\\Desktop\\\\tfidf.jar");

            // Cache file holding the total number of posts.
            job.addCacheFile(new Path("/data/tfidf/output/weibo1/part-r-00003").toUri());

            // Cache file holding the document frequencies.
            job.addCacheFile(new Path("/data/tfidf/output/weibo2/part-r-00000").toUri());

            // Output key and value types; the map output types default to these.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(LastMapper.class);
            job.setReducerClass(LastReduce.class);

            // HDFS directory the job reads from.
            FileInputFormat.addInputPath(job, new Path("/data/tfidf/output/weibo1"));
            Path output = new Path("/data/tfidf/output/weibo3");
            if (fs.exists(output)) {
                fs.delete(output, true);
            }
            FileOutputFormat.setOutputPath(job, output);

            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("执行job成功");
            }

        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

}

LastMapper
- package com.sxt.mr.tfidf;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;

import java.util.HashMap;
import java.util.Map;

/**
 * Mapper of the last TF-IDF stage. {@link #setup} loads the post total and the
 * document frequencies from the job's cache files into static maps.
 *
 * NOTE(review): no map() override is visible in this listing, so the class as
 * shown would run the inherited identity map — confirm against the full source.
 */
public class LastMapper extends Mapper<LongWritable, Text, Text, Text> {

    // Total number of posts, stored under the key "count".
    public static Map<String, Integer> cmap = null;
    // Document frequency per term.
    public static Map<String, Integer> df = null;

    /**
     * Runs before the first map call. Loads both lookup tables unless they are
     * already populated. Readers are closed even when parsing fails.
     *
     * @param context gives access to the cache file URIs
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        System.out.println("**************");
        if (cmap == null || cmap.size() == 0 || df == null || df.size() == 0) {

            URI[] ss = context.getCacheFiles();
            if (ss != null) {
                for (int i = 0; i < ss.length; i++) {
                    URI uri = ss[i];

                    if (uri.getPath().endsWith("part-r-00003")) {
                        // Post total: first line is expected to be "count\t<n>".
                        try (BufferedReader br = openLocalCopy(uri)) {
                            String line = br.readLine();
                            // An empty file yields null; skip instead of failing.
                            if (line != null && line.startsWith("count")) {
                                String[] ls = line.split("\t");
                                cmap = new HashMap<String, Integer>();
                                cmap.put(ls[0], Integer.parseInt(ls[1].trim()));
                            }
                        }

                    } else if (uri.getPath().endsWith("part-r-00000")) {
                        // Document frequencies: one "term\t<n>" per line.
                        df = new HashMap<String, Integer>();
                        try (BufferedReader br = openLocalCopy(uri)) {
                            String line;
                            while ((line = br.readLine()) != null) {
                                String[] ls = line.split("\t");
                                df.put(ls[0], Integer.parseInt(ls[1].trim()));
                            }
                        }
                    }
                }
            }
        }
    }

    /** Opens a cache file by its bare file name, relative to the working directory. */
    private static BufferedReader openLocalCopy(URI uri) throws IOException {
        Path path = new Path(uri.getPath());
        return new BufferedReader(new FileReader(path.getName()));
    }

}

LastReduce
- package com.sxt.mr.tfidf;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer of the last TF-IDF stage: joins all values of a key into one
 * tab-terminated string.
 */
public class LastReduce extends Reducer<Text, Text, Text, Text> {

    /**
     * Writes (key, v1 + "\t" + v2 + "\t" + ...); each value is followed by a tab.
     *
     * @param key      the group key
     * @param iterable the values to join
     * @param context  sink for the joined line
     */
    @Override
    protected void reduce(Text key, Iterable<Text> iterable, Context context) throws IOException, InterruptedException {
        StringBuilder joined = new StringBuilder();
        for (Text part : iterable) {
            joined.append(part.toString()).append("\t");
        }
        Text out = new Text(joined.toString());
        context.write(key, out);
    }

}

TwoJob
- package com.sxt.mr.tfidf;

/**
 * Driver of the second TF-IDF stage: derives the document frequency of each
 * term from the output of stage one and writes it to /data/tfidf/output/weibo2.
 */
public class TwoJob {

    /**
     * Configures and runs the job; exceptions are printed, not rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // The property that selects the runner is "mapreduce.framework.name".
        conf.set("mapreduce.framework.name", "local");

        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            //job.setJarByClass(TwoJob.class);
            job.setJobName("weibo2");

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(TwoMapper.class);
            job.setCombinerClass(TwoReduce.class);
            job.setReducerClass(TwoReduce.class);

            FileInputFormat.addInputPath(job, new Path("/data/tfidf/output/weibo1"));

            // Like the other stages: remove a stale output directory so a rerun does not fail.
            Path output = new Path("/data/tfidf/output/weibo2");
            if (fs.exists(output)) {
                fs.delete(output, true);
            }
            FileOutputFormat.setOutputPath(job, output);

            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("执行job成功");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

TwoMapper
- package com.sxt.mr.tfidf;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class TwoMapper extends Mapper<LongWritable, Text,Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    //获取当前mapper task的数据片段（split）
    FileSplit fs= (FileSplit) context.getInputSplit();

    if (!fs.getPath().getName().contains("part-r-00003")){

        //豆浆_1654654654654 3
        String[] v=value.toString().trim().split("\t");
        if (v.length>=2){
            String[] ss=v[0].split("_");
            if (ss.length>=2){
                String w=ss[0];
                context.write(new Text(w),new IntWritable());
            }
        }else {
            System.out.println(value.toString()+"------------");
        }



    }



}

}

TwoReduce
- package com.sxt.mr.tfidf;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Sums the integer counts of each word. Used both as combiner and as reducer, which is
 * safe because integer addition is associative and commutative.
 */
public class TwoReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Reused output value; {@code context.write} serializes it immediately. */
    private final IntWritable total = new IntWritable();

    /**
     * Writes {@code (key, sum of values)}.
     *
     * @param key     the word
     * @param arg1    partial counts for the word
     * @param context task context used to emit the total
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> arg1, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable count : arg1) {
            sum += count.get();
        }

        // Emit the computed sum; previously an empty IntWritable (0) was written and the
        // sum was thrown away.
        total.set(sum);
        context.write(key, total); // word -> number of weibo posts containing it
    }
}

ItemCF

（基于物品的协同过滤）

思考

推荐系统

协同过滤（Collaborative Filtering）算法
- UserCF基于用户的协同过滤，通过不同用户对物品的评分来评测用户之间的相似性，基于用户之间的相似性做出推荐。简单来讲就是：给用户推荐和他兴趣相似的其他用户喜欢的物品。
- 同现矩阵
代码实现
- StartRun
  - package com.bjsxt.itemcf;

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;

/**
 * Entry point of the item-based collaborative filtering (ItemCF) pipeline.
 *
 * <p>Builds the shared configuration and the input/output path table, then runs the
 * pipeline steps in order. Steps 3 to 6 are currently disabled.
 */
public class StartRun {

    /**
     * Score assigned to each user action name. Read by the pipeline steps to turn an
     * action such as {@code "click"} into a numeric preference.
     */
    public static Map<String, Integer> R = new HashMap<String, Integer>();
    static {
        R.put("click", 1);
        R.put("collect", 2);
        R.put("cart", 3);
        R.put("alipay", 4);
    }

    /**
     * Runs the enabled pipeline steps, stopping at the first step that fails.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Key was misspelled ("corss-paltform"), so the setting was silently ignored.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.framework.name", "local");

        // Input and output directories of every MapReduce step, keyed by step name.
        Map<String, String> paths = new HashMap<String, String>();
        paths.put("Step1Input", "/data/itemcf/input/");
        paths.put("Step1Output", "/data/itemcf/output/step1");
        paths.put("Step2Input", paths.get("Step1Output"));
        paths.put("Step2Output", "/data/itemcf/output/step2");
        paths.put("Step3Input", paths.get("Step2Output"));
        paths.put("Step3Output", "/data/itemcf/output/step3");
        paths.put("Step4Input1", paths.get("Step2Output"));
        paths.put("Step4Input2", paths.get("Step3Output"));
        paths.put("Step4Output", "/data/itemcf/output/step4");
        paths.put("Step5Input", paths.get("Step4Output"));
        paths.put("Step5Output", "/data/itemcf/output/step5");
        paths.put("Step6Input", paths.get("Step5Output"));
        paths.put("Step6Output", "/data/itemcf/output/step6");

        // Each step consumes the previous step's output, so do not continue after a failure.
        if (!Step1.run(conf, paths)) {
            System.err.println("step1 failed, aborting");
            return;
        }
        if (!Step2.run(conf, paths)) {
            System.err.println("step2 failed, aborting");
            return;
        }

        // Remaining steps are intentionally left disabled, as in the original notes.
        // Step3.run(conf, paths);
        // Step4.run(conf, paths);
        // Step5.run(conf, paths);
        // Step6.run(conf, paths);
    }
}

- Step1

	- package com.bjsxt.itemcf;

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**

去重复
@author root

*/
public class Step1 {

public static boolean run(Configuration config,Map<String, String> paths){
	try {
		FileSystem fs =FileSystem.get(config);
		Job job =Job.getInstance(config);
		job.setJobName("step1");
		job.setJarByClass(Step1.class);
		job.setMapperClass(Step1_Mapper.class);
		job.setReducerClass(Step1_Reducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);
		
		
		
		FileInputFormat.addInputPath(job, new Path(paths.get("Step1Input")));
		Path outpath=new Path(paths.get("Step1Output"));
		if(fs.exists(outpath)){
			fs.delete(outpath,true);
		}
		FileOutputFormat.setOutputPath(job, outpath);
		
		boolean f= job.waitForCompletion(true);
		return f;
	} catch (Exception e) {
		e.printStackTrace();
	}
	return false;
}

 static class Step1_Mapper extends Mapper<LongWritable, Text, Text, NullWritable>{

	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		if(key.get()!=0){
			context.write(value, NullWritable.get());
		}
	}
}

 
 static class Step1_Reducer extends Reducer<Text, IntWritable, Text, NullWritable>{

		protected void reduce(Text key, Iterable<IntWritable> i, Context context)
				throws IOException, InterruptedException {
			context.write(key,NullWritable.get());
		}
	}

}

- Step2

	- package com.bjsxt.itemcf;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step 2: group by user and build the user-to-item preference score matrix.
 *
 * <p>Sample output:
 * <pre>
 * u13 i160:1,
 * u14 i25:1,i223:1,
 * u16 i252:1,
 * u21 i266:1,
 * u24 i64:1,i218:1,i185:1,
 * u26 i276:1,i201:1,i348:1,i321:1,i136:1,
 * </pre>
 *
 * @author root
 */
public class Step2 {

    /**
     * Configures and runs the scoring job.
     *
     * @param config Hadoop configuration to run with
     * @param paths  path table; reads {@code Step2Input} and {@code Step2Output}
     * @return {@code true} if the job completed successfully, {@code false} if it failed
     *         or an exception was thrown while setting it up or running it
     */
    public static boolean run(Configuration config, Map<String, String> paths) {
        try {
            FileSystem fs = FileSystem.get(config);
            Job job = Job.getInstance(config);
            job.setJobName("step2");
            job.setJarByClass(StartRun.class);
            job.setMapperClass(Step2_Mapper.class);
            job.setReducerClass(Step2_Reducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            FileInputFormat.addInputPath(job, new Path(paths.get("Step2Input")));

            // The output directory must not exist when the job starts.
            Path outpath = new Path(paths.get("Step2Output"));
            if (fs.exists(outpath)) {
                fs.delete(outpath, true);
            }
            FileOutputFormat.setOutputPath(job, outpath);

            return job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return false;
    }

    /**
     * Turns an {@code item,user,action,...} line into {@code (user, "item:score")}, where the
     * score is looked up in {@code StartRun.R}.
     */
    static class Step2_Mapper extends Mapper<LongWritable, Text, Text, Text> {

        private final Text outKey = new Text();
        private final Text outValue = new Text();

        // Using user+item together as the output key would be more efficient.
        // Input example: i161,u2625,click,2014/9/18 15:03
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] tokens = value.toString().split(",");
            if (tokens.length < 3) {
                // Too few fields to hold item, user and action: skip instead of failing
                // the whole task with an ArrayIndexOutOfBoundsException.
                context.getCounter("Step2", "MALFORMED_LINE").increment(1);
                return;
            }
            String item = tokens[0];
            String user = tokens[1];
            String action = tokens[2];

            Integer score = StartRun.R.get(action);
            if (score == null) {
                // Unknown action name: previously a NullPointerException.
                context.getCounter("Step2", "UNKNOWN_ACTION").increment(1);
                return;
            }

            outKey.set(user);
            outValue.set(item + ":" + score.intValue());
            context.write(outKey, outValue);
            // Output example: u2625    i161:1
        }
    }

    /** Sums the scores per item for one user and emits them as {@code item:score,} pairs. */
    static class Step2_Reducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> i, Context context)
                throws IOException, InterruptedException {
            Map<String, Integer> totals = new HashMap<String, Integer>();
            // Example for key u2625:
            //   i161:1
            //   i161:2
            //   i161:4
            //   i162:3
            //   i161:4
            for (Text value : i) {
                String[] itemAndScore = value.toString().split(":");
                String item = itemAndScore[0];
                int score = Integer.parseInt(itemAndScore[1]);
                Integer previous = totals.get(item);
                totals.put(item, (previous == null ? 0 : previous.intValue()) + score);
            }

            // Every pair is followed by a comma, so the line ends with a trailing comma.
            StringBuilder line = new StringBuilder();
            for (Entry<String, Integer> entry : totals.entrySet()) {
                line.append(entry.getKey()).append(":")
                        .append(entry.getValue().intValue()).append(",");
            }

            context.write(key, new Text(line.toString()));
        }
    }

}

- Step3

	- package com.bjsxt.itemcf;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;
/**

对物品组合列表进行计数，建立物品的同现矩阵
i100:i100 3
i100:i105 1
i100:i106 1
i100:i109 1
i100:i114 1
i100:i124 1
@author root

*/
public class Step3 {
private final static Text K = new Text();
private final static IntWritable V = new IntWritable(1);

public static boolean run(Configuration config,Map<String, String> paths){
	try {
		FileSystem fs =FileSystem.get(config);
		Job job =Job.getInstance(config);
		job.setJobName("step3");
		job.setJarByClass(StartRun.class);
		job.setMapperClass(Step3_Mapper.class);
		job.setReducerClass(Step3_Reducer.class);
		job.setCombinerClass(Step3_Reducer.class);

//
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

		FileInputFormat.addInputPath(job, new Path(paths.get("Step3Input")));
		Path outpath=new Path(paths.get("Step3Output"));
		if(fs.exists(outpath)){
			fs.delete(outpath,true);
		}
		FileOutputFormat.setOutputPath(job, outpath);
		
		boolean f= job.waitForCompletion(true);
		return f;
	} catch (Exception e) {
		e.printStackTrace();
	}
	return false;
}

 static class Step3_Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{

	protected void map(LongWritable key, Text value,
			Context context)
			throws IOException, InterruptedException {
		
		//u3244	i469:1,i498:1,i154:1,i73:1,i162:1,
		String[]  tokens=value.toString().split("\t");
		String[] items =tokens[1].split(",");
		for (int i = 0; i < items.length; i++) {
			String itemA = items[i].split(":")[0];
			for (int j = 0; j < items.length; j++) {
				String itemB = items[j].split(":")[0];
				K.set(itemA+":"+itemB);
				context.write(K, V);
			}
		}
		
	}
}

 
 static class Step3_Reducer extends Reducer<Text, IntWritable, Text, IntWritable>{

		protected void reduce(Text key, Iterable<IntWritable> i,
				Context context)
				throws IOException, InterruptedException {
			int sum =0;
			for(IntWritable v :i ){
				sum =sum+v.get();
			}
			V.set(sum);
			context.write(key, V);
		}
	}

}

- Step4

	- package com.bjsxt.itemcf;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.log4j.Logger;

/**
*

把同现矩阵和得分矩阵相乘
@author root

*/
public class Step4 {

public static boolean run(Configuration config, Map<String, String> paths) {
	try {
		FileSystem fs = FileSystem.get(config);
		Job job = Job.getInstance(config);
		job.setJobName("step4");
		job.setJarByClass(StartRun.class);
		job.setMapperClass(Step4_Mapper.class);
		job.setReducerClass(Step4_Reducer.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		// FileInputFormat.addInputPath(job, new
		// Path(paths.get("Step4Input")));
		FileInputFormat.setInputPaths(job,
				new Path[] { new Path(paths.get("Step4Input1")),
						new Path(paths.get("Step4Input2")) });
		Path outpath = new Path(paths.get("Step4Output"));
		if (fs.exists(outpath)) {
			fs.delete(outpath, true);
		}
		FileOutputFormat.setOutputPath(job, outpath);

		boolean f = job.waitForCompletion(true);
		return f;
	} catch (Exception e) {
		e.printStackTrace();
	}
	return false;
}

static class Step4_Mapper extends Mapper<LongWritable, Text, Text, Text> {
	private String flag;// A同现矩阵 or B得分矩阵

	//每个maptask，初始化时调用一次
	protected void setup(Context context) throws IOException,
			InterruptedException {
		FileSplit split = (FileSplit) context.getInputSplit();
		flag = split.getPath().getParent().getName();// 判断读的数据集

		System.out.println(flag + "**********************");
	}

	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String[] tokens = Pattern.compile("[\t,]").split(value.toString());
		if (flag.equals("step3")) {// 同现矩阵
			//i100:i125	1
			String[] v1 = tokens[0].split(":");
			String itemID1 = v1[0];
			String itemID2 = v1[1];
			String num = tokens[1];
			//A:B 3
			//B:A 3
			Text k = new Text(itemID1);// 以前一个物品为key 比如i100
			Text v = new Text("A:" + itemID2 + "," + num);// A:i109,1

			context.write(k, v);

		} else if (flag.equals("step2")) {// 用户对物品喜爱得分矩阵
			
			//u26	i276:1,i201:1,i348:1,i321:1,i136:1,
			String userID = tokens[0];
			for (int i = 1; i < tokens.length; i++) {
				String[] vector = tokens[i].split(":");
				String itemID = vector[0];// 物品id
				String pref = vector[1];// 喜爱分数

				Text k = new Text(itemID); // 以物品为key 比如：i100
				Text v = new Text("B:" + userID + "," + pref); // B:u401,2

				context.write(k, v);
			}
		}
	}
}

static class Step4_Reducer extends Reducer<Text, Text, Text, Text> {
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		// A同现矩阵 or B得分矩阵
		//某一个物品，针对它和其他所有物品的同现次数，都在mapA集合中
		Map<String, Integer> mapA = new HashMap<String, Integer>();// 和该物品（key中的itemID）同现的其他物品的同现集合// 。其他物品ID为map的key，同现数字为值
		Map<String, Integer> mapB = new HashMap<String, Integer>();// 该物品（key中的itemID），所有用户的推荐权重分数。

		
		//A  > reduce   相同的KEY为一组
		//value:2类:
		//物品同现A:b:2  c:4   d:8
		//评分数据B:u1:18  u2:33   u3:22
		for (Text line : values) {
			String val = line.toString();
			if (val.startsWith("A:")) {// 表示物品同现数字
				// A:i109,1
				String[] kv = Pattern.compile("[\t,]").split(
						val.substring(2));
				try {
					mapA.put(kv[0], Integer.parseInt(kv[1]));
									//物品同现A:b:2  c:4   d:8
					//基于 A,物品同现次数
				} catch (Exception e) {
					e.printStackTrace();
				}

			} else if (val.startsWith("B:")) {
				 // B:u401,2
				String[] kv = Pattern.compile("[\t,]").split(
						val.substring(2));
						//评分数据B:u1:18  u2:33   u3:22		
				try {
					mapB.put(kv[0], Integer.parseInt(kv[1]));
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}

		double result = 0;
		Iterator<String> iter = mapA.keySet().iterator();//同现
		while (iter.hasNext()) {
			String mapk = iter.next();// itemID

			int num = mapA.get(mapk).intValue();  //对于A的同现次数
			
			
			Iterator<String> iterb = mapB.keySet().iterator();//评分
			while (iterb.hasNext()) {
				String mapkb = iterb.next();// userID
				int pref = mapB.get(mapkb).intValue();
				result = num * pref;// 矩阵乘法相乘计算

				Text k = new Text(mapkb);  //用户ID为key
				Text v = new Text(mapk + "," + result);//基于A物品,其他物品的同现与评分(所有用户对A物品)乘机
				context.write(k, v);
			}
		}
	}
}

}

- Step5

	- package com.bjsxt.itemcf;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.log4j.Logger;

/**
*

把相乘之后的矩阵相加获得结果矩阵
@author root

*/
public class Step5 {
private final static Text K = new Text();
private final static Text V = new Text();

public static boolean run(Configuration config, Map<String, String> paths) {
	try {
		FileSystem fs = FileSystem.get(config);
		Job job = Job.getInstance(config);
		job.setJobName("step5");
		job.setJarByClass(StartRun.class);
		job.setMapperClass(Step5_Mapper.class);
		job.setReducerClass(Step5_Reducer.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		FileInputFormat
				.addInputPath(job, new Path(paths.get("Step5Input")));
		Path outpath = new Path(paths.get("Step5Output"));
		if (fs.exists(outpath)) {
			fs.delete(outpath, true);
		}
		FileOutputFormat.setOutputPath(job, outpath);

		boolean f = job.waitForCompletion(true);
		return f;
	} catch (Exception e) {
		e.printStackTrace();
	}
	return false;
}

static class Step5_Mapper extends Mapper<LongWritable, Text, Text, Text> {

	/**
	 * 原封不动输出
	 */
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String[] tokens = Pattern.compile("[\t,]").split(value.toString());
		Text k = new Text(tokens[0]);// 用户为key
		Text v = new Text(tokens[1] + "," + tokens[2]);
		context.write(k, v);
	}
}

static class Step5_Reducer extends Reducer<Text, Text, Text, Text> {
	protected void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		Map<String, Double> map = new HashMap<String, Double>();// 结果

		//u3  >  reduce
		//101, 11
		//101, 12
		//101, 8
		//102, 12
		//102, 32
	
		for (Text line : values) {// i9,4.0
			String[] tokens = line.toString().split(",");
			String itemID = tokens[0];
			Double score = Double.parseDouble(tokens[1]);

			if (map.containsKey(itemID)) {
				map.put(itemID, map.get(itemID) + score);// 矩阵乘法求和计算
			} else {
				map.put(itemID, score);
			}
		}

		Iterator<String> iter = map.keySet().iterator();
		while (iter.hasNext()) {
			String itemID = iter.next();
			double score = map.get(itemID);
			Text v = new Text(itemID + "," + score);
			context.write(key, v);
		}
	}

}

}

- Step6

	- package com.bjsxt.itemcf;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
*

按照推荐得分降序排序，每个用户列出10个推荐物品
@author root

*/
public class Step6 {
private final static Text K = new Text();
private final static Text V = new Text();

public static boolean run(Configuration config, Map<String, String> paths) {
	try {
		FileSystem fs = FileSystem.get(config);
		Job job = Job.getInstance(config);
		job.setJobName("step6");
		job.setJarByClass(StartRun.class);
		job.setMapperClass(Step6_Mapper.class);
		job.setReducerClass(Step6_Reducer.class);
		job.setSortComparatorClass(NumSort.class);
		job.setGroupingComparatorClass(UserGroup.class);
		job.setMapOutputKeyClass(PairWritable.class);
		job.setMapOutputValueClass(Text.class);

		FileInputFormat
				.addInputPath(job, new Path(paths.get("Step6Input")));
		Path outpath = new Path(paths.get("Step6Output"));
		if (fs.exists(outpath)) {
			fs.delete(outpath, true);
		}
		FileOutputFormat.setOutputPath(job, outpath);

		boolean f = job.waitForCompletion(true);
		return f;
	} catch (Exception e) {
		e.printStackTrace();
	}
	return false;
}

static class Step6_Mapper extends Mapper<LongWritable, Text, PairWritable, Text> {

	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String[] tokens = Pattern.compile("[\t,]").split(value.toString());
		String u = tokens[0];
		String item = tokens[1];
		String num = tokens[2];
		PairWritable k =new PairWritable();
		k.setUid(u);
		k.setNum(Double.parseDouble(num));
		V.set(item+":"+num);
		context.write(k, V);

	}
}

static class Step6_Reducer extends Reducer<PairWritable, Text, Text, Text> {
	protected void reduce(PairWritable key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		int i=0;
		StringBuffer sb =new StringBuffer();
		for(Text v :values){
			if(i==10)
				break;
			sb.append(v.toString()+",");
			i++;
		}
		K.set(key.getUid());
		V.set(sb.toString());
		context.write(K, V);
	}

}

static class PairWritable implements WritableComparable<PairWritable>{

// private String itemId;
private String uid;
private double num;
public void write(DataOutput out) throws IOException {
out.writeUTF(uid);
// out.writeUTF(itemId);
out.writeDouble(num);
}

	public void readFields(DataInput in) throws IOException {
		this.uid=in.readUTF();

// this.itemId=in.readUTF();
this.num=in.readDouble();
}

	public int compareTo(PairWritable o) {
		int r =this.uid.compareTo(o.getUid());
		if(r==0){
			return Double.compare(this.num, o.getNum());
		}
		return r;
	}

	public String getUid() {
		return uid;
	}

	public void setUid(String uid) {
		this.uid = uid;
	}

	public double getNum() {
		return num;
	}

	public void setNum(double num) {
		this.num = num;
	}
	
}

static class NumSort extends WritableComparator{
	public NumSort(){
		super(PairWritable.class,true);
	}
	
	public int compare(WritableComparable a, WritableComparable b) {
		PairWritable o1 =(PairWritable) a;
		PairWritable o2 =(PairWritable) b;
		
		int r =o1.getUid().compareTo(o2.getUid());
		if(r==0){
			return -Double.compare(o1.getNum(), o2.getNum());
		}
		return r;
	}
}

static class UserGroup extends WritableComparator{
	public UserGroup(){
		super(PairWritable.class,true);
	}
	
	public int compare(WritableComparable a, WritableComparable b) {
		PairWritable o1 =(PairWritable) a;
		PairWritable o2 =(PairWritable) b;
		return o1.getUid().compareTo(o2.getUid());
	}
}

}

09-Hadoop编程

Hadoop编程

实例

/opt/sxt/hadoop-2.6.5/share/hadoop/mapreduce/

jar包

准备

命令

讲解

WordCount案例

启动

WordCount

源码分析

Mapreduce案例

案例一

案例二

PageRank

什么是pagerank

计算环境

算法原理（1）

算法原理（2）

算法原理（3）

代码实现

TF-IDF

概念

大白话

TF

逆向文件频率

TF-IDF：

代码实现

ItemCF

（基于物品的协同过滤）

思考

推荐系统