Hadoop編程
實例
/opt/sxt/hadoop-2.6.5/share/hadoop/mapreduce/
jar包
- hadoop-mapreduce-examples-2.6.5.jar
準備
- for i in `seq 100000`;do echo "hello sxt $i" >> test.txt;done
- hdfs dfs -mkdir -p /user/root
- hdfs dfs -ls -R /
- hdfs dfs -D dfs.blocksize=1048576 -put ./test.txt /user/root
命令
-
hadoop jar hadoop-mapreduce-examples-2.6.5.jar wordcount /input /output
- wordcount爲主程序
- *input:是hdfs文件系統中數據所在的目錄
- *output:是hdfs中不存在的目錄,mr程序運行的結果會輸出到該目錄(輸出路徑不允許放東西)
講解
-
以下是輸出目錄的內容:
-
-rw-r--r-- 3 root supergroup 0 2017-07-02 02:49 /mr/test/output/_SUCCESS
- /_SUCCESS:是信號/標誌文件
-
-rw-r--r-- 3 root supergroup 49 2017-07-02 02:49 /mr/test/output/part-r-00000
- /part-r-00000:是reduce輸出的數據文件,r:reduce的意思,00000是對應的reduce
-
多個reduce會有多個數據文件
WordCount案例
啓動
- zkServer.sh start
- start-dfs.sh
- yarn-daemon.sh start resourcemanager
- start-yarn.sh
WordCount
-
MyWC
- package com.sxt.mr.wc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyWC {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
//Create a new job
Job job = Job.getInstance(conf);
//要打成jar包的入口函數
job.setJarByClass(MyWC.class);
//Specify various job-specific parameters
//定義job名稱
job.setJobName("myjob");
//定義輸入路徑
Path inPath = new Path("/user/root/test.txt");
FileInputFormat.addInputPath(job, inPath);
//定義輸出路徑(不允許放東西)
Path outPath = new Path("/output/wordcount");
//有則刪除
if (outPath.getFileSystem(conf).exists(outPath)) {
outPath.getFileSystem(conf).delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setReducerClass(MyReducer.class);
//Submit the job,then poll for progress until the job is complete
//提交job作業
job.waitForCompletion(true);
}
}
-
MyMapper
- package com.sxt.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.StringTokenizer;
/**
 * Word-count mapper: splits every input line into tokens with {@link StringTokenizer}
 * (default delimiters) and emits {@code (token, 1)} for each token.
 */
public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
    /** Constant count of one, written for every token. */
    private static final IntWritable ONE = new IntWritable(1);
    /** Output key holder that is overwritten for each token. */
    private Text token = new Text();

    /**
     * Emits one {@code (token, 1)} pair per token found in {@code value}.
     *
     * @param key     input key (not used)
     * @param value   one line of input text
     * @param context sink for the emitted pairs
     */
    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        final String line = value.toString();
        for (StringTokenizer tokens = new StringTokenizer(line); tokens.hasMoreTokens(); ) {
            String next = tokens.nextToken();
            token.set(next);
            context.write(token, ONE);
        }
    }
}
-
MyReducer
- package com.sxt.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Word-count reducer: adds up all counts received for a word and emits {@code (word, total)}.
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /** Output value holder that is overwritten for each key. */
    private IntWritable total = new IntWritable();

    /**
     * Sums the values of one key and writes the sum.
     *
     * @param key     the word
     * @param values  the counts emitted for that word
     * @param context sink for the {@code (word, total)} pair
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int accumulated = 0;
        for (IntWritable count : values) {
            accumulated = accumulated + count.get();
        }
        total.set(accumulated);
        context.write(key, total);
    }
}
源碼分析
Mapreduce案例
案例一
-
MyTQ
- package com.bjsxt.tq;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
*1949-10-01 14:21:02 34c
*
-
*/
public class MyTQ {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {//1.配置 Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(MyTQ.class); job.setJobName("tq"); //2.設置輸入輸出路徑 Path inPath = new Path("/tq/input"); FileInputFormat.addInputPath(job, inPath); Path outPath = new Path("/tq/output"); if (outPath.getFileSystem(conf).exists(outPath)) { outPath.getFileSystem(conf).delete(outPath, true); } FileOutputFormat.setOutputPath(job, outPath); //3.設置Mapper job.setMapperClass(Tmapper.class);//自定義傳輸key job.setMapOutputKeyClass(Tq.class); job.setOutputValueClass(IntWritable.class); //4.自定義排序比較器 job.setSortComparatorClass(TSortComparator.class); //5.自定義分區器 job.setPartitionerClass(TPartitoner.class); //6. 自定義組排序器 job.setGroupingComparatorClass(TGroupComparator.class); //7.設置reducetask數量 job.setNumReduceTasks(2); //8.設置reducer job.setReducerClass(Treducer.class); //9.打印過程 job.waitForCompletion(true);
}
}
-
TGroupComparator
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator for {@link Tq} keys: two keys are considered equal when their year and
 * month match; day and temperature are ignored.
 */
public class TGroupComparator extends WritableComparator {
    Tq t1 = null;
    Tq t2 = null;

    /** Registers {@link Tq} as the key class and asks the base class to create key instances. */
    public TGroupComparator() {
        super(Tq.class, true);
    }

    /**
     * Orders by year, then by month.
     *
     * @param a first key, expected to be a {@link Tq}
     * @param b second key, expected to be a {@link Tq}
     * @return negative, zero or positive following the usual comparator contract
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        t1 = (Tq) a;
        t2 = (Tq) b;
        final int byYear = Integer.compare(t1.getYear(), t2.getYear());
        if (byYear != 0) {
            return byYear;
        }
        return Integer.compare(t1.getMonth(), t2.getMonth());
    }
}
-
Tmapper
- package com.bjsxt.tq;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
public class Tmapper extends Mapper<LongWritable, Text, Tq, IntWritable> {
Tq tkey = new Tq();
IntWritable tval = new IntWritable();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
//獲得時間、溫度數組
String[] words = StringUtils.split(value.toString(), '\t');
String pattern = "yyyy-MM-dd";
SimpleDateFormat sdf = new SimpleDateFormat(pattern);
try {
Date date = sdf.parse(words[0]);
Calendar cal = Calendar.getInstance();
cal.setTime(date);
tkey.setYear(cal.get(Calendar.YEAR));
tkey.setMonth(cal.get(Calendar.MONTH) + 1);
tkey.setDay(cal.get(Calendar.DAY_OF_MONTH));
int wd = Integer.parseInt(words[1].substring(0, words[1].lastIndexOf("c")));
tkey.setWd(wd);
tval.set(wd);
context.write(tkey, tval);
} catch (ParseException e) {
e.printStackTrace();
}
}
}
-
TPartitoner
- package com.bjsxt.tq;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitions {@link Tq} keys by year: the partition is the key's year modulo the number of
 * partitions.
 */
public class TPartitoner extends Partitioner<Tq, IntWritable> {
    /**
     * @param key           the weather key
     * @param value         the temperature (not used)
     * @param numPartitions number of partitions supplied by the framework
     * @return {@code key.getYear() % numPartitions}
     */
    @Override
    public int getPartition(Tq key, IntWritable value, int numPartitions) {
        final int year = key.getYear();
        return year % numPartitions;
    }
}
-
Tq
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Composite weather key holding year, month, day and temperature ({@code wd}).
 *
 * <p>The interface is parameterized with {@code Tq} so that {@link #compareTo(Tq)} actually
 * implements {@code Comparable}; with the raw interface the {@code @Override} on
 * {@code compareTo(Tq)} does not compile.
 */
public class Tq implements WritableComparable<Tq> {
    private int year;
    private int month;
    private int day;
    private int wd;

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public int getDay() {
        return day;
    }

    public void setDay(int day) {
        this.day = day;
    }

    /** @return the temperature */
    public int getWd() {
        return wd;
    }

    public void setWd(int wd) {
        this.wd = wd;
    }

    /** @return {@code year-month-day}; the temperature is not included */
    @Override
    public String toString() {
        return year + "-" + month + "-" + day;
    }

    /** Serializes the four fields as ints in the order year, month, day, temperature. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(this.getYear());
        dataOutput.writeInt(this.getMonth());
        dataOutput.writeInt(this.getDay());
        dataOutput.writeInt(this.getWd());
    }

    /** Reads the four fields in the same order {@link #write(DataOutput)} emits them. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.setYear(dataInput.readInt());
        this.setMonth(dataInput.readInt());
        this.setDay(dataInput.readInt());
        this.setWd(dataInput.readInt());
    }

    /**
     * Natural order: year, then month, then day, all ascending. Temperature is ignored.
     *
     * @param o the key to compare with
     * @return negative, zero or positive following the {@code Comparable} contract
     */
    @Override
    public int compareTo(Tq o) {
        int c1 = Integer.compare(this.getYear(), o.getYear());
        if (c1 != 0) {
            return c1;
        }
        int c2 = Integer.compare(this.getMonth(), o.getMonth());
        if (c2 != 0) {
            return c2;
        }
        return Integer.compare(this.getDay(), o.getDay());
    }
}
-
Treducer
- package com.bjsxt.tq;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
- 1949-10-01 34
- 1949-10-02 34
- 1949-10-03 34
- 1949-10-05 34
*/
/**
 * Weather reducer: for each group of keys it emits the first record and then the first later
 * record that falls on a different day, and stops.
 *
 * <p>The state that remembers whether the first record was written, and on which day, lives
 * outside the loop. Declared inside the loop it was reset for every value, so every record was
 * written and the "different day" branch could never fire.
 */
public class Treducer extends Reducer<Tq, IntWritable, Text, IntWritable> {
    Text tkey = new Text();
    IntWritable tval = new IntWritable();

    /**
     * @param key     the group key; {@code key.getDay()} is re-read on every iteration
     *                (NOTE(review): this relies on the framework refreshing the key object as it
     *                advances through the grouped values — confirm against the Hadoop docs)
     * @param values  the temperatures of the group
     * @param context sink for the {@code (date, temperature)} pairs
     */
    @Override
    protected void reduce(Tq key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        boolean firstWritten = false;
        int firstDay = 0;
        for (IntWritable val : values) {
            if (!firstWritten) {
                // First record of the group.
                tkey.set(key.toString());
                tval.set(val.get());
                context.write(tkey, tval);
                firstWritten = true;
                firstDay = key.getDay();
            } else if (firstDay != key.getDay()) {
                // First subsequent record on another day: emit it and finish the group.
                tkey.set(key.toString());
                tval.set(val.get());
                context.write(tkey, tval);
                return;
            }
        }
    }
}
-
TSortComparator
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/*
*
- 實現天氣年月正序,溫度倒序
- */
/**
 * Sort comparator for {@link Tq} keys: year and month ascending, temperature descending.
 *
 * <p>The temperature comparison is reversed so that, within a month, the hottest record sorts
 * first; comparing it in ascending order contradicts the stated ordering.
 */
public class TSortComparator extends WritableComparator {
    Tq t1 = null;
    Tq t2 = null;

    /** Registers {@link Tq} as the key class and asks the base class to create key instances. */
    public TSortComparator() {
        super(Tq.class, true);
    }

    /**
     * @param a first key, expected to be a {@link Tq}
     * @param b second key, expected to be a {@link Tq}
     * @return ordering by year (asc), month (asc), temperature (desc)
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        t1 = (Tq) a;
        t2 = (Tq) b;
        int c1 = Integer.compare(t1.getYear(), t2.getYear());
        if (c1 != 0) {
            return c1;
        }
        int c2 = Integer.compare(t1.getMonth(), t2.getMonth());
        if (c2 != 0) {
            return c2;
        }
        // Arguments swapped on purpose: higher temperature first.
        return Integer.compare(t2.getWd(), t1.getWd());
    }
}
案例二
-
列表差集
-
思路
-
MyFD
- package com.bjsxt.fd;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyFD {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MyFD.class);
job.setJobName("friend");
Path inPath = new Path("/fd/input");
FileInputFormat.addInputPath(job, inPath);
Path outPath = new Path("/fd/output");
if (outPath.getFileSystem(conf).exists(outPath)) {
outPath.getFileSystem(conf).delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
job.setMapperClass(FMapper.class);
job.setReducerClass(FRudcer.class);
job.waitForCompletion(true);
}
}
-
FMapper
- package com.bjsxt.fd;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
import java.io.IOException;
/**
 * Friend-recommendation mapper. Each input line is a space-separated list whose first token is
 * a person and whose remaining tokens are that person's direct friends, for example
 * {@code world tom hello hadoop cat}.
 *
 * <p>For every direct friendship it emits {@code (pair, 0)}, and for every two friends of the
 * same person (potential friends) it emits {@code (pair, 1)}.
 */
public class FMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    Text tkey = new Text();
    IntWritable tval = new IntWritable();

    /**
     * @param key     input key (not used)
     * @param value   one line: person followed by that person's friends
     * @param context sink for the emitted pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // e.g. "world tom hello hadoop cat"
        String[] words = StringUtils.split(value.toString(), ' ');
        for (int i = 1; i < words.length; i++) {
            // Direct friendship between the line owner and friend i: value 0.
            tkey.set(getFd(words[0], words[i]));
            tval.set(0);
            context.write(tkey, tval);
            for (int j = i + 1; j < words.length; j++) {
                // Friends i and j share the line owner as a common friend: value 1.
                tkey.set(getFd(words[i], words[j]));
                tval.set(1);
                context.write(tkey, tval);
            }
        }
    }

    /**
     * Builds an order-independent pair key {@code smaller:larger}.
     *
     * <p>Both branches must contain both names; without the explicit {@code + a} the
     * conditional expression returned just {@code b + ":"} whenever {@code a} sorted after
     * {@code b}.
     */
    private String getFd(String a, String b) {
        return a.compareTo(b) > 0 ? b + ":" + a : a + ":" + b;
    }
}
-
FReducer
- package com.bjsxt.fd;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
import java.io.IOException;
/**
 * Friend-recommendation mapper. Each input line is a space-separated list whose first token is
 * a person and whose remaining tokens are that person's direct friends, for example
 * {@code world tom hello hadoop cat}.
 *
 * <p>For every direct friendship it emits {@code (pair, 0)}, and for every two friends of the
 * same person (potential friends) it emits {@code (pair, 1)}.
 *
 * <p>NOTE(review): this listing sits under the "FReducer" heading but declares {@code FMapper}
 * again; the reducer source appears to be missing from these notes — confirm.
 */
public class FMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    Text tkey = new Text();
    IntWritable tval = new IntWritable();

    /**
     * @param key     input key (not used)
     * @param value   one line: person followed by that person's friends
     * @param context sink for the emitted pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // e.g. "world tom hello hadoop cat"
        String[] words = StringUtils.split(value.toString(), ' ');
        for (int i = 1; i < words.length; i++) {
            // Direct friendship between the line owner and friend i: value 0.
            tkey.set(getFd(words[0], words[i]));
            tval.set(0);
            context.write(tkey, tval);
            for (int j = i + 1; j < words.length; j++) {
                // Friends i and j share the line owner as a common friend: value 1.
                tkey.set(getFd(words[i], words[j]));
                tval.set(1);
                context.write(tkey, tval);
            }
        }
    }

    /**
     * Builds an order-independent pair key {@code smaller:larger}.
     *
     * <p>Both branches must contain both names; without the explicit {@code + a} the
     * conditional expression returned just {@code b + ":"} whenever {@code a} sorted after
     * {@code b}.
     */
    private String getFd(String a, String b) {
        return a.compareTo(b) > 0 ? b + ":" + a : a + ":" + b;
    }
}
PageRank
什麼是pagerank
- PageRank是Google提出的算法,用於衡量特定網頁相對於搜索引擎索引中的其他網頁而言的重要程度。
- 是Google創始人拉里·佩奇和謝爾蓋·布林於1997年創造的
- PageRank實現了將鏈接價值概念作爲排名因素
計算環境
- Hadoop-2.5.2
- 四臺主機
- 兩臺NN的HA
- 兩臺RM的HA
- 離線計算框架MapReduce
算法原理(1)
-
思考超鏈接在互聯網中的作用?
-
入鏈 ==== 給該頁面的投票
- PageRank讓鏈接來“投票“,到一個頁面的超鏈接相當於對該頁投一票
-
入鏈數量
- 如果一個頁面節點接收到的其他網頁指向的入鏈數量越多,那麼這個頁面越重要
-
入鏈質量
- 指向頁面A的入鏈質量不同,質量高的頁面會通過鏈接向其他頁面傳遞更多的權重。所以越是質量高的頁面指向頁面A,則頁面A越重要
算法原理(2)
-
初始值
- Google的每個頁面設置相同的頁面價值,即PR值
- pagerank算法給每個頁面的PR初始值爲1。
-
迭代計算(收斂)
-
Google不斷的重複計算每個頁面的PageRank。那麼經過不斷的重複計算,這些頁面的PR值會趨向於穩定,也就是收斂的狀態。
-
在具體企業應用中怎麼樣確定收斂標準?
- 1、每個頁面的PR值和上一次計算的PR相等
- 2、設定一個差值指標(0.0001)。當所有頁面和上一次計算的PR差值平均小於該標準時,則收斂。
- 3、設定一個百分比(99%),當99%的頁面和上一次計算的PR相等
-
算法原理(3)
-
站在互聯網的角度:
- 只出,不入:PR會爲0
- 只入,不出:PR會很高
- 直接訪問網頁
-
修正PageRank計算公式:增加阻尼係數
- 在簡單公式的基礎上增加了阻尼係數(damping factor)d
- 一般取值d=0.85。
-
完整PageRank計算公式:$PR(p_i) = \frac{1-d}{n} + d \sum_{p_j \in M(i)} \frac{PR(p_j)}{L(j)}$
- d:阻尼係數
- M(i):指向i的頁面集合
- L(j):頁面的出鏈數
- PR(pj):j頁面的PR值
- n:所有頁面數
代碼實現
-
RunJob
- package com.bjsxt.pg;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Iterative PageRank driver. Runs the map/reduce job repeatedly, feeding the output of
 * iteration {@code i-1} into iteration {@code i}, until the average change reported through
 * {@link MyCounter#my} drops below a threshold.
 */
public class RunJob {
    /** Counter the reducer uses to accumulate the scaled PageRank change of an iteration. */
    public static enum MyCounter {
        my
    }

    /**
     * Runs PageRank iterations until convergence. The loop also stops when an iteration fails
     * or throws, instead of resubmitting jobs forever.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration(true);
        // For distributed execution the code must be packaged as a jar; when the client runs
        // outside the cluster (not via "hadoop jar"), the jar location must be configured.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // Switches from distributed execution to a local single-process simulation,
        // which needs no jar.
        conf.set("mapreduce.framework.name", "local");
        // Convergence threshold for the average difference between successive PR values.
        double d = 0.0000001;
        int i = 0;
        while (true) {
            i++;
            try {
                // Iteration number, read by the mapper.
                conf.setInt("runCount", i);
                FileSystem fs = FileSystem.get(conf);
                Job job = Job.getInstance(conf);
                //job.setJarByClass(RunJob.class);
                job.setJobName("pr" + i);
                job.setMapperClass(PageRankMapper.class);
                job.setReducerClass(PageRankReduceer.class);
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);
                // Set the jar location here if the client should run from anywhere
                // while the job itself runs distributed.
                //job.setJar("jar在哪兒,寫在這裏");
                // Use the key/value text input format so the first field of each line becomes
                // the key instead of the byte offset.
                job.setInputFormatClass(KeyValueTextInputFormat.class);
                Path inputPath = new Path("/data/pagerank/input");
                if (i > 1) {
                    inputPath = new Path("/data/pagerank/output/pr" + (i - 1));
                }
                FileInputFormat.addInputPath(job, inputPath);
                Path outputPath = new Path("/data/pagerank/output/pr" + i);
                if (fs.exists(outputPath)) {
                    fs.delete(outputPath, true);
                }
                FileOutputFormat.setOutputPath(job, outputPath);

                if (!job.waitForCompletion(true)) {
                    // The next iteration would read output that was never produced.
                    System.err.println("PageRank iteration " + i + " failed; aborting.");
                    break;
                }
                System.out.println("success.");
                long sum = job.getCounters().findCounter(MyCounter.my).getValue();
                System.out.println(sum);
                // The reducer scales each difference by 1000 and assumes 4 pages.
                double avgd = sum / 4000.0;
                if (avgd < d) {
                    break;
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                e.printStackTrace();
                break;
            } catch (IOException | ClassNotFoundException e) {
                e.printStackTrace();
                break;
            }
        }
    }
}
-
Node
- package com.bjsxt.pg;
import org.apache.commons.lang.StringUtils;
import java.io.IOException;
import java.util.Arrays;
/**
 * A page in the PageRank graph: its current rank plus the names of the pages it links to.
 *
 * <p>The serialized form is {@code <pageRank>[\t<name>]*}; {@link #toString()} produces it and
 * {@link #fromMR(String)} parses it, so the two round-trip. Only the standard library is used
 * for joining and splitting.
 */
public class Node {
    // Initial rank of every page.
    private double pageRank = 1.0;
    // Names of the pages this page links to, e.g. B D.
    private String[] adjacentNodeNames;
    public static final char fieldSeparator = '\t';

    public double getPageRank() {
        return pageRank;
    }

    /** @return this node, to allow chaining */
    public Node setPageRank(double pageRank) {
        this.pageRank = pageRank;
        return this;
    }

    public String[] getAdjacentNodeNames() {
        return adjacentNodeNames;
    }

    /** @return this node, to allow chaining */
    public Node setAdjacentNodeNames(String[] adjacentNodeNames) {
        this.adjacentNodeNames = adjacentNodeNames;
        return this;
    }

    /** @return {@code true} when at least one outgoing link is present */
    public boolean containsAdjacentNodes() {
        return adjacentNodeNames != null && adjacentNodeNames.length > 0;
    }

    /**
     * Serializes the node as the rank followed by each adjacent name, tab-separated.
     *
     * <p>This must return the serialized buffer: callers write it into the job output and
     * read it back with {@link #fromMR(String)}, which cannot parse a debug-style string.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(pageRank);
        if (adjacentNodeNames != null) {
            for (String name : adjacentNodeNames) {
                sb.append(fieldSeparator).append(name == null ? "" : name);
            }
        }
        return sb.toString();
    }

    /**
     * Parses the serialized form, e.g. {@code 1.0\tB\tD}.
     *
     * @param value rank optionally followed by tab-separated adjacent page names
     * @return the parsed node
     * @throws IOException           if {@code value} is null or empty
     * @throws NumberFormatException if the first field is not a number
     */
    public static Node fromMR(String value) throws IOException {
        if (value == null || value.isEmpty()) {
            throw new IOException("Expected 1 or more parts but received 0");
        }
        // Limit -1 keeps empty and trailing fields.
        String[] parts = value.split(String.valueOf(fieldSeparator), -1);
        Node node = new Node().setPageRank(Double.parseDouble(parts[0]));
        if (parts.length > 1) {
            node.setAdjacentNodeNames(Arrays.copyOfRange(parts, 1, parts.length));
        }
        return node;
    }

    /**
     * Parses a rank and an already tab-separated list of adjacent names.
     *
     * @param v1 the rank, e.g. {@code 1.0}
     * @param v2 the adjacent page names, e.g. {@code B\tD}
     */
    public static Node fromMR(String v1, String v2) throws IOException {
        return fromMR(v1 + fieldSeparator + v2);
    }
}
-
PageRankMapper
- package com.bjsxt.pg;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * PageRank mapper. For every page it re-emits the page's own record (rank and outgoing links)
 * and sends each linked page a vote equal to the page's rank divided by its number of links.
 */
public class PageRankMapper extends Mapper<Text, Text, Text, Text> {
    /**
     * @param key     the page name, e.g. {@code A}
     * @param value   in the first run the outgoing links ({@code B\tD}); in later runs the
     *                rank followed by the links ({@code 0.3\tB\tD})
     * @param context sink for the emitted records
     */
    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        // Iteration number set by the driver.
        int runCount = context.getConfiguration().getInt("runCount", 1);
        String page = key.toString();
        Node node;
        if (runCount == 1) {
            // First run: the input carries no rank yet, start from 1.0.
            node = Node.fromMR("1.0", value.toString());
        } else {
            node = Node.fromMR(value.toString());
        }
        // Pass on the old rank together with the link structure.
        context.write(new Text(page), new Text(node.toString()));
        if (node.containsAdjacentNodes()) {
            String[] outPages = node.getAdjacentNodeNames();
            // Vote value: this page's rank split evenly across its outgoing links.
            double outValue = node.getPageRank() / outPages.length;
            for (String outPage : outPages) {
                // The linked page is the key and the vote is the value. Writing the page
                // name as the value would make the record unparsable as a rank downstream.
                context.write(new Text(outPage), new Text(String.valueOf(outValue)));
            }
        }
    }
}
-
PageRankReducer
- package com.bjsxt.pg;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * PageRank reducer. Each group holds one structural record for the page (old rank plus outgoing
 * links) and any number of vote records (a bare rank share). The votes are summed, the new rank
 * is computed with damping factor 0.85 over 4 pages, the scaled absolute change is added to
 * {@code RunJob.MyCounter.my}, and the page is written once with its new rank.
 */
class PageRankReduceer extends Reducer<Text, Text, Text, Text> {
    /**
     * @param key      the page name
     * @param iterable the structural record and the votes for this page
     * @param context  sink for the updated page record
     * @throws IOException if no record with outgoing links was received for the page, since
     *                     the old rank and link list are then unknown
     */
    @Override
    protected void reduce(Text key, Iterable<Text> iterable, Context context) throws IOException, InterruptedException {
        double sum = 0;
        Node sourceNode = null;
        for (Text i : iterable) {
            Node node = Node.fromMR(i.toString());
            if (node.containsAdjacentNodes()) {
                // Structural record: old rank and outgoing links.
                sourceNode = node;
            } else {
                // Vote from a page linking here.
                sum = sum + node.getPageRank();
            }
        }
        // Everything below must run once per page, after all votes have been summed.
        if (sourceNode == null) {
            throw new IOException("No record with outgoing links received for page " + key);
        }
        // 4.0 is the total number of pages.
        double newPR = (0.15 / 4.0) + (0.85 * sum);
        System.out.println("*******new pageRank value is" + newPR);
        // Compare the new rank with the previous one.
        double d = newPR - sourceNode.getPageRank();
        int j = Math.abs((int) (d * 1000.0));
        System.out.println(j + "___________");
        context.getCounter(RunJob.MyCounter.my).increment(j);
        sourceNode.setPageRank(newPR);
        context.write(key, new Text(sourceNode.toString()));
    }
}
TF-IDF
概念
-
TF-IDF(term frequency–inverse document frequency)是一種用於資訊檢索與資訊探勘的常用加權技術
-
TF-IDF是一種統計方法,用以評估一字詞對於一個文件集或一個語料庫中的其中一份文件的重要程度
- 字詞的重要性隨着它在文件中出現的次數成正比增加
- 但同時會隨着它在語料庫中出現的頻率成反比下降
-
TF-IDF加權的各種形式常被搜尋引擎應用
- 作爲文件與用戶查詢之間相關程度的度量或評級
- 除了TF-IDF以外,因特網上的搜尋引擎還會使用基於鏈接分析的評級方法,以確定文件在搜尋結果中出現的順序:PR
大白話
TF
-
詞頻 (term frequency, TF) 指的是某一個給定的詞語在一份給定的文件中出現的次數。這個數字通常會被歸一化(分子一般小於分母 區別於IDF),以防止它偏向長的文件。(同一個詞語在長文件裏可能會比短文件有更高的詞頻,而不管該詞語重要與否。)
-
公式:$tf_{i,j} = \frac{n_{i,j}}{\sum_k n_{k,j}}$,其中:
- ni,j是該詞在文件dj中的出現次數,而分母則是在文件dj中所有字詞的出現次數之和。
逆向文件頻率
- 逆向文件頻率 (inverse document frequency, IDF) 是一個詞語普遍重要性的度量。某一特定詞語的IDF,可以由總文件數目除以包含該詞語之文件的數目,再將得到的商取對數得到。
- 公式:$idf_i = \log \frac{|D|}{|\{j : t_i \in d_j\}|}$
- $|D|$:語料庫中的文件總數
- $|\{j : t_i \in d_j\}|$:包含詞語 $t_i$ 的文件數目
TF-IDF:$tfidf_{i,j} = tf_{i,j} \times idf_i$
- 某一特定文件內的高詞語頻率,以及該詞語在整個文件集合中的低文件頻率,可以產生出高權重的TF-IDF。因此,TF-IDF傾向於過濾掉常見的詞語,保留重要的詞語。
- TFIDF的主要思想是:如果某個詞或短語在一篇文章中出現的頻率TF高,並且在其他文章中很少出現,則認爲此詞或者短語具有很好的類別區分能力,適合用來分類。
代碼實現
-
分詞器
- IKAnalyzer2012_FF.jar
-
FirstJob
- package com.sxt.mr.tfidf;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * First TF-IDF job ("weibo1"): counts each {@code word_id} pair and the total number of
 * posts, using four reduce tasks and a custom partitioner.
 *
 * <p>The output goes to {@code /data/tfidf/output/weibo1}, which is the directory the follow-up
 * jobs in these notes read from.
 */
public class FirstJob {
    /**
     * Configures and runs the job in local mode.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // The property is "mapreduce.framework.name"; "mapreduce.framework" is not read.
        conf.set("mapreduce.framework.name", "local");
        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            //job.setJarByClass(FirstJob.class);
            job.setJobName("weibo1");

            // Without an explicit mapper the framework's default mapper is used and
            // FirstMapper never runs.
            job.setMapperClass(FirstMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            job.setNumReduceTasks(4);
            job.setPartitionerClass(FirstPartiton.class);
            job.setCombinerClass(FirstReduce.class);
            job.setReducerClass(FirstReduce.class);

            FileInputFormat.addInputPath(job, new Path("/data/tfidf/input"));
            Path path = new Path("/data/tfidf/output/weibo1");
            if (fs.exists(path)) {
                fs.delete(path, true);
            }
            FileOutputFormat.setOutputPath(job, path);

            boolean f = job.waitForCompletion(true);
            if (!f) {
                System.err.println("weibo1 job failed");
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException | ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
-
FirstMapper
- package com.sxt.mr.tfidf;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.StringReader;
/*
- 第一個MR,計算TF和計算N(微博總數)
- */
/**
 * Mapper of the first TF-IDF job. Each input line is {@code <post id>\t<content>}; the content
 * is segmented with the IK analyzer and {@code (word_id, 1)} is emitted per token, plus one
 * {@code ("count", 1)} record per post so the reducer can total the number of posts.
 */
public class FirstMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** Constant count of one, written for every record. */
    private final IntWritable one = new IntWritable(1);

    /**
     * @param key     input key (not used)
     * @param value   one line, e.g. {@code 33546465464654\t今天我約了豆漿,油條}
     * @param context sink for the emitted pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] v = value.toString().trim().split("\t");
        if (v.length >= 2) {
            // Strip surrounding whitespace from both fields.
            String id = v[0].trim();
            String content = v[1].trim();
            StringReader sr = new StringReader(content);
            // IK segmenter
            IKSegmenter ikSegmenter = new IKSegmenter(sr, true);
            Lexeme word = null;
            while ((word = ikSegmenter.next()) != null) {
                String w = word.getLexemeText();
                // e.g. 今天_1648498435132 -> 1
                context.write(new Text(w + "_" + id), one);
            }
            // One "count" record per post. It has to carry 1: a default-constructed
            // IntWritable holds 0, so the post total would always come out as 0.
            context.write(new Text("count"), one);
        } else {
            System.out.println(value.toString() + "---------------");
        }
    }
}
-
FirstPartition
- package com.sxt.mr.tfidf;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.HashPartitioner;
/*
- 第一個MR自定義分區
- */
public class FirstPartiton extends HashPartitioner<Text, IntWritable> {
@Override
public int getPartition(Text key, IntWritable value, int reduceCount) {
if (key.equals(new Text("count"))) {
return 3;
} else {
return super.getPartition(key, value, reduceCount-1);
}
}
}
-
FirstReduce
- package com.sxt.mr.tfidf;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
- c1_001,2 c2_001,1 count,10000
- */
/**
 * Reducer (and combiner) of the first TF-IDF job: sums the counts of each key, producing
 * records such as {@code c1_001 2}, {@code c2_001 1} and {@code count 10000}.
 */
public class FirstReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    private static final Text COUNT_KEY = new Text("count");

    /**
     * @param key      a {@code word_id} key or the special {@code count} key
     * @param iterable the partial counts for that key
     * @param context  sink for {@code (key, sum)}
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> iterable, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : iterable) {
            sum = sum + i.get();
        }
        if (COUNT_KEY.equals(key)) {
            System.out.println(key.toString() + "_____________" + sum);
        }
        // Write the computed sum; a default-constructed IntWritable would always write 0.
        context.write(key, new IntWritable(sum));
    }
}
-
LastJob
- package com.sxt.mr.tfidf;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Last TF-IDF job ("weibo3"): reads the term counts in {@code /data/tfidf/output/weibo1} and
 * ships the post total and the document-frequency file to the tasks as cache files.
 */
public class LastJob {
    /**
     * Configures and runs the job in local mode.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        //conf.set("mapreduce.jar","C:\\User\\root\\Desktop\\tfidf.jar");
        //conf.set("mapreduce.job.jar","C:\\User\\root\\Desktop\\tfidf.jar");
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // The property is "mapreduce.framework.name"; "mapreduce.framework" is not read.
        conf.set("mapreduce.framework.name", "local");
        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            //job.setJarByClass(LastJob.class);
            job.setJobName("weibo3");
            //job.setJar("C:\\\\User\\\\root\\\\Desktop\\\\tfidf.jar");

            // Cache file holding the total number of posts.
            job.addCacheFile(new Path("/data/tfidf/output/weibo1/part-r-00003").toUri());
            // Cache file holding the document frequencies.
            job.addCacheFile(new Path("/data/tfidf/output/weibo2/part-r-00000").toUri());

            // Key and value types of the job output.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(LastMapper.class);
            job.setReducerClass(LastReduce.class);

            // Directory the job reads its input from.
            FileInputFormat.addInputPath(job, new Path("/data/tfidf/output/weibo1"));
            Path output = new Path("/data/tfidf/output/weibo3");
            if (fs.exists(output)) {
                fs.delete(output, true);
            }
            FileOutputFormat.setOutputPath(job, output);

            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("執行job成功");
            } else {
                System.err.println("weibo3 job failed");
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException | ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
-
LastMapper
- package com.sxt.mr.tfidf;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
/**
 * Mapper of the last TF-IDF job. Only the {@code setup} step is present in this listing: it
 * loads the post total and the document frequencies from the job's cache files into static
 * maps.
 */
public class LastMapper extends Mapper<LongWritable, Text, Text, Text> {
    // Total number of posts, stored under the key "count".
    public static Map<String, Integer> cmap = null;
    // Document frequency per term.
    public static Map<String, Integer> df = null;

    /**
     * Runs before any call to {@code map}. Loads both lookup maps unless they are already
     * populated.
     *
     * @param context gives access to the cache file URIs
     * @throws IOException if a cache file cannot be read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        System.out.println("**************");
        boolean alreadyLoaded = cmap != null && cmap.size() > 0 && df != null && df.size() > 0;
        if (alreadyLoaded) {
            return;
        }
        URI[] ss = context.getCacheFiles();
        if (ss == null) {
            return;
        }
        for (URI uri : ss) {
            String uriPath = uri.getPath();
            // The file is opened by its base name.
            // NOTE(review): presumably the cache file is made available in the task's working
            // directory under that name — confirm for the deployment mode in use.
            String localName = new Path(uriPath).getName();
            if (uriPath.endsWith("part-r-00003")) {
                loadTotalCount(localName);
            } else if (uriPath.endsWith("part-r-00000")) {
                loadDocumentFrequencies(localName);
            }
        }
    }

    /** Opens a local file as UTF-8 text instead of using the platform default charset. */
    private static BufferedReader openUtf8(String localName) throws IOException {
        return new BufferedReader(new java.io.InputStreamReader(
                new java.io.FileInputStream(localName), java.nio.charset.StandardCharsets.UTF_8));
    }

    /** Reads the first line ({@code count\t<n>}) into {@link #cmap}; other content is ignored. */
    private static void loadTotalCount(String localName) throws IOException {
        try (BufferedReader br = openUtf8(localName)) {
            String line = br.readLine();
            // An empty file yields null; tolerate it instead of failing on startsWith.
            if (line != null && line.startsWith("count")) {
                String[] ls = line.split("\t");
                if (ls.length >= 2) {
                    Map<String, Integer> total = new HashMap<String, Integer>();
                    // e.g. count 1065
                    total.put(ls[0], Integer.parseInt(ls[1].trim()));
                    cmap = total;
                }
            }
        }
    }

    /** Reads every {@code term\t<n>} line into {@link #df}; lines with fewer fields are skipped. */
    private static void loadDocumentFrequencies(String localName) throws IOException {
        Map<String, Integer> frequencies = new HashMap<String, Integer>();
        try (BufferedReader br = openUtf8(localName)) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] ls = line.split("\t");
                if (ls.length >= 2) {
                    frequencies.put(ls[0], Integer.parseInt(ls[1].trim()));
                }
            }
        }
        df = frequencies;
    }
}
-
LastReduce
- package com.sxt.mr.tfidf;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer of the last TF-IDF job: concatenates all values of a key, each followed by a tab,
 * and writes the result as a single record.
 */
public class LastReduce extends Reducer<Text, Text, Text, Text> {
    /**
     * @param key      the record key
     * @param iterable the values to concatenate
     * @param context  sink for {@code (key, joined values)}
     */
    @Override
    protected void reduce(Text key, Iterable<Text> iterable, Context context) throws IOException, InterruptedException {
        final StringBuilder joined = new StringBuilder();
        for (Text item : iterable) {
            joined.append(item.toString());
            joined.append("\t");
        }
        Text out = new Text(joined.toString());
        context.write(key, out);
    }
}
-
TwoJob
- package com.sxt.mr.tfidf;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Second TF-IDF job ("weibo2"): reads {@code /data/tfidf/output/weibo1} and writes the
 * per-term document frequency to {@code /data/tfidf/output/weibo2}.
 */
public class TwoJob {
    /**
     * Configures and runs the job in local mode.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // The property is "mapreduce.framework.name"; "mapreduce.framework" is not read.
        conf.set("mapreduce.framework.name", "local");
        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            //job.setJarByClass(TwoJob.class);
            job.setJobName("weibo2");

            job.setMapperClass(TwoMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setCombinerClass(TwoReduce.class);
            job.setReducerClass(TwoReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            FileInputFormat.addInputPath(job, new Path("/data/tfidf/output/weibo1"));
            // Clear a previous run's output, as the other drivers do; the job cannot write
            // into an existing directory.
            Path output = new Path("/data/tfidf/output/weibo2");
            if (fs.exists(output)) {
                fs.delete(output, true);
            }
            FileOutputFormat.setOutputPath(job, output);

            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("執行job成功");
            } else {
                System.err.println("weibo2 job failed");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
-
TwoMapper
- package com.sxt.mr.tfidf;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class TwoMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//獲取當前mapper task的數據片段(split)
FileSplit fs= (FileSplit) context.getInputSplit();
if (!fs.getPath().getName().contains("part-r-00003")){
//豆漿_1654654654654 3
String[] v=value.toString().trim().split("\t");
if (v.length>=2){
String[] ss=v[0].split("_");
if (ss.length>=2){
String w=ss[0];
context.write(new Text(w),new IntWritable());
}
}else {
System.out.println(value.toString()+"------------");
}
}
}
}
-
TwoReduce
- package com.sxt.mr.tfidf;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer (and combiner) of the second TF-IDF job: sums the counts of each word, giving the
 * number of posts the word occurs in.
 */
public class TwoReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * @param key     the word
     * @param arg1    the partial counts for that word
     * @param context sink for {@code (word, number of posts containing it)}
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> arg1, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : arg1) {
            sum = sum + i.get();
        }
        // Write the computed sum; a default-constructed IntWritable would always write 0.
        context.write(key, new IntWritable(sum));
    }
}
ItemCF
(基於物品的協同過濾)
思考
推薦系統
-
協同過濾(Collaborative Filtering)算法
- UserCF基於用戶的協同過濾,通過不同用戶對物品的評分來評測用戶之間的相似性,基於用戶之間的相似性做出推薦。簡單來講就是:給用戶推薦和他興趣相似的其他用戶喜歡的物品。
- 同現矩陣
-
代碼實現
-
StartRun
- package com.bjsxt.itemcf;
-
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
/**
 * Entry point of the item-based collaborative-filtering pipeline. Defines the input and output
 * directory of every step and runs the steps in order.
 */
public class StartRun {
    /**
     * Builds the path table and runs Step1 and Step2 in local mode. Step2 is not started when
     * Step1 reports failure, because it reads Step1's output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Property name spelled correctly; a misspelled key is silently ignored.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.framework.name", "local");

        // Input and output directories of all steps.
        Map<String, String> paths = new HashMap<String, String>();
        paths.put("Step1Input", "/data/itemcf/input/");
        paths.put("Step1Output", "/data/itemcf/output/step1");
        paths.put("Step2Input", paths.get("Step1Output"));
        paths.put("Step2Output", "/data/itemcf/output/step2");
        paths.put("Step3Input", paths.get("Step2Output"));
        paths.put("Step3Output", "/data/itemcf/output/step3");
        paths.put("Step4Input1", paths.get("Step2Output"));
        paths.put("Step4Input2", paths.get("Step3Output"));
        paths.put("Step4Output", "/data/itemcf/output/step4");
        paths.put("Step5Input", paths.get("Step4Output"));
        paths.put("Step5Output", "/data/itemcf/output/step5");
        paths.put("Step6Input", paths.get("Step5Output"));
        paths.put("Step6Output", "/data/itemcf/output/step6");

        if (!Step1.run(conf, paths)) {
            System.err.println("step1 failed; skipping the remaining steps");
            return;
        }
        Step2.run(conf, paths);
        // Step3.run(conf, paths);
        // Step4.run(conf, paths);
        // Step5.run(conf, paths);
        // Step6.run(conf, paths);
    }

    /** Score assigned to each user action. */
    public static Map<String, Integer> R = new HashMap<String, Integer>();

    static {
        R.put("click", 1);
        R.put("collect", 2);
        R.put("cart", 3);
        R.put("alipay", 4);
    }
}
- Step1
- package com.bjsxt.itemcf;
import java.io.IOException;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
- 去重複
- @author root
*/
public class Step1 {
public static boolean run(Configuration config,Map<String, String> paths){
try {
FileSystem fs =FileSystem.get(config);
Job job =Job.getInstance(config);
job.setJobName("step1");
job.setJarByClass(Step1.class);
job.setMapperClass(Step1_Mapper.class);
job.setReducerClass(Step1_Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(paths.get("Step1Input")));
Path outpath=new Path(paths.get("Step1Output"));
if(fs.exists(outpath)){
fs.delete(outpath,true);
}
FileOutputFormat.setOutputPath(job, outpath);
boolean f= job.waitForCompletion(true);
return f;
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
static class Step1_Mapper extends Mapper<LongWritable, Text, Text, NullWritable>{
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
if(key.get()!=0){
context.write(value, NullWritable.get());
}
}
}
static class Step1_Reducer extends Reducer<Text, IntWritable, Text, NullWritable>{
protected void reduce(Text key, Iterable<IntWritable> i, Context context)
throws IOException, InterruptedException {
context.write(key,NullWritable.get());
}
}
}
- Step2
- package com.bjsxt.itemcf;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Step 2: groups records by user and builds the user-to-item preference score matrix.
 *
 * <p>Example output lines (user, tab, then {@code item:score} pairs):
 * <pre>
 * u13 i160:1,
 * u14 i25:1,i223:1,
 * u16 i252:1,
 * u21 i266:1,
 * u24 i64:1,i218:1,i185:1,
 * u26 i276:1,i201:1,i348:1,i321:1,i136:1,
 * </pre>
 *
 * @author root
 */
public class Step2 {

    /**
     * Configures and runs the scoring job.
     *
     * @param config Hadoop configuration used for the file system and the job
     * @param paths  path table; reads {@code Step2Input} and {@code Step2Output}
     * @return {@code true} if the job completed successfully; {@code false} if it
     *         failed or an exception was thrown while setting it up or running it
     */
    public static boolean run(Configuration config, Map<String, String> paths) {
        try {
            FileSystem fs = FileSystem.get(config);
            Job job = Job.getInstance(config);
            job.setJobName("step2");
            job.setJarByClass(StartRun.class);
            job.setMapperClass(Step2_Mapper.class);
            job.setReducerClass(Step2_Reducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            FileInputFormat.addInputPath(job, new Path(paths.get("Step2Input")));

            // The output directory must not exist when the job starts, so remove any old run.
            Path outpath = new Path(paths.get("Step2Output"));
            if (fs.exists(outpath)) {
                fs.delete(outpath, true);
            }
            FileOutputFormat.setOutputPath(job, outpath);

            return job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return false;
    }

    /**
     * Turns one action record into {@code (user, "item:weight")}.
     * Note from the original author: using user+item together as the output key would be better.
     */
    static class Step2_Mapper extends Mapper<LongWritable, Text, Text, Text> {
        // Input example: i161,u2625,click,2014/9/18 15:03
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] tokens = value.toString().split(",");
            if (tokens.length < 3) {
                // A short line used to throw ArrayIndexOutOfBoundsException and fail the task.
                context.getCounter("Step2", "MALFORMED_RECORDS").increment(1);
                return;
            }
            String item = tokens[0];
            String user = tokens[1];
            String action = tokens[2];

            Integer weight = StartRun.R.get(action);
            if (weight == null) {
                // An action missing from StartRun.R used to cause a NullPointerException.
                context.getCounter("Step2", "UNKNOWN_ACTIONS").increment(1);
                return;
            }
            // Output example: u2625 i161:1
            context.write(new Text(user), new Text(item + ":" + weight.intValue()));
        }
    }

    /** Sums the weights per item for one user and writes them as a single comma-terminated list. */
    static class Step2_Reducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> i, Context context)
                throws IOException, InterruptedException {
            // Example group for u2625: i161:1, i161:2, i161:4, i162:3, i161:4
            Map<String, Integer> scores = new HashMap<String, Integer>();
            for (Text value : i) {
                String[] vs = value.toString().split(":");
                String item = vs[0];
                int weight = Integer.parseInt(vs[1]);
                Integer previous = scores.get(item);
                scores.put(item, previous == null ? weight : previous.intValue() + weight);
            }

            // Every pair is followed by a comma, including the last one; later steps parse this format.
            StringBuilder sb = new StringBuilder();
            for (Entry<String, Integer> entry : scores.entrySet()) {
                sb.append(entry.getKey()).append(':').append(entry.getValue().intValue()).append(',');
            }
            context.write(key, new Text(sb.toString()));
        }
    }
}
- Step3
- package com.bjsxt.itemcf;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;
/**
- 對物品組合列表進行計數,建立物品的同現矩陣
i100:i100 3
i100:i105 1
i100:i106 1
i100:i109 1
i100:i114 1
i100:i124 1 - @author root
*/
public class Step3 {
private final static Text K = new Text();
private final static IntWritable V = new IntWritable(1);
public static boolean run(Configuration config,Map<String, String> paths){
try {
FileSystem fs =FileSystem.get(config);
Job job =Job.getInstance(config);
job.setJobName("step3");
job.setJarByClass(StartRun.class);
job.setMapperClass(Step3_Mapper.class);
job.setReducerClass(Step3_Reducer.class);
job.setCombinerClass(Step3_Reducer.class);
//
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(paths.get("Step3Input")));
Path outpath=new Path(paths.get("Step3Output"));
if(fs.exists(outpath)){
fs.delete(outpath,true);
}
FileOutputFormat.setOutputPath(job, outpath);
boolean f= job.waitForCompletion(true);
return f;
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
static class Step3_Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{
protected void map(LongWritable key, Text value,
Context context)
throws IOException, InterruptedException {
//u3244 i469:1,i498:1,i154:1,i73:1,i162:1,
String[] tokens=value.toString().split("\t");
String[] items =tokens[1].split(",");
for (int i = 0; i < items.length; i++) {
String itemA = items[i].split(":")[0];
for (int j = 0; j < items.length; j++) {
String itemB = items[j].split(":")[0];
K.set(itemA+":"+itemB);
context.write(K, V);
}
}
}
}
static class Step3_Reducer extends Reducer<Text, IntWritable, Text, IntWritable>{
protected void reduce(Text key, Iterable<IntWritable> i,
Context context)
throws IOException, InterruptedException {
int sum =0;
for(IntWritable v :i ){
sum =sum+v.get();
}
V.set(sum);
context.write(key, V);
}
}
}
- Step4
- package com.bjsxt.itemcf;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.log4j.Logger;
/**
*
- 把同現矩陣和得分矩陣相乘
- @author root
*/
public class Step4 {
public static boolean run(Configuration config, Map<String, String> paths) {
try {
FileSystem fs = FileSystem.get(config);
Job job = Job.getInstance(config);
job.setJobName("step4");
job.setJarByClass(StartRun.class);
job.setMapperClass(Step4_Mapper.class);
job.setReducerClass(Step4_Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// FileInputFormat.addInputPath(job, new
// Path(paths.get("Step4Input")));
FileInputFormat.setInputPaths(job,
new Path[] { new Path(paths.get("Step4Input1")),
new Path(paths.get("Step4Input2")) });
Path outpath = new Path(paths.get("Step4Output"));
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
boolean f = job.waitForCompletion(true);
return f;
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
static class Step4_Mapper extends Mapper<LongWritable, Text, Text, Text> {
private String flag;// A同現矩陣 or B得分矩陣
//每個maptask,初始化時調用一次
protected void setup(Context context) throws IOException,
InterruptedException {
FileSplit split = (FileSplit) context.getInputSplit();
flag = split.getPath().getParent().getName();// 判斷讀的數據集
System.out.println(flag + "**********************");
}
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] tokens = Pattern.compile("[\t,]").split(value.toString());
if (flag.equals("step3")) {// 同現矩陣
//i100:i125 1
String[] v1 = tokens[0].split(":");
String itemID1 = v1[0];
String itemID2 = v1[1];
String num = tokens[1];
//A:B 3
//B:A 3
Text k = new Text(itemID1);// 以前一個物品爲key 比如i100
Text v = new Text("A:" + itemID2 + "," + num);// A:i109,1
context.write(k, v);
} else if (flag.equals("step2")) {// 用戶對物品喜愛得分矩陣
//u26 i276:1,i201:1,i348:1,i321:1,i136:1,
String userID = tokens[0];
for (int i = 1; i < tokens.length; i++) {
String[] vector = tokens[i].split(":");
String itemID = vector[0];// 物品id
String pref = vector[1];// 喜愛分數
Text k = new Text(itemID); // 以物品爲key 比如:i100
Text v = new Text("B:" + userID + "," + pref); // B:u401,2
context.write(k, v);
}
}
}
}
static class Step4_Reducer extends Reducer<Text, Text, Text, Text> {
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// A同現矩陣 or B得分矩陣
//某一個物品,針對它和其他所有物品的同現次數,都在mapA集合中
Map<String, Integer> mapA = new HashMap<String, Integer>();// 和該物品(key中的itemID)同現的其他物品的同現集合// 。其他物品ID爲map的key,同現數字爲值
Map<String, Integer> mapB = new HashMap<String, Integer>();// 該物品(key中的itemID),所有用戶的推薦權重分數。
//A > reduce 相同的KEY爲一組
//value:2類:
//物品同現A:b:2 c:4 d:8
//評分數據B:u1:18 u2:33 u3:22
for (Text line : values) {
String val = line.toString();
if (val.startsWith("A:")) {// 表示物品同現數字
// A:i109,1
String[] kv = Pattern.compile("[\t,]").split(
val.substring(2));
try {
mapA.put(kv[0], Integer.parseInt(kv[1]));
//物品同現A:b:2 c:4 d:8
//基於 A,物品同現次數
} catch (Exception e) {
e.printStackTrace();
}
} else if (val.startsWith("B:")) {
// B:u401,2
String[] kv = Pattern.compile("[\t,]").split(
val.substring(2));
//評分數據B:u1:18 u2:33 u3:22
try {
mapB.put(kv[0], Integer.parseInt(kv[1]));
} catch (Exception e) {
e.printStackTrace();
}
}
}
double result = 0;
Iterator<String> iter = mapA.keySet().iterator();//同現
while (iter.hasNext()) {
String mapk = iter.next();// itemID
int num = mapA.get(mapk).intValue(); //對於A的同現次數
Iterator<String> iterb = mapB.keySet().iterator();//評分
while (iterb.hasNext()) {
String mapkb = iterb.next();// userID
int pref = mapB.get(mapkb).intValue();
result = num * pref;// 矩陣乘法相乘計算
Text k = new Text(mapkb); //用戶ID爲key
Text v = new Text(mapk + "," + result);//基於A物品,其他物品的同現與評分(所有用戶對A物品)乘機
context.write(k, v);
}
}
}
}
}
- Step5
- package com.bjsxt.itemcf;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.log4j.Logger;
/**
*
- 把相乘之後的矩陣相加獲得結果矩陣
- @author root
*/
public class Step5 {
private final static Text K = new Text();
private final static Text V = new Text();
public static boolean run(Configuration config, Map<String, String> paths) {
try {
FileSystem fs = FileSystem.get(config);
Job job = Job.getInstance(config);
job.setJobName("step5");
job.setJarByClass(StartRun.class);
job.setMapperClass(Step5_Mapper.class);
job.setReducerClass(Step5_Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat
.addInputPath(job, new Path(paths.get("Step5Input")));
Path outpath = new Path(paths.get("Step5Output"));
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
boolean f = job.waitForCompletion(true);
return f;
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
static class Step5_Mapper extends Mapper<LongWritable, Text, Text, Text> {
/**
* 原封不動輸出
*/
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] tokens = Pattern.compile("[\t,]").split(value.toString());
Text k = new Text(tokens[0]);// 用戶爲key
Text v = new Text(tokens[1] + "," + tokens[2]);
context.write(k, v);
}
}
static class Step5_Reducer extends Reducer<Text, Text, Text, Text> {
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
Map<String, Double> map = new HashMap<String, Double>();// 結果
//u3 > reduce
//101, 11
//101, 12
//101, 8
//102, 12
//102, 32
for (Text line : values) {// i9,4.0
String[] tokens = line.toString().split(",");
String itemID = tokens[0];
Double score = Double.parseDouble(tokens[1]);
if (map.containsKey(itemID)) {
map.put(itemID, map.get(itemID) + score);// 矩陣乘法求和計算
} else {
map.put(itemID, score);
}
}
Iterator<String> iter = map.keySet().iterator();
while (iter.hasNext()) {
String itemID = iter.next();
double score = map.get(itemID);
Text v = new Text(itemID + "," + score);
context.write(key, v);
}
}
}
}
- Step6
- package com.bjsxt.itemcf;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
*
- 按照推薦得分降序排序,每個用戶列出10個推薦物品
- @author root
*/
public class Step6 {
private final static Text K = new Text();
private final static Text V = new Text();
public static boolean run(Configuration config, Map<String, String> paths) {
try {
FileSystem fs = FileSystem.get(config);
Job job = Job.getInstance(config);
job.setJobName("step6");
job.setJarByClass(StartRun.class);
job.setMapperClass(Step6_Mapper.class);
job.setReducerClass(Step6_Reducer.class);
job.setSortComparatorClass(NumSort.class);
job.setGroupingComparatorClass(UserGroup.class);
job.setMapOutputKeyClass(PairWritable.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat
.addInputPath(job, new Path(paths.get("Step6Input")));
Path outpath = new Path(paths.get("Step6Output"));
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
boolean f = job.waitForCompletion(true);
return f;
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
static class Step6_Mapper extends Mapper<LongWritable, Text, PairWritable, Text> {
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] tokens = Pattern.compile("[\t,]").split(value.toString());
String u = tokens[0];
String item = tokens[1];
String num = tokens[2];
PairWritable k =new PairWritable();
k.setUid(u);
k.setNum(Double.parseDouble(num));
V.set(item+":"+num);
context.write(k, V);
}
}
static class Step6_Reducer extends Reducer<PairWritable, Text, Text, Text> {
protected void reduce(PairWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
int i=0;
StringBuffer sb =new StringBuffer();
for(Text v :values){
if(i==10)
break;
sb.append(v.toString()+",");
i++;
}
K.set(key.getUid());
V.set(sb.toString());
context.write(K, V);
}
}
static class PairWritable implements WritableComparable<PairWritable>{
// private String itemId;
private String uid;
private double num;
public void write(DataOutput out) throws IOException {
out.writeUTF(uid);
// out.writeUTF(itemId);
out.writeDouble(num);
}
public void readFields(DataInput in) throws IOException {
this.uid=in.readUTF();
// this.itemId=in.readUTF();
this.num=in.readDouble();
}
public int compareTo(PairWritable o) {
int r =this.uid.compareTo(o.getUid());
if(r==0){
return Double.compare(this.num, o.getNum());
}
return r;
}
public String getUid() {
return uid;
}
public void setUid(String uid) {
this.uid = uid;
}
public double getNum() {
return num;
}
public void setNum(double num) {
this.num = num;
}
}
static class NumSort extends WritableComparator{
public NumSort(){
super(PairWritable.class,true);
}
public int compare(WritableComparable a, WritableComparable b) {
PairWritable o1 =(PairWritable) a;
PairWritable o2 =(PairWritable) b;
int r =o1.getUid().compareTo(o2.getUid());
if(r==0){
return -Double.compare(o1.getNum(), o2.getNum());
}
return r;
}
}
static class UserGroup extends WritableComparator{
public UserGroup(){
super(PairWritable.class,true);
}
public int compare(WritableComparable a, WritableComparable b) {
PairWritable o1 =(PairWritable) a;
PairWritable o2 =(PairWritable) b;
return o1.getUid().compareTo(o2.getUid());
}
}
}