Hadoop MapReduce
形式化映射器
// key:   transaction ID (ignored)
// value: the items of one transaction (i1, i2, ..., in)
map(key, value) {
    // Sort the items so that equivalent pairs always hash to the same key.
    (s1, s2, ..., sn) = sort(i1, i2, ..., in);
    // Combinations is a Java utility class that generates all 2-item
    // basket combinations of the given item list.
    List<Tuple2<si, sj>> listOfPairs = Combinations.generateCombinations(s1, s2, ..., sn);
    for (Tuple2<si, sj> pair : listOfPairs) {
        // Emit each pair with an initial count of 1.
        emit(pair, 1);
    }
}
歸約器
// key:    Tuple2<si, sj> — one 2-item basket combination
// value:  List<Integer>  — the partial counts emitted by the mappers
reduce(Tuple2<si, sj> key, List<Integer> values) {
    // Sum all partial counts to get the pair's total frequency.
    Integer sum = 0;
    for (Integer i : values) {
        sum += i;
    }
    emit(key, sum);
}
MBAMapper
/**
 * Market Basket Analysis mapper: turns one transaction line (K1 = byte offset,
 * V1 = comma-separated items) into (K2 = sorted item combination, V2 = 1) pairs.
 */
public class MBAMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Fallback combination size when the job configuration supplies none. */
    public static final int DEFAULT_NUMBER_OF_PAIRS = 2;

    // Output key (K2); reused across calls — Hadoop serializes it on each write.
    private static final Text reduceKey = new Text();

    // Output value (V2); every emitted combination counts as one occurrence.
    private static final IntWritable NUMBER_ONE = new IntWritable(1);

    // Number of items per basket combination, read from the job configuration.
    private int numberOfPairs;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // FIX: original key was "number of pair" (spaces, inconsistent);
        // use the conventional dotted property name.
        this.numberOfPairs =
            context.getConfiguration().getInt("number.of.pairs", DEFAULT_NUMBER_OF_PAIRS);
    }

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        List<String> items = convertItemsToList(line);
        if ((items == null) || items.isEmpty()) {
            return; // skip blank or malformed transactions
        }
        generateMapperOutput(numberOfPairs, items, context);
    }

    /** Parses one comma-separated transaction line into a list of item names. */
    private static List<String> convertItemsToList(String line) {
        ...
    }

    /**
     * Emits (sorted combination, 1) for every numberOfPairs-sized subset of items.
     * FIX: original declared this with no parameters but called it with three.
     */
    private void generateMapperOutput(int numberOfPairs, List<String> items, Context context)
            throws IOException, InterruptedException {
        ...
    }
}
Spark 關聯規則
public class FindAssociationRules{
static JavaSparkContext createJavaSparkContext(){}
static List<String> toList(String transaction){}
static List<String> removeOneItem(List<String> list, int i ){...}
public static void main(String[] args) throws Exception{
static JavaSparkContext createJavaSparkContext() throws EXception{
SparkConf conf= new SparkConf();
conf.setAppName("market-basket-analysis");
conf.set("spark.serializer","org.apache.spark.serializer.KryoSeralizer");//快速串行化器
conf.set("spark.kryoserializer.bufffer.mb","32")//緩衝區
JavaSparkContext ctx = new JavaSparkContext(conf);
return ctx;
}
//tolist 接受一個交易返回一個項列表
static List<String> toList(String transaction){
String[] items = transaction.trim().split(",");
List<String> list = new ArrayList<String>();
for (String item : items){
list.add(item);
}
return list;
}
//removeoneitem 從一個列表中刪除一項
static List<String> removeOneItem(List<String> list, int i ){
if( (list==null)||(list.isempty())){return list;}
if ((i<0)||(i>(list.size())){return list;}
List<String> cloned = new ArrayList<String>(list);
cloned.remove(i);
return coned;}
//處理輸入參數
//創建spark上下文對象
JavaSparkContext ctx = createJavaSparkContext();
//從HDFS中讀取所有交易創建RDD
JavaRDD<String> transaction = ctx.textFile(transactionsfilename,1);
transaction.saveAsTextFile("");
//生成頻繁模式(map)
JavaPairRDD<List<String>, Integer> patterns =
transaction.flatMapToPair(new PairFlatMapFunction< String, List<String>,Integer>(){
public Inerable<Tuple2<List<String>, Integer>> call<String transcation>{
List<String> list = toList(transcation);
List<List<String>> combinations=Combination.findSortedCombinations(list);
List<Tuple2<List<String>,Integer>> result = new ArrayList<Tuple2<List<String>,Integer>>();
for (List<String> combilist: combinations){
if (comblist.size>0){
result.add(new Tuple2<List<String>, Integer>(combilist,1));
}
}
return result;
}
})
//規約組合頻繁模式(reduce)
JavaPairRDD<List<String>, Integer> combined =
pattens.reduceByKey(new Function2<Integer,Integer,Integer>(){
public Integer call(Integer i1, Integer i2)( return i1+i2); )})
//生成所有子模式(map)
JavaPairRDD<List<String>,Tuple2<List<String>,Integer>> subpattern=
combined.flatMapToPair(new PairFlatMapFunction<
Tuple2<List<String>,Integer>>,List<String>,Tuple2<List<String>,Integer>(){
public Iterable<Tuple2<List<String>,Tuple2<List<String>,Integer>>> call(Tuple2<List<String>,Integer> pattern){
List<Tuple2<List<String>,Tuple2<List<String>,Integer>>> result=
new ArrayList<Tuple2<List<String>,Tuple2<List<String>,Integer>>>();
List<String> list = pattern._1;
Integer frequency =pattern._2;
result.add(new Tuple2(list, new Tuple2(null, freuency)));
if(list.size()==1){ return result;}
for (int i =0;i <list.size(); i++){
List<String> sublist = removeOneItem(list,i);
result.add(new Tuple2(sublist, new Tuple2(list,frequency)));
}
return result;
}
})
//組合子模式T
JavaPairRDD<List<String>, Iterable<Tuple2<List<String>,Integer>>>> rulers= subpatterns.groupByKey();
//生成關聯規則
JavaRDD <List<Tuple3<List<String>,List<String>,Double>>> assocRules=
rules.map(new Function<Tuple2<List<String>,Iterable<Tuple2<List<String>,Integer>>>,
List<Tuple3<List<String>,List<String>,Double>>>(){
public List<Tuple3<List<String>,List<String>,Double>>
call<Tuple2<List<String>,Iterable<Tuple2<List<String>,Integer>>> in>{
List<Tuple3<List<String>,List<String>,Double>> result =
new ArrayList<Tuple3<List<String>,List<String>,Double>>();
List<String> fromList = in._1;
Iterable<Tuple2<List<String>,Interable>> to = in._2;
List<Tuple2<List<String>,Integer>> tolist=new ArrayList<Tuple2<List<String>,Integer>>();
Tuple2<List<String>,Integer> fromcount = null;
for(Tuple2<List<String>,Integer>t2:to){
if(t2._1 == null){fromcount=t2;}
else{tolist.add(t2);}
}
for (Tuple2<List<String>,Integer> t2: toList){
double confidence = double t2._2/ double fromcount._2;
List<String> t2list= new ArrayList<String>(t2._1);
t2list.removeall(fromlist);
result.add(new Tuple3(fromList,t2list,confidence));
}
return result;
}
})
//
//
}
}