MapReduce code notes: a simple k-means clustering

Our goal is to split the coordinate points below into two clusters; it is easy to see that the data separates cleanly into two groups. The points to be clustered are in kmeans.txt, and the initial centers are in cluster.center.conf.txt; upload both files to HDFS.
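
The exact contents of the two input files are not reproduced here, but both use the format that Utils.textToArray parses: one comma-separated point per line. kmeans.txt holds the points listed in the final output at the end of this post; cluster.center.conf.txt holds the two initial centers, for example (values assumed only for illustration):

2,2
100,100

A typical way to upload both files to HDFS so that the paths used in main() resolve:

hdfs dfs -mkdir -p /user/root/in
hdfs dfs -put kmeans.txt cluster.center.conf.txt /user/root/in/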


Utils.java

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class Utils {
    
    //Read the cluster centers from the centers file (or directory) on HDFS
    public static ArrayList<ArrayList<Double>> getCentersFromHDFS(String centersPath,boolean isDirectory) throws IOException{
        
        ArrayList<ArrayList<Double>> result = new ArrayList<ArrayList<Double>>();
        
        Path path = new Path(centersPath);
        
        Configuration conf = new Configuration();
        
        FileSystem fileSystem = path.getFileSystem(conf);

        if(isDirectory){    
            FileStatus[] listFile = fileSystem.listStatus(path);
            for (int i = 0; i < listFile.length; i++) {
                result.addAll(getCentersFromHDFS(listFile[i].getPath().toString(),false));
            }
            return result;
        }
        
        FSDataInputStream fsis = fileSystem.open(path);
        LineReader lineReader = new LineReader(fsis, conf);
        
        Text line = new Text();
        while(lineReader.readLine(line) > 0){
            ArrayList<Double> tempList = textToArray(line);
            result.add(tempList);
        }
        lineReader.close();
        return result;
    }
    
    //Delete a path on HDFS
    public static void deletePath(String pathStr) throws IOException{//removes output left over from a previous run
        Configuration conf = new Configuration();
        Path path = new Path(pathStr);
        FileSystem hdfs = path.getFileSystem(conf);
        hdfs.delete(path ,true);
    }
    
    //Parse one comma-separated line into its numeric fields
    public static ArrayList<Double> textToArray(Text text){
        ArrayList<Double> list = new ArrayList<Double>();
        String[] fileds = text.toString().split(",");
        for(int i=0;i<fileds.length;i++){
            list.add(Double.parseDouble(fileds[i]));
        }
        return list;
    }
    
    //Compare the old and new centers; return true if they are identical, i.e. the iteration has converged
    public static boolean compareCenters(String centerPath,String newPath) throws IOException{
        
        List<ArrayList<Double>> oldCenters = Utils.getCentersFromHDFS(centerPath,false);
        List<ArrayList<Double>> newCenters = Utils.getCentersFromHDFS(newPath,true);
        
        int size = oldCenters.size();
        int fildSize = oldCenters.get(0).size();
        double distance = 0;
        for(int i=0;i<size;i++){
            for(int j=0;j<fildSize;j++){
                double t1 = Math.abs(oldCenters.get(i).get(j));
                double t2 = Math.abs(newCenters.get(i).get(j));
                distance += Math.pow((t1 - t2) / (t1 + t2), 2);
            }
        }
        
        if(distance == 0.0){
            //Centers are unchanged: delete the new center output so the final classification pass can write there
            Utils.deletePath(newPath);
            return true;
        }else{
            //Centers changed: overwrite the center file with the new centers, then delete the new center output
            Configuration conf = new Configuration();
            Path outPath = new Path(centerPath);
            FileSystem fileSystem = outPath.getFileSystem(conf);
            
            //Open the center file once (truncating it) and copy every reducer output file into it
            FSDataOutputStream out = fileSystem.create(outPath, true);
            
            Path inPath = new Path(newPath);
            FileStatus[] listFiles = fileSystem.listStatus(inPath);
            for (int i = 0; i < listFiles.length; i++) {
                //Skip marker files such as _SUCCESS; only the part-* files hold centers
                if(!listFiles[i].getPath().getName().startsWith("part")){
                    continue;
                }
                FSDataInputStream in = fileSystem.open(listFiles[i].getPath());
                IOUtils.copyBytes(in, out, 4096, false);
                in.close();
            }
            out.close();
            //Delete the new center output so the next job run can use the same output path
            Utils.deletePath(newPath);
        }
        
        return false;
    }
}
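
Both compareCenters() above and the mapper below measure closeness with a relative squared difference rather than plain Euclidean distance: for each dimension they accumulate ((|a| - |b|) / (|a| + |b|))^2. The standalone snippet below is only a sketch to illustrate that metric on hypothetical values; the DistanceDemo class and the sample points are not part of the original code.

import java.util.Arrays;

public class DistanceDemo {

    //Same per-dimension relative squared difference used in Utils.compareCenters and in the mapper
    static double relativeDistance(double[] a, double[] b) {
        double distance = 0;
        for (int i = 0; i < a.length; i++) {
            double t1 = Math.abs(a[i]);
            double t2 = Math.abs(b[i]);
            distance += Math.pow((t1 - t2) / (t1 + t2), 2);
        }
        return distance;
    }

    public static void main(String[] args) {
        double[] point   = {2, 4};
        double[] centerA = {5, 6};      //hypothetical first center
        double[] centerB = {107, 108};  //hypothetical second center
        //The point is much closer to centerA under this metric, so the mapper would assign it there
        System.out.println(Arrays.toString(point) + " -> centerA: " + relativeDistance(point, centerA));
        System.out.println(Arrays.toString(point) + " -> centerB: " + relativeDistance(point, centerB));
    }
}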

MapReduce.java

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;

import java.util.Arrays;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class MapReduce {
    
    public static class Map extends Mapper<LongWritable, Text, IntWritable, Text>{

        //The current cluster centers
        ArrayList<ArrayList<Double>> centers = null;
        //Number of centers k
        int k = 0;
        
        //Load the centers from HDFS before any map() calls
        protected void setup(Context context) throws IOException,
                InterruptedException {
            centers = Utils.getCentersFromHDFS(context.getConfiguration().get("centersPath"),false);
            k = centers.size();
        }


        /**
         * 1. Compare each input record against every center and assign it to the closest one.
         * 2. Emit the center ID as the key and the record as the value
         *    (e.g. "1  0.2", where 1 is the ID of a cluster center and 0.2 is a value close to that center).
         */
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            //Parse the current record into its numeric fields
            ArrayList<Double> fileds = Utils.textToArray(value);
            int sizeOfFileds = fileds.size();
            
            double minDistance = Double.MAX_VALUE;
            int centerIndex = 0;
            
            //Compare the record against each of the k centers
            for(int i=0;i<k;i++){
                double currentDistance = 0;
                for(int j=0;j<sizeOfFileds;j++){
                    double centerPoint = Math.abs(centers.get(i).get(j));
                    double filed = Math.abs(fileds.get(j));
                    currentDistance += Math.pow((centerPoint - filed) / (centerPoint + filed), 2);
                }
                //Keep track of the center closest to this record
                if(currentDistance<minDistance){
                    minDistance = currentDistance;
                    centerIndex = i;
                }
            }
            //Emit the closest center's ID (1-based) as the key and the record unchanged as the value
            context.write(new IntWritable(centerIndex+1), value);
        }
        
    }
    
    //The reduce phase groups all records assigned to the same center (key) and recomputes that center
    public static class Reduce extends Reducer<IntWritable, Text, Text, Text>{

        /**
         * 1. The key is the ID of a cluster center; the values are the records assigned to it.
         * 2. Average the records field by field to obtain the new center.
         */
        protected void reduce(IntWritable key, Iterable<Text> value,Context context)
                throws IOException, InterruptedException {
            ArrayList<ArrayList<Double>> filedsList = new ArrayList<ArrayList<Double>>();
            
            //Read the grouped records; each record becomes one ArrayList<Double>
            for(Iterator<Text> it =value.iterator();it.hasNext();){
                ArrayList<Double> tempList = Utils.textToArray(it.next());
                filedsList.add(tempList);
            }
            
            //Compute the new center
            //Number of fields per record
            int filedSize = filedsList.get(0).size();
            double[] avg = new double[filedSize];
            for(int i=0;i<filedSize;i++){
                //Average of the i-th column
                double sum = 0;
                int size = filedsList.size();
                for(int j=0;j<size;j++){
                    sum += filedsList.get(j).get(i);
                }
                avg[i] = sum / size;
            }
            context.write(new Text("") , new Text(Arrays.toString(avg).replace("[", "").replace("]", "")));
        }
        
    }
    
    @SuppressWarnings("deprecation")
    public static void run(String centerPath,String dataPath,String newCenterPath,boolean runReduce) throws IOException, ClassNotFoundException, InterruptedException{
        
        Configuration conf = new Configuration();
        conf.set("centersPath", centerPath);
        
        Job job = new Job(conf, "mykmeans");
        job.setJarByClass(MapReduce.class);
        
        job.setMapperClass(Map.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        if(runReduce){
            //The final classification pass does not need a reducer
            job.setReducerClass(Reduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
        }
        
        FileInputFormat.addInputPath(job, new Path(dataPath));
        
        FileOutputFormat.setOutputPath(job, new Path(newCenterPath));
        
        System.out.println(job.waitForCompletion(true));
    }

    public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
    	String localhost = "back03";
        String centerPath = "hdfs://"+localhost+":9000/user/root/in/cluster.center.conf.txt";
        String dataPath = "hdfs://"+localhost+":9000/user/root/in/kmeans.txt";
        String newCenterPath = "hdfs://"+localhost+":9000//user/root/out/kmeans";
        
        int count = 0;
        
        
        //Iterate: recompute centers until they stop changing, then run a final map-only pass to emit the assignments
        while(true){
            run(centerPath,dataPath,newCenterPath,true);
            System.out.println(" iteration " + ++count + " finished ");
            if(Utils.compareCenters(centerPath,newCenterPath)){
                run(centerPath,dataPath,newCenterPath,false);
                break;
            }
        }
    }
    
}
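
The post does not show how the job is launched. Assuming Utils and MapReduce are compiled against the Hadoop client libraries and packaged into a jar (the jar name below is only an example), a typical invocation would be:

hadoop jar kmeans.jar MapReduce

After the final (map-only) pass finishes, the cluster assignments can be inspected with, for example, hdfs dfs -cat /user/root/out/kmeans/part-*.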
The final output is:

1	114,115
1	100,100
1	105,109
1	108,104
1	110,111
2	2,2
2	2,4
2	4,2
2	4,4
2	6,6
2	6,8
2	8,6
2	8,8
2	10,17
2	1,8
2	5,6
2	1,9
2	2,3
2	6.6,7.7
2	6.7,5.7
2	1.2,3.2
2	1.6,7.8
2	5.4,3.6
2	8.9,2.9
2	0.02,3.68
2	5.6,4.9

As expected, the data is cleanly split into two clusters.