Canopy算法計算聚類的簇數

Kmeans算法是聚類中的經典算法，過程如下：
選擇K個點作爲初始質心
repeat
將每個點指派到最近的質心，形成K個簇
重新計算每個簇的質心
until 簇不發生變化或達到最大迭代次數

算法中的K需要人爲的指定。確定K的做法有很多，比如多次進行試探，計算誤差，得出最好的K。這樣需要比較長的時間。我們可以根據Canopy算法來粗略確定K值(可以認爲相等)。看一下Canopy算法的過程：

（1）設樣本集合爲S，確定兩個閾值t1和t2，且t1>t2。
（2）任取一個樣本點p，作爲一個Canopy，記爲C，從S中移除p。
（3）計算S中所有點到p的距離dist
（4）若dist<t1，則將相應點歸到C,作爲弱關聯。
（5）若dist<t2，則將相應點移出S，作爲強關聯。
（6）重複（2）~（5），直至S爲空。

Canopy 個數完全可以作爲這個K值，一定程度上減少了選擇K的盲目性。下面通過Canopy算法計算一些點的Canopy個數。如果僅僅計算K值，則T1沒有任何作用，只用指定T2即可，這裏使用所有點的平均距離的一半來作爲T2。

package cn.edu.ustc.dm.cluster;

import java.util.ArrayList;

import java.util.List;

import cn.edu.ustc.dm.bean.Point;

/**

 * Canopy算法 藉助canopy算法計算對應的Kmeans中的K值大小

 * 其中對於計算K值來說，canopy算法中的T1沒有意義，只用設定T2(T1>T2) 我們這裏將T2設置爲平均距離

 * 

 * @author YD

 *

 */

public class Canopy {

    private List<Point> points = new ArrayList<Point>(); // 進行聚類的點

    private List<List<Point>> clusters = new ArrayList<List<Point>>(); // 存儲簇

    private double T2 = -1; // 閾值

    public Canopy(List<Point> points) {

        for (Point point : points)

            // 進行深拷貝

            this.points.add(point);

    }

    /**

     * 進行聚類，按照Canopy算法進行計算，將所有點進行聚類

     */

    public void cluster() {

        T2 = getAverageDistance(points);

        while (points.size() != 0) {

            List<Point> cluster = new ArrayList<Point>();

            Point basePoint = points.get(0); // 基準點

            cluster.add(basePoint);

            points.remove(0);

            int index = 0;

            while (index < points.size()) {

                Point anotherPoint = points.get(index);

                double distance = Math.sqrt((basePoint.x - anotherPoint.x)

                        * (basePoint.x - anotherPoint.x)

                        + (basePoint.y - anotherPoint.y)

                        * (basePoint.y - anotherPoint.y));

                if (distance <= T2) {

                    cluster.add(anotherPoint);

                    points.remove(index);

                } else {

                    index++;

                }

            }

            clusters.add(cluster);

        }

    }

    /**

     * 得到Cluster的數目

     * 

     * @return 數目

     */

    public int getClusterNumber() {

        return clusters.size();

    }

    /**

     * 獲取Cluster對應的中心點(各點相加求平均)

     * 

     * @return

     */

    public List<Point> getClusterCenterPoints() {

        List<Point> centerPoints = new ArrayList<Point>();

        for (List<Point> cluster : clusters) {

            centerPoints.add(getCenterPoint(cluster));

        }

        return centerPoints;

    }

    /**

     * 得到的中心點(各點相加求平均)

     * 

     * @return 返回中心點

     */

    private double getAverageDistance(List<Point> points) {

        double sum = 0;

        int pointSize = points.size();

        for (int i = 0; i < pointSize; i++) {

            for (int j = 0; j < pointSize; j++) {

                if (i == j)

                    continue;

                Point pointA = points.get(i);

                Point pointB = points.get(j);

                sum += Math.sqrt((pointA.x - pointB.x) * (pointA.x - pointB.x)

                        + (pointA.y - pointB.y) * (pointA.y - pointB.y));

            }

        }

        int distanceNumber = pointSize * (pointSize + 1) / 2;

        double T2 = sum / distanceNumber / 2; // 平均距離的一半

        return T2;

    }

    /**

     * 得到的中心點(各點相加求平均)

     * 

     * @return 返回中心點

     */

    private Point getCenterPoint(List<Point> points) {

        double sumX = 0;

        double sumY = 0;

        for (Point point : points) {

            sumX += point.x;

            sumY += point.y;

        }

        int clusterSize = points.size();

        Point centerPoint = new Point(sumX / clusterSize, sumY / clusterSize);

        return centerPoint;

    }

    /**

     * 獲取閾值T2

     * 

     * @return 閾值T2

     */

    public double getThreshold() {

        return T2;

    }

    /**

     * 測試9個點，進行操作

     * @param args

     */

    public static void main(String[] args) {

        List<Point> points = new ArrayList<Point>();

        points.add(new Point(0, 0));

        points.add(new Point(0, 1));

        points.add(new Point(1, 0));

        points.add(new Point(5, 5));

        points.add(new Point(5, 6));

        points.add(new Point(6, 5));

        points.add(new Point(10, 2));

        points.add(new Point(10, 3));

        points.add(new Point(11, 3));

        Canopy canopy = new Canopy(points);

        canopy.cluster();

                //獲取canopy數目

        int clusterNumber = canopy.getClusterNumber();

        System.out.println(clusterNumber);

                //獲取canopy中T2的值

        System.out.println(canopy.getThreshold());

    }

}