Kolmogorov-Smirnov test in Java : K-S檢驗正態分佈 Java實現

K-S 檢驗即Kolmogorov-Smirnov Test:
The Kolmogorov-Smirnov test (KS-test) tries to determine if two datasets differ significantly. The KS-test has the advantage of making no assumption about the distribution of data.

Kolmogorov-Smirnov檢驗(KS檢驗)試圖確定兩個數據集是否顯着不同。 KS檢驗具有不假設數據分佈的優點。

通常用K-S test 來檢驗 一組數據是否符合某種分佈特徵,比如 正態分佈。

正態分佈的K-S Test

思路是:先假設原數據符合正態分佈(原假設),算出原始數據的實際累計分佈(經驗分佈函數),再算出對應正態分佈的理論累計分佈;然後設定一個顯著性水平,根據樣本數量查表得出臨界值 D,並與理論分佈和實際分佈之間的最大離差相比較:若最大離差小於 D 值,則接受原假設;若等於或大於 D 值,則拒絕原假設。看圖說話:
這裏寫圖片描述

圖中紅色垂直距離即爲最大離差。

D 值的取值如下表:
這裏寫圖片描述

其中 n 爲樣本數量,a 爲顯著性水平。一般情況下,a 取值 0.05,即置信水平爲 95%。
若理論分佈與實際分佈的最大離差小於按樣本數量查表得出的 D 值,說明樣本分佈與理論分佈沒有顯著差異;當然這是一個相對的概念,可根據需求選取顯著性水平。

K-S Test 的 Java 實現

一般情況下,在 R 或 Matlab ,python 中,其實幾行代碼就可以完成 k-s 驗證,可惜我都不會寫哈(可能以後會去學)。。。尷尬,所以自己實現了一個 Java 版本,如下:

package com.kay.algs;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

/**
 * Created by kay on 2018/1/30.
 */
/**
 * One-sample Kolmogorov-Smirnov normality check together with the descriptive
 * statistics of the tested sample.
 *
 * <p>An instance is a plain result holder: {@link #isGauss()} tells whether the
 * normality hypothesis was accepted and {@link #getStatisticConstant()} exposes the
 * statistics computed along the way. Use {@link #KSTest(Double[])} to build one.
 *
 * <p>NOTE(review): the mean and standard deviation of the reference normal
 * distribution are estimated from the tested sample itself. With estimated
 * parameters the standard KS critical values are known to be conservative (this is
 * the situation the Lilliefors variant addresses) - confirm this is acceptable for
 * the intended use.
 */
public class GaussCheck {

    private static final Logger logger = LoggerFactory.getLogger(GaussCheck.class);

    /** 1 / sqrt(2 * pi), the normalising factor of the standard normal density. */
    private static final double INV_SQRT_TWO_PI = 1.0 / Math.sqrt(2 * Math.PI);

    /** Beyond this many standard deviations the normal tail is below 1e-15 and treated as 0. */
    private static final double CDF_TAIL_CUTOFF = 8.0;

    /** Number of Simpson sub-intervals (must be even) used to integrate the normal density. */
    private static final int SIMPSON_INTERVALS = 1000;

    /** Largest number of points {@link #drawData} will generate with the mean-based step. */
    private static final int MAX_DRAW_POINTS = 100000;

    /** Whether the tested sample was accepted as normally distributed. */
    private boolean isGauss;

    /** Descriptive statistics of the tested sample. */
    private StatisticConstant statisticConstant;

    /**
     * Creates a result holder.
     *
     * @param isGauss           whether the sample was accepted as normally distributed
     * @param statisticConstant descriptive statistics of the sample
     */
    public GaussCheck(boolean isGauss, StatisticConstant statisticConstant) {
        this.isGauss = isGauss;
        this.statisticConstant = statisticConstant;
    }

    /** Creates an empty result holder ({@code isGauss == false}, no statistics). */
    public GaussCheck() {
    }

    /** @return the descriptive statistics of the tested sample */
    public StatisticConstant getStatisticConstant() {
        return statisticConstant;
    }

    /** @param statisticConstant the descriptive statistics to store */
    public void setStatisticConstant(StatisticConstant statisticConstant) {
        this.statisticConstant = statisticConstant;
    }

    /** @return {@code true} when the normality hypothesis was accepted */
    public boolean isGauss() {
        return isGauss;
    }

    /** @param gauss whether the normality hypothesis was accepted */
    public void setGauss(boolean gauss) {
        isGauss = gauss;
    }

    /**
     * Runs a one-sample Kolmogorov-Smirnov test of {@code list} against a normal
     * distribution whose mean and standard deviation are estimated from the sample,
     * and collects descriptive statistics.
     *
     * <p>The input array is not modified and does not need to be sorted. The KS
     * statistic is the largest distance between the theoretical CDF and the empirical
     * CDF, measured on both sides of every step of the empirical CDF. When the sample
     * standard deviation is zero, NaN or infinite (a single value, all values equal,
     * or non-finite data) no reference distribution exists and the sample is reported
     * as not normal.
     *
     * <p>When several values share the highest frequency, the smallest of them is
     * reported as the mode.
     *
     * @param list the sample; may be in any order
     * @return the test outcome and statistics; for a {@code null} or empty sample the
     *         outcome is {@code false} with default-constructed statistics
     * @throws NullPointerException if {@code list} contains a {@code null} element
     */
    public static GaussCheck KSTest(Double[] list) {
        logger.info("***********************ks-test**********************");

        if (list == null || list.length == 0) {
            return new GaussCheck(false, new StatisticConstant());
        }

        int length = list.length;

        // Work on a primitive copy: avoids boxing and leaves the caller's array untouched.
        double[] sorted = new double[length];
        double sum = 0.0;
        for (int i = 0; i < length; i++) {
            sorted[i] = list[i];
            sum += sorted[i];
        }
        double avgNum = sum / length;

        double squaredDiffSum = 0.0;
        for (int i = 0; i < length; i++) {
            double diff = sorted[i] - avgNum;
            squaredDiffSum += diff * diff;
        }
        double variance = squaredDiffSum / (length - 1);   // sample variance; NaN for one value
        double deviation = Math.sqrt(variance);
        double variation = deviation / avgNum;             // coefficient of variation

        // Order statistics are only meaningful on sorted data.
        Arrays.sort(sorted);

        double minNum = sorted[0];
        double maxNum = sorted[length - 1];
        double range = maxNum - minNum;
        double upperQuartile = sorted[(int) ((long) length * 3 / 4)];
        double downQuartile = sorted[length / 4];
        double median = (length % 2 == 0)
                ? (sorted[length / 2 - 1] + sorted[length / 2]) / 2.0
                : sorted[length / 2];

        // Without a positive, finite spread the z-scores are undefined.
        boolean testable = deviation > 0.0 && !Double.isInfinite(deviation);

        double maxDeviation = 0.0;   // KS statistic
        double modeNum = sorted[0];
        int modeCount = 0;

        // Walk the runs of equal values; each run is one step of the empirical CDF.
        int start = 0;
        while (start < length) {
            double value = sorted[start];
            int end = start + 1;
            while (end < length && Double.compare(sorted[end], value) == 0) {
                end++;
            }

            int count = end - start;
            if (count > modeCount) {   // strict: ties keep the smallest value
                modeCount = count;
                modeNum = value;
            }

            if (testable) {
                double theoretical = standardNormalCdf((value - avgNum) / deviation);
                double below = start / (double) length;     // empirical CDF just before the step
                double atOrBelow = end / (double) length;   // empirical CDF at the step
                double gap = Math.max(Math.abs(theoretical - atOrBelow),
                        Math.abs(theoretical - below));
                if (maxDeviation < gap) {
                    maxDeviation = gap;
                }
            }
            start = end;
        }

        boolean isGauss = testable && checkGauss(getD(length), maxDeviation);

        StatisticConstant statisticConstant = new StatisticConstant(avgNum, median, modeNum,
                upperQuartile, downQuartile, variance, deviation, variation, range, maxNum, minNum);

        return new GaussCheck(isGauss, statisticConstant);
    }

    /**
     * Samples the normal density curve N(u, sima^2) for plotting.
     *
     * <p>Points cover {@code [u - 3*sima, u + 3*sima)}. The step is {@code |u| / 10};
     * when that step is zero, or would produce more than {@value #MAX_DRAW_POINTS}
     * points, {@code sima / 10} is used instead so the loop always terminates.
     *
     * @param u    mean of the distribution
     * @param sima standard deviation of the distribution
     * @return a map with key {@code "x"} (abscissas) and key {@code "y"} (densities),
     *         both {@code Double[]} of equal length; the arrays are empty when
     *         {@code sima} is not positive
     */
    @SuppressWarnings("rawtypes")
    public static Map drawData(Double u, Double sima) {
        double mean = u;
        double sigma = sima;
        double lower = mean - 3 * sigma;
        double upper = mean + 3 * sigma;

        double step = Math.abs(mean) / 10;
        if (!(step > 0.0) || (upper - lower) / step > MAX_DRAW_POINTS) {
            step = sigma / 10;
        }

        List<Double> xs = new ArrayList<Double>();
        List<Double> ys = new ArrayList<Double>();
        for (double t = lower; t < upper; t += step) {
            xs.add(t);
            ys.add(normalDensity(t, mean, sigma));
        }

        Map<String, Double[]> dataMap = new HashMap<String, Double[]>();
        dataMap.put("x", xs.toArray(new Double[0]));   // horizontal axis
        dataMap.put("y", ys.toArray(new Double[0]));   // vertical axis
        return dataMap;
    }

    /**
     * Decides the test at the default significance level a = 0.05.
     *
     * @param d            critical value for the sample size
     * @param maxDeviation observed KS statistic
     * @return {@code true} (accept normality) only when {@code maxDeviation < d};
     *         a NaN statistic is never accepted
     */
    private static boolean checkGauss(double d, double maxDeviation) {
        return maxDeviation < d;
    }

    /**
     * Critical value of the KS statistic at the default significance level a = 0.05.
     *
     * <p>Sample sizes up to 50 are bucketed and each bucket uses a single tabulated
     * value; larger samples use {@code 1.36 / sqrt(n)}.
     * NOTE(review): the bucket values appear to be the table entries for the upper
     * end of each bucket - confirm against the table the thresholds were taken from.
     *
     * @param n sample size
     * @return the critical value, or {@code -0.1} when {@code n} is not positive
     */
    private static double getD(int n) {
        if (n <= 0) {
            return -0.1;
        }
        if (n <= 5) {
            return 0.562;
        }
        if (n <= 10) {
            return 0.409;
        }
        if (n <= 20) {
            return 0.294;
        }
        if (n <= 30) {
            return 0.242;
        }
        if (n <= 50) {
            return 0.189;
        }
        return 1.36 / Math.sqrt(n);
    }

    /**
     * Cumulative distribution function of the standard normal distribution.
     *
     * <p>Uses the symmetry {@code cdf(z) = 0.5 +/- integral(0..|z|)} and integrates
     * the density with Simpson's rule over {@value #SIMPSON_INTERVALS} sub-intervals,
     * which keeps the step at or below 0.008.
     *
     * @param z standardised value
     * @return P(Z &lt;= z) for a standard normal Z; NaN when {@code z} is NaN
     */
    private static double standardNormalCdf(double z) {
        if (Double.isNaN(z)) {
            return Double.NaN;
        }
        double x = Math.abs(z);
        if (x > CDF_TAIL_CUTOFF) {
            return z < 0 ? 0.0 : 1.0;
        }

        double h = x / SIMPSON_INTERVALS;
        double weighted = standardNormalDensity(0.0) + standardNormalDensity(x);
        for (int i = 1; i < SIMPSON_INTERVALS; i++) {
            double weight = (i % 2 == 1) ? 4.0 : 2.0;
            weighted += weight * standardNormalDensity(i * h);
        }
        // Clamp so rounding can never push the result outside [0, 1].
        double area = Math.min(weighted * h / 3.0, 0.5);
        return z < 0 ? 0.5 - area : 0.5 + area;
    }

    /**
     * Density of the standard normal distribution.
     *
     * @param t point at which to evaluate
     * @return the density at {@code t}
     */
    private static double standardNormalDensity(double t) {
        return INV_SQRT_TWO_PI * Math.exp(-t * t / 2);
    }

    /**
     * Density of the normal distribution N(u, sigma^2).
     *
     * @param x     point at which to evaluate
     * @param u     mean (expected value)
     * @param sigma standard deviation
     * @return the density at {@code x}
     */
    private static double normalDensity(double x, double u, double sigma) {
        return 1.0 / (Math.sqrt(2 * Math.PI) * sigma) * Math.exp(-(x - u) * (x - u) / (2 * sigma * sigma));
    }
}

相關常量對象:

package com.kay.algs;

import java.io.Serializable;

/**
 * Created by kay on 2018/1/30.
 * 常用統計量
 */
/**
 * Mutable, serializable bean holding common descriptive statistics of a sample.
 *
 * <p>Every property defaults to {@code 0.0} until set. Skewness and kurtosis are
 * not part of this holder.
 */
public class StatisticConstant implements Serializable {

    private static final long serialVersionUID = 1256139578174896657L;

    /** Arithmetic mean. */
    private double mean;
    /** Median. */
    private double median;
    /** Mode. */
    private double mode;
    /** Upper quartile. */
    private double upperQuartile;
    /** Lower quartile. */
    private double downQuartile;
    /** Variance. */
    private double var;
    /** Standard deviation. */
    private double deviation;
    /** Coefficient of variation. */
    private double variation;
    /** Range (max - min). */
    private double range;
    /** Maximum value. */
    private double max;
    /** Minimum value. */
    private double min;

    /** Creates a holder with every statistic left at {@code 0.0}. */
    public StatisticConstant() {
    }

    /**
     * Creates a fully populated holder.
     *
     * @param mean          arithmetic mean
     * @param median        median
     * @param mode          mode
     * @param upperQuartile upper quartile
     * @param downQuartile  lower quartile
     * @param var           variance
     * @param deviation     standard deviation
     * @param variation     coefficient of variation
     * @param range         range
     * @param max           maximum value
     * @param min           minimum value
     */
    public StatisticConstant(double mean, double median, double mode,
                             double upperQuartile, double downQuartile,
                             double var, double deviation, double variation,
                             double range, double max, double min) {
        this.mean = mean;
        this.median = median;
        this.mode = mode;
        this.upperQuartile = upperQuartile;
        this.downQuartile = downQuartile;
        this.var = var;
        this.deviation = deviation;
        this.variation = variation;
        this.range = range;
        this.max = max;
        this.min = min;
    }

    /** @return the arithmetic mean */
    public double getMean() {
        return this.mean;
    }

    /** @param mean the arithmetic mean */
    public void setMean(double mean) {
        this.mean = mean;
    }

    /** @return the median */
    public double getMedian() {
        return this.median;
    }

    /** @param median the median */
    public void setMedian(double median) {
        this.median = median;
    }

    /** @return the mode */
    public double getMode() {
        return this.mode;
    }

    /** @param mode the mode */
    public void setMode(double mode) {
        this.mode = mode;
    }

    /** @return the upper quartile */
    public double getUpperQuartile() {
        return this.upperQuartile;
    }

    /** @param upperQuartile the upper quartile */
    public void setUpperQuartile(double upperQuartile) {
        this.upperQuartile = upperQuartile;
    }

    /** @return the lower quartile */
    public double getDownQuartile() {
        return this.downQuartile;
    }

    /** @param downQuartile the lower quartile */
    public void setDownQuartile(double downQuartile) {
        this.downQuartile = downQuartile;
    }

    /** @return the variance */
    public double getVar() {
        return this.var;
    }

    /** @param var the variance */
    public void setVar(double var) {
        this.var = var;
    }

    /** @return the standard deviation */
    public double getDeviation() {
        return this.deviation;
    }

    /** @param deviation the standard deviation */
    public void setDeviation(double deviation) {
        this.deviation = deviation;
    }

    /** @return the coefficient of variation */
    public double getVariation() {
        return this.variation;
    }

    /** @param variation the coefficient of variation */
    public void setVariation(double variation) {
        this.variation = variation;
    }

    /** @return the range */
    public double getRange() {
        return this.range;
    }

    /** @param range the range */
    public void setRange(double range) {
        this.range = range;
    }

    /** @return the maximum value */
    public double getMax() {
        return this.max;
    }

    /** @param max the maximum value */
    public void setMax(double max) {
        this.max = max;
    }

    /** @return the minimum value */
    public double getMin() {
        return this.min;
    }

    /** @param min the minimum value */
    public void setMin(double min) {
        this.min = min;
    }

}

驗證

可以使用 Excel 的數據分析功能自動生成一組正態分佈數據進行驗證;經多次驗證,該算法均能通過。
貼一組Excel自動生成的正態分佈數據:

        // Sample data for verification: 60 values which, per the accompanying article,
        // were generated by Excel as normally distributed data. The Double[] type matches
        // the parameter of GaussCheck.KSTest, for which it is presumably intended as input.
        Double[] list=new Double[]{
                -26.9155895896255D,
                23.131279956724D,
                14.8355025228375D,
                -3.3484172692988D,
                18.076142421487D, 
                16.0152001424285D,
                3.18725031116628D,
                -26.3367144018411D,
                24.1486862048623D,
                24.6650745591614D,
                -4.70648612666992D,
                24.789752640354D,
                8.91525931161596D,
                12.0637207853724D,
                -8.57632469182136D,
                3.41213197112666D,
                2.25297756311193D,
                39.0965181193314D,
                17.0431588028441D,
                4.81745021512324D,
                9.09479811321943D,
                -2.40005985891912D,
                16.8641520556412D,
                -3.73510258417809D,
                8.87133526499383D,
                18.8807416886993D,
                -13.6366940953303D,
                13.2655873383192D,
                15.0578864829731D,
                22.3125664686086D,
                -8.08979050110793D,
                -5.70778294990305D,
                16.9007603542559D,
                -0.896355888689868D,
                8.95117298365221D,
                -0.779510830616346D,
                7.06490712014784D,
                36.1295463133137D,
                28.0323286258499D,
                -10.5694004762336D,
                14.06980234402D,
                26.2344281307014D,
                26.8029282576754D,
                17.7398988196364D,
                2.81361510838906D,
                2.5737029116135D,
                5.20497794947005D,
                -7.09845491859596D,
                -2.32504018844338D,
                -6.85116785665741D,
                1.14436832314823D,
                3.42682258429704D,
                -0.159194516745629D,
                24.9408265315287D,
                2.56550040627189D,
                19.2549089458771D,
                -2.6133249978011D,
                -8.2429039341514D,
                10.9759946806298D,
                6.10218424095365D
        };

貼一張效果圖(非上面的數據,數據庫真實數據):(echarts畫圖)
這裏寫圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章