K-S 檢驗即Kolmogorov-Smirnov Test
:
The Kolmogorov-Smirnov test (KS-test) tries to determine if two datasets differ significantly. The KS-test has the advantage of making no assumption about the distribution of data.
Kolmogorov-Smirnov檢驗(KS檢驗)試圖確定兩個數據集是否顯着不同。 KS檢驗具有不假設數據分佈的優點。
通常用K-S test 來檢驗 一組數據是否符合某種分佈特徵,比如 正態分佈。
正態分佈的K-S Test
思路就是先假設原數據符合正態分佈,算出原始數據的實際累計分佈,然後算出理論累計分佈,設置一個顯著性水平,根據樣本數量得出D值(臨界值),與理論分佈和實際分佈的最大離差相比較,若最大離差小於D值,則說明接受原假設,等於或大於D值,則拒絕原假設。看圖說話:
圖中紅色垂直距離即爲最大離差。
D值的取值如下表:
其中 n 爲樣本數量,a 爲顯著性水平,一般情況下,a 取值 0.05,即置信水平爲 95%
若最大離差小於按樣本數量從該表查出的D值,說明2組數據在分佈上沒有顯著差異,當然這是一個相對的概念,可根據需求選取顯著性水平的值。
K-S Test 的 Java 實現
一般情況下,在 R 或 Matlab ,python 中,其實幾行代碼就可以完成 k-s 驗證,可惜我都不會寫哈(可能以後會去學)。。。尷尬,所以自己實現了一個 Java 版本,如下:
package com.kay.algs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
* Created by kay on 2018/1/30.
*/
/**
 * Normality check based on the one-sample Kolmogorov-Smirnov statistic.
 *
 * <p>{@link #KSTest(Double[])} computes descriptive statistics for a sample, compares the
 * sample's empirical cumulative distribution with a normal distribution whose mean and
 * standard deviation are estimated from that same sample, and reports whether the largest
 * gap stays below the critical value for a significance level of 0.05.
 *
 * <p>NOTE(review): the critical values in {@code getD} are the plain one-sample K-S values;
 * because the normal parameters are estimated from the sample, a Lilliefors-corrected table
 * would be stricter. Confirm whether that matters for callers.
 */
public class GaussCheck {
    private static final Logger logger = LoggerFactory.getLogger(GaussCheck.class);

    /** Beyond this many standard deviations the normal CDF is treated as exactly 0 or 1. */
    private static final double CDF_TAIL_CUTOFF = 8.0;

    /** Number of Simpson sub-intervals used for the normal CDF; must be even. */
    private static final int CDF_INTERVALS = 2000;

    private boolean isGauss;
    private StatisticConstant statisticConstant;

    /**
     * @param isGauss           whether the sample was accepted as normally distributed
     * @param statisticConstant descriptive statistics of the sample
     */
    public GaussCheck(boolean isGauss, StatisticConstant statisticConstant) {
        this.isGauss = isGauss;
        this.statisticConstant = statisticConstant;
    }

    public GaussCheck() {
    }

    public StatisticConstant getStatisticConstant() {
        return statisticConstant;
    }

    public void setStatisticConstant(StatisticConstant statisticConstant) {
        this.statisticConstant = statisticConstant;
    }

    public boolean isGauss() {
        return isGauss;
    }

    public void setGauss(boolean gauss) {
        isGauss = gauss;
    }

    /**
     * Runs the normality check on a sample and collects its descriptive statistics.
     *
     * <p>The input array is not modified; a sorted copy is used for the order statistics
     * (median, quartiles, min, max) and for the empirical distribution.
     *
     * @param list the sample; may be in any order
     * @return the verdict plus statistics. A {@code null} or empty sample yields
     *         {@code isGauss == false} with a default {@link StatisticConstant}. A sample whose
     *         standard deviation is zero or not a finite number (single value, all values
     *         equal, NaN/infinite input) also yields {@code isGauss == false}, because the
     *         standardised deviation is undefined in that case.
     * @throws NullPointerException if any element of {@code list} is {@code null}
     */
    public static GaussCheck KSTest(Double[] list) {
        logger.info("***********************ks-test**********************");
        if (list == null || list.length == 0) {
            return new GaussCheck(false, new StatisticConstant());
        }
        int length = list.length;

        // Unbox once; keep the input order for the sums and a sorted copy for order statistics.
        double[] values = new double[length];
        double sum = 0.0;
        for (int i = 0; i < length; i++) {
            values[i] = list[i];
            sum += values[i];
        }
        double[] sorted = values.clone();
        Arrays.sort(sorted);

        double upperQuartile = sorted[length * 3 / 4];
        double downQuartile = sorted[length / 4];
        double median = (length % 2 == 0)
                ? (sorted[length / 2 - 1] + sorted[length / 2]) / 2.0
                : sorted[length / 2];
        double minNum = sorted[0];
        double maxNum = sorted[length - 1];
        double range = maxNum - minNum;

        double avgNum = sum / length;
        double squaredDiffSum = 0.0;
        for (int i = 0; i < length; i++) {
            double diff = values[i] - avgNum;
            squaredDiffSum += diff * diff;
        }
        double var = squaredDiffSum / (length - 1); // sample variance; NaN for a single value
        double deviation = Math.sqrt(var);
        double variation = deviation / avgNum;      // coefficient of variation

        // The comparison is only defined for a positive, finite standard deviation.
        boolean testable = deviation > 0.0 && !Double.isInfinite(deviation);

        double modeNum = sorted[0];
        int modeCount = 0;
        double maxDeviation = 0.0;

        // Walk the sorted sample one run of equal values at a time. At the start of a run,
        // `start` samples are strictly below the value and `end` samples are at or below it.
        int start = 0;
        while (start < length) {
            int end = start + 1;
            while (end < length && Double.compare(sorted[end], sorted[start]) == 0) {
                end++;
            }

            int count = end - start;
            if (count > modeCount) { // strict: ties keep the smallest value as the mode
                modeCount = count;
                modeNum = sorted[start];
            }

            if (testable) {
                double theoretical = normalCdf((sorted[start] - avgNum) / deviation);
                // The empirical CDF jumps at this value, so check both sides of the step.
                double gapBelow = Math.abs(theoretical - start / (double) length);
                double gapAbove = Math.abs(end / (double) length - theoretical);
                if (gapBelow > maxDeviation) {
                    maxDeviation = gapBelow;
                }
                if (gapAbove > maxDeviation) {
                    maxDeviation = gapAbove;
                }
            }
            start = end;
        }

        boolean isGauss = testable && checkGauss(getD(length), maxDeviation);

        StatisticConstant statisticConstant = new StatisticConstant(avgNum, median, modeNum,
                upperQuartile, downQuartile, var, deviation, variation, range, maxNum, minNum);
        return new GaussCheck(isGauss, statisticConstant);
    }

    /**
     * Samples the normal density curve for plotting.
     *
     * <p>Points are generated on {@code [u - 3*sima, u + 3*sima)} with a step of
     * {@code |u| / 10}; when {@code u} is zero the step falls back to {@code sima / 10} so the
     * loop always advances. A mean that is very small relative to {@code sima} therefore
     * produces a very large number of points.
     *
     * @param u    mean of the distribution
     * @param sima standard deviation of the distribution
     * @return a map with key {@code "x"} (abscissas) and key {@code "y"} (density values), both
     *         {@code Double[]} of equal length; the arrays are empty when no positive step can
     *         be derived or the interval is empty. The raw {@code Map} return type is kept for
     *         source compatibility with existing callers.
     */
    public static Map drawData(Double u, Double sima) {
        double mean = u;
        double sigma = sima;
        double upper = mean + 3 * sigma;

        double step = Math.abs(mean) / 10;
        if (!(step > 0.0)) {
            step = sigma / 10;
        }

        List<Double> xs = new ArrayList<>();
        List<Double> ys = new ArrayList<>();
        if (step > 0.0) {
            for (double t = mean - 3 * sigma; t < upper; t += step) {
                xs.add(t);
                ys.add(densityFunc(t, mean, sigma));
            }
        }

        Map<String, Double[]> dataMap = new HashMap<>();
        dataMap.put("x", xs.toArray(new Double[0])); // horizontal axis
        dataMap.put("y", ys.toArray(new Double[0])); // vertical axis
        return dataMap;
    }

    /**
     * Decides the test at the default significance level of 0.05.
     *
     * @param d            critical value for the sample size
     * @param maxDeviation largest gap between theoretical and empirical cumulative probability
     * @return {@code true} when the gap is strictly below the critical value (the normality
     *         hypothesis is kept), {@code false} otherwise
     */
    private static boolean checkGauss(double d, double maxDeviation) {
        return maxDeviation < d;
    }

    /**
     * Looks up the critical value for a significance level of 0.05.
     *
     * <p>Sample sizes are grouped into buckets and each bucket uses a single tabulated value;
     * above 50 samples the value is {@code 1.36 / sqrt(n)}.
     *
     * @param n sample size
     * @return the critical value, or {@code -0.1} when {@code n} is not positive
     */
    private static double getD(int n) {
        if (n <= 0) {
            return -0.1;
        }
        if (n <= 5) {
            return 0.562;
        }
        if (n <= 10) {
            return 0.409;
        }
        if (n <= 20) {
            return 0.294;
        }
        if (n <= 30) {
            return 0.242;
        }
        if (n <= 50) {
            return 0.189;
        }
        return 1.36 / Math.sqrt(n);
    }

    /**
     * Standard normal cumulative distribution function.
     *
     * <p>Integrates the density from 0 to {@code |z|} with Simpson's rule and uses the symmetry
     * of the distribution, so the step size stays small regardless of {@code z}.
     *
     * @param z standardised value
     * @return probability that a standard normal variable is at most {@code z}
     */
    private static double normalCdf(double z) {
        if (z <= -CDF_TAIL_CUTOFF) {
            return 0.0;
        }
        if (z >= CDF_TAIL_CUTOFF) {
            return 1.0;
        }
        double upper = Math.abs(z);
        double h = upper / CDF_INTERVALS;
        double weighted = standardDensity(0.0) + standardDensity(upper);
        for (int i = 1; i < CDF_INTERVALS; i++) {
            double weight = (i % 2 == 1) ? 4.0 : 2.0;
            weighted += weight * standardDensity(i * h);
        }
        double halfArea = weighted * h / 3.0;
        return z >= 0 ? 0.5 + halfArea : 0.5 - halfArea;
    }

    /**
     * Standard normal density (the integrand of the CDF).
     *
     * @param t point at which to evaluate the density
     * @return density of the standard normal distribution at {@code t}
     */
    private static double standardDensity(double t) {
        return 1 / Math.sqrt(2 * Math.PI) * Math.exp(-t * t / 2);
    }

    /**
     * Normal probability density function.
     *
     * @param x     point at which to evaluate the density
     * @param u     mean (expected value)
     * @param sigma standard deviation
     * @return density of the normal distribution with the given parameters at {@code x}
     */
    private static double densityFunc(double x, double u, double sigma) {
        return 1.0 / (Math.sqrt(2 * Math.PI) * sigma)
                * Math.exp(-(x - u) * (x - u) / (2 * sigma * sigma));
    }
}
相關常量對象:
package com.kay.algs;
import java.io.Serializable;
/**
* Created by kay on 2018/1/30.
* 常用統計量
*/
/**
 * Mutable value holder for the common descriptive statistics of a sample.
 *
 * <p>All fields default to {@code 0.0} when the no-argument constructor is used.
 */
public class StatisticConstant implements Serializable {

    private static final long serialVersionUID = 1256139578174896657L;

    /** Arithmetic mean. */
    private double mean;
    /** Median. */
    private double median;
    /** Mode (most frequent value). */
    private double mode;
    /** Upper quartile. */
    private double upperQuartile;
    /** Lower quartile. */
    private double downQuartile;
    /** Variance. */
    private double var;
    /** Standard deviation. */
    private double deviation;
    /** Coefficient of variation. */
    private double variation;
    /** Range (max minus min). */
    private double range;
    /** Largest value. */
    private double max;
    /** Smallest value. */
    private double min;

    // Not yet included: skewness, kurtosis.

    /** Creates an instance with every statistic left at its default value. */
    public StatisticConstant() {
    }

    /**
     * Creates an instance with every statistic supplied.
     *
     * @param mean          arithmetic mean
     * @param median        median
     * @param mode          mode
     * @param upperQuartile upper quartile
     * @param downQuartile  lower quartile
     * @param var           variance
     * @param deviation     standard deviation
     * @param variation     coefficient of variation
     * @param range         range
     * @param max           largest value
     * @param min           smallest value
     */
    public StatisticConstant(double mean, double median, double mode,
                             double upperQuartile, double downQuartile,
                             double var, double deviation, double variation,
                             double range, double max, double min) {
        this.mean = mean;
        this.median = median;
        this.mode = mode;
        this.upperQuartile = upperQuartile;
        this.downQuartile = downQuartile;
        this.var = var;
        this.deviation = deviation;
        this.variation = variation;
        this.range = range;
        this.max = max;
        this.min = min;
    }

    /** @return the arithmetic mean */
    public double getMean() {
        return this.mean;
    }

    /** @param mean the arithmetic mean */
    public void setMean(double mean) {
        this.mean = mean;
    }

    /** @return the median */
    public double getMedian() {
        return this.median;
    }

    /** @param median the median */
    public void setMedian(double median) {
        this.median = median;
    }

    /** @return the mode */
    public double getMode() {
        return this.mode;
    }

    /** @param mode the mode */
    public void setMode(double mode) {
        this.mode = mode;
    }

    /** @return the upper quartile */
    public double getUpperQuartile() {
        return this.upperQuartile;
    }

    /** @param upperQuartile the upper quartile */
    public void setUpperQuartile(double upperQuartile) {
        this.upperQuartile = upperQuartile;
    }

    /** @return the lower quartile */
    public double getDownQuartile() {
        return this.downQuartile;
    }

    /** @param downQuartile the lower quartile */
    public void setDownQuartile(double downQuartile) {
        this.downQuartile = downQuartile;
    }

    /** @return the variance */
    public double getVar() {
        return this.var;
    }

    /** @param var the variance */
    public void setVar(double var) {
        this.var = var;
    }

    /** @return the standard deviation */
    public double getDeviation() {
        return this.deviation;
    }

    /** @param deviation the standard deviation */
    public void setDeviation(double deviation) {
        this.deviation = deviation;
    }

    /** @return the coefficient of variation */
    public double getVariation() {
        return this.variation;
    }

    /** @param variation the coefficient of variation */
    public void setVariation(double variation) {
        this.variation = variation;
    }

    /** @return the range */
    public double getRange() {
        return this.range;
    }

    /** @param range the range */
    public void setRange(double range) {
        this.range = range;
    }

    /** @return the largest value */
    public double getMax() {
        return this.max;
    }

    /** @param max the largest value */
    public void setMax(double max) {
        this.max = max;
    }

    /** @return the smallest value */
    public double getMin() {
        return this.min;
    }

    /** @param min the smallest value */
    public void setMin(double min) {
        this.min = min;
    }
}
驗證
可以使用 Excel 的數據分析功能自動生成一組正態分佈數據進行驗證,經多次驗證通過該算法。
貼一組Excel自動生成的正態分佈數據:
// Sample input for GaussCheck.KSTest: 60 values which, per the accompanying text, were
// generated by Excel as normally distributed data. The values are listed in generation
// order, i.e. they are NOT sorted.
Double[] list=new Double[]{
-26.9155895896255D,
23.131279956724D,
14.8355025228375D,
-3.3484172692988D,
18.076142421487D,
16.0152001424285D,
3.18725031116628D,
-26.3367144018411D,
24.1486862048623D,
24.6650745591614D,
-4.70648612666992D,
24.789752640354D,
8.91525931161596D,
12.0637207853724D,
-8.57632469182136D,
3.41213197112666D,
2.25297756311193D,
39.0965181193314D,
17.0431588028441D,
4.81745021512324D,
9.09479811321943D,
-2.40005985891912D,
16.8641520556412D,
-3.73510258417809D,
8.87133526499383D,
18.8807416886993D,
-13.6366940953303D,
13.2655873383192D,
15.0578864829731D,
22.3125664686086D,
-8.08979050110793D,
-5.70778294990305D,
16.9007603542559D,
-0.896355888689868D,
8.95117298365221D,
-0.779510830616346D,
7.06490712014784D,
36.1295463133137D,
28.0323286258499D,
-10.5694004762336D,
14.06980234402D,
26.2344281307014D,
26.8029282576754D,
17.7398988196364D,
2.81361510838906D,
2.5737029116135D,
5.20497794947005D,
-7.09845491859596D,
-2.32504018844338D,
-6.85116785665741D,
1.14436832314823D,
3.42682258429704D,
-0.159194516745629D,
24.9408265315287D,
2.56550040627189D,
19.2549089458771D,
-2.6133249978011D,
-8.2429039341514D,
10.9759946806298D,
6.10218424095365D
};
貼一張效果圖(非上面的數據,數據庫真實數據):(echarts畫圖)