// k-means過程:
// - 從數據集中隨機選取k個點作爲初始center
// - 迭代(直到一定次數或迭代前後無變化):
//   - 計算每個向量最近的center,將其歸爲該類
//   - 計算每一類向量的質心,作爲新的center
// 在此使用歐氏距離
// 測試數據集(68040*32):CorelFeatures-mld/ColorHistogram.asc
#include <iostream>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <vector>
#include <map>
#include <fstream>
using namespace std;
/**
 * @brief Computes the Euclidean distance between two points.
 *
 * The arguments are taken by const reference so that no vector is copied per
 * call. If the two vectors differ in length, only the leading dimensions
 * present in both are compared, so no element is ever read out of range.
 *
 * @param a first point, one coordinate per dimension
 * @param b second point, expected to have as many dimensions as a
 * @return square root of the sum of squared coordinate differences
 */
double calcMeas(const vector<double>& a, const vector<double>& b){
    // Bound the loop by the shorter vector to keep every index valid.
    const size_t dims = a.size() < b.size() ? a.size() : b.size();
    double p = 0;
    for(size_t i=0; i<dims; i++){
        const double d = a[i]-b[i];
        p += d*d;
    }
    return sqrt(p);
}
const int m = 68040; // rows: number of vectors in the data set
const int n = 32;    // columns: number of dimensions per vector
// k and n_calc are never modified; declaring them const lets them serve as
// ordinary (non variable-length) array bounds.
const int k = 10;       // number of clusters
const int n_calc = 100; // maximum number of iterations
int main(){
fstream fs("ColorHistogram.asc");
vector<vector<double> > data;
for(int i=0; i<m; i++){
vector<double> line;
for(int j=0; j<=n; j++){
double t;
fs>>t;
if(j != 0){
line.push_back(t);
}
}
data.push_back(line);
}
fs.close();
cout<<"read data over."<<endl;
//隨機選取k個向量作爲初始中心
vector<double> center[k];
for(int i=0; i<k; i++){
center[i].resize(n);
int rand_index = rand()%m;
for(int j=0; j<n; j++){
center[i][j] = data[rand_index][j];
}
}
cout<<"rand k-center over."<<endl;
vector<int> belong;//每個向量所屬的類別
belong.resize(m);
int pre_num_center[k] = {0};//上次迭代後每個類的向量數
int num_center[k] = {0};//每個類的向量數
for(int nc=0; nc<n_calc; nc++){
cout<<"---------------------------------------"<<endl;
cout<<"calc: "<<nc<<endl;
//根據歐式距離進行分類
for(int i=0; i<m; i++){
belong[i] = 0;
double length = calcMeas(data[i], center[0]);
for(int j=1; j<k; j++){
double meas = calcMeas(data[i], center[j]);
if(meas < length){
length = meas;
belong[i] = j;
}
}
}
cout<<"classify over."<<endl;
//重新計算每個類的質心
vector<double> sum_center[k];
for(int i=0; i<k; i++){
sum_center[i].resize(n);
}
memset(num_center, 0, sizeof(int)*k);
for(int i=0; i<m; i++){
num_center[belong[i]]++;
for(int j=0; j<n; j++){
sum_center[belong[i]][j] += data[i][j];
}
}
for(int i=0; i<k; i++){
for(int j=0; j<n; j++){
center[i][j] = sum_center[i][j]/num_center[i];
}
}
cout<<"recalc center over."<<endl;
//打印
cout<<"num of center:"<<endl;
for(int i=0; i<k; i++){
cout<<i<<"\t"<<num_center[i]<<endl;
}
//是否結束迭代
bool redo = false;
for(int i=0; i<k; i++){
if(pre_num_center[i] != num_center[i]){
redo = true;
}
}
for(int i=0; i<k; i++){
pre_num_center[i] = num_center[i];
}
if(redo == false) break;
}
return 0;
}