最近在看計算機視覺:模型學習與推理,第六章中介紹一種算法,設計一個簡單的通用分類器。
設計思路基本如下:
寫一個數據生成器,產生三種label的數據,每個label分別服從一個正態分佈。label1:label2:label3=7:3:5(即700:300:500);以同樣的比例在生成的數據中採樣十分之一作爲測試數據集。這樣可以保證,三個分類的比例不變。並將數據存儲到指定位置的文件中。
代碼如下:
void generate_data_for_learning()
{
	// Build a synthetic data set for the classification algorithm.
	// Three labels with per-label sample counts 700/300/500; each label is
	// drawn from its own normal distribution:
	//   label1 : mu 0, var 3
	//   label2 : mu 3, var 1
	//   label3 : mu 9, var 10
	// For each label the first 10% of samples go to the test set, so the
	// train:test split is 9:1 with the class proportions preserved.
	// Samples are written as "<value> <label>" lines to the two files below.
	const double mu1 = 0, var1 = 3;
	const double mu2 = 3, var2 = 1;
	const double mu3 = 9, var3 = 10;
	const int label_1_size = 700;
	const int label_2_size = 300;
	const int label_3_size = 500;
	std::random_device rd{};
	std::mt19937 gen{ rd() };
	// BUGFIX: std::normal_distribution's second constructor parameter is the
	// STANDARD DEVIATION, not the variance. Pass sqrt(var) so the generated
	// data actually has the documented variance (the learner later estimates
	// the variance, so the two must agree).
	std::normal_distribution<> d1{ mu1, std::sqrt(var1) };
	std::normal_distribution<> d2{ mu2, std::sqrt(var2) };
	std::normal_distribution<> d3{ mu3, std::sqrt(var3) };
	std::vector<std::pair<int, double>> train_data, test_data;
	// Draw `count` samples for `label`; the first count/10 become test data.
	auto draw = [&](int label, int count, std::normal_distribution<> &d)
	{
		const int test_count = count / 10;
		for (int i = 0; i < count; i++)
		{
			auto &dst = (i < test_count) ? test_data : train_data;
			dst.push_back(std::make_pair(label, d(gen)));
		}
	};
	draw(1, label_1_size, d1);
	draw(2, label_2_size, d2);
	draw(3, label_3_size, d3);
	std::string train_file_name = "E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/train_data.txt";
	std::string test_file_name = "E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/test_data.txt";
	std::fstream train_file(train_file_name, std::ios_base::out);
	std::fstream test_file(test_file_name, std::ios_base::out);
	// Write every collected sample (no hard-coded counts) as "<value> <label>".
	for (const auto &s : test_data)
		test_file << s.second << " " << s.first << "\n";
	for (const auto &s : train_data)
		train_file << s.second << " " << s.first << "\n";
	train_file.close();
	test_file.close();
}
學習的算法流程如下:
根據本文生成的數據C++實現如下:首先是工具函數
max_id------ 返回vector中最大元素的索引;
extract_data_from_string------解析生成的數據文件中的數據,返回可供機器學習算法使用的數據格式;
normal_distribution_probability------生成指定參數的正態分佈的概率密度值
load_data------載入數據
/// @brief Return the index of the maximum element of a non-empty vector.
/// On ties, the first (lowest-index) maximum wins.
/// BUGFIX: the original never updated `tmp` after finding a larger element,
/// so it returned the index of the LAST element greater than input[0]
/// (e.g. {1,5,3,4} -> 3) instead of the index of the maximum (-> 1).
template<typename T>
int max_id(std::vector<T> &input)
{
	assert(input.size() > 0);
	int index = 0;
	T best = input[0];
	for (std::size_t i = 1; i < input.size(); i++)
	{
		if (input[i] > best)
		{
			best = input[i];   // track the running maximum, not just its index
			index = static_cast<int>(i);
		}
	}
	return index;
}
/// @brief Parse one data-file line of the form "<value> <label>" into
/// (value, label).
/// @throws std::invalid_argument if the line is not "<number> <integer>".
/// BUGFIX: the generator writes exactly ONE space between value and label,
/// but the original parser only switched to the label field after the THIRD
/// space (count == 3), so the label digits were appended to the number
/// string and std::stoi was called on an empty string. Parse with
/// stod's end-position instead.
static std::pair<double, int> extract_data_from_string(std::string &s)
{
	std::size_t pos = 0;
	double num = std::stod(s, &pos);        // consumes the leading number
	int label = std::stoi(s.substr(pos));   // skips the separating blank(s)
	return std::make_pair(num, label);
}
/// @brief Probability density of a univariate normal distribution
/// N(mu, var) evaluated at x: exp(-(x-mu)^2 / (2 var)) / (sqrt(2 pi) sigma).
/// @param x   evaluation point
/// @param mu  mean of the distribution
/// @param var variance (NOT standard deviation) of the distribution
double normal_distribution_probability(double x, double mu, double var)
{
	const double diff = x - mu;
	const double norm_const = 1.0 / (sqrt(2.0 * pi) * sqrt(var));
	return norm_const * exp(-diff * diff / (2.0 * var));
}
void load_data(vector<std::pair<double, int>> &train_data, vector<std::pair<double, int>> &test_data)
{
std::ifstream train_file("E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/train_data.txt");
std::ifstream test_file("E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/test_data.txt");
std::pair<double, int> p;
std::string s;
for (int i = 0; i < 1350; i++)
{
std::getline(train_file, s);
p = extract_data_from_string(s);
train_data.push_back(p);
}
for (int i = 0; i < 150; i++)
{
std::getline(test_file, s);
p = extract_data_from_string(s);
test_data.push_back(p);
}
}
學習和預測算法實現代碼如下:
/// @brief Train a basic generative (naive Bayes with univariate Gaussian
/// class-conditionals) classifier on the generated data and print the
/// predicted label (1..3) for each of the test samples.
/// The training file is written grouped by label: rows [0,630) are label 1,
/// [630,900) label 2, [900,1350) label 3 — the class boundaries below rely
/// on that ordering.
void basic_generative_classifier()
{
	std::vector<std::pair<double, int>> train_data, test_data;
	load_data(train_data, test_data);

	// Per-class row ranges in train_data (see note above).
	const int class_begin[3] = { 0, 630, 900 };
	const int class_end[3] = { 630, 900, 1350 };
	const double total_train = 1350.0;

	double mu[3], var[3], lambda[3];
	for (int c = 0; c < 3; c++)
	{
		const int n = class_end[c] - class_begin[c];
		// Maximum-likelihood estimate of the class mean.
		double sum = 0.0;
		for (int i = class_begin[c]; i < class_end[c]; i++)
			sum += train_data[i].first;
		mu[c] = sum / n;
		// Maximum-likelihood (biased, divide-by-n) estimate of the variance.
		double sq = 0.0;
		for (int i = class_begin[c]; i < class_end[c]; i++)
		{
			const double d = train_data[i].first - mu[c];
			sq += d * d;
		}
		var[c] = sq / n;
		// Class prior = fraction of training samples in this class.
		lambda[c] = n / total_train;
	}

	// Posterior p(c|x) ∝ p(x|c) * lambda_c; predict the argmax class.
	std::vector<int> prediction;
	for (std::size_t i = 0; i < test_data.size(); i++)
	{
		const double x = test_data[i].first;
		std::vector<double> posterior;
		double total = 0.0;
		for (int c = 0; c < 3; c++)
		{
			const double weighted = normal_distribution_probability(x, mu[c], var[c]) * lambda[c];
			posterior.push_back(weighted);
			total += weighted;
		}
		// Normalization does not change the argmax but keeps the values
		// interpretable as probabilities, matching the original output.
		for (int c = 0; c < 3; c++)
			posterior[c] /= total;
		prediction.push_back(max_id<double>(posterior) + 1);   // labels are 1-based
	}

	for (std::size_t i = 0; i < prediction.size(); i++)
	{
		std::cout << i + 1 << " : " << prediction[i] << std::endl;
	}
}
本篇博客代碼的缺陷是產生的label數目是指定死的,受限於時間關係沒來得及改成更通用的代碼,可以接受任意的分佈數量及其訓練學習,數據生成,是爲缺憾!
算法說明:如果label的分佈差異較大的時候分類器的分類精度較高。原因也很簡單,本篇博客使用的是一元正態分佈數據,只有一個特徵量,一旦label之間的分佈存在較大重合,重合部分是很難正確分開的。