最近在看《计算机视觉:模型学习与推理》,第六章中介绍了一种算法,可用于设计一个简单的通用分类器。
设计思路基本如下:
写一个数据生成器,产生三种label的数据,每个label分别服从一个正态分布。label1:label2:label3=3:7:5;以同样的比例在生成的数据中采样十分之一作为测试数据集。这样可以保证,三个分类的比例不变。并将数据存储到指定位置的文件中。
代码如下:
// Generate a synthetic 1-D data set for the generative classifier and
// write it to two text files ("<value> <label>" per line).
// Three labels, each drawn from its own normal distribution:
//   label1: 700 samples, mu = 0, var = 3
//   label2: 300 samples, mu = 3, var = 1
//   label3: 500 samples, mu = 9, var = 10
// One tenth of each label is held out as the test set, so both the
// training and test sets keep the 7:3:5 label ratio.
void generate_data_for_learning()
{
	//set hyperparameters
	double mu1 = 0;
	double var1 = 3;
	double mu2 = 3;
	double var2 = 1;
	double mu3 = 9;
	double var3 = 10;
	int label_1_size = 700;
	int label_2_size = 300;
	int label_3_size = 500;
	std::random_device rd{};
	std::mt19937 gen{ rd() };
	// FIX: std::normal_distribution's second constructor argument is the
	// STANDARD DEVIATION, not the variance. Pass sqrt(var) so the samples
	// really have the variances documented above.
	std::normal_distribution<> d1{ mu1, std::sqrt(var1) };
	std::normal_distribution<> d2{ mu2, std::sqrt(var2) };
	std::normal_distribution<> d3{ mu3, std::sqrt(var3) };
	vector<std::pair<int, double>> train_data, test_data;
	// The first 10% of each label goes to the test set; the rest trains.
	// (Derive the split point from the label size instead of the magic
	// numbers 70/30/50, and drop the dead `int i = 1;` local.)
	for (int i = 0; i < label_1_size; i++)
	{
		if (i < label_1_size / 10)
			test_data.push_back(std::make_pair(1, d1(gen)));
		else
			train_data.push_back(std::make_pair(1, d1(gen)));
	}
	for (int i = 0; i < label_2_size; i++)
	{
		if (i < label_2_size / 10)
			test_data.push_back(std::make_pair(2, d2(gen)));
		else
			train_data.push_back(std::make_pair(2, d2(gen)));
	}
	for (int i = 0; i < label_3_size; i++)
	{
		if (i < label_3_size / 10)
			test_data.push_back(std::make_pair(3, d3(gen)));
		else
			train_data.push_back(std::make_pair(3, d3(gen)));
	}
	std::string train_file_name = "E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/train_data.txt";
	std::string test_file_name="E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/test_data.txt";
	std::fstream train_file(train_file_name,std::ios_base::out);
	std::fstream test_file(test_file_name,std::ios_base::out);
	// FIX: write out the actual container contents instead of the
	// hard-coded 150/1350 counts, so the sizes above can be changed safely.
	for (std::size_t i = 0; i < test_data.size(); i++)
	{
		test_file << test_data[i].second << " " << test_data[i].first << std::endl;
	}
	for (std::size_t i = 0; i < train_data.size(); i++)
	{
		train_file << train_data[i].second << " " << train_data[i].first << std::endl;
	}
	train_file.close();
	test_file.close();
}
学习的算法流程如下:
根据本文生成的数据C++实现如下:首先是工具函数
max_id------ 返回vector中最大元素的索引;
extract_data_from_string------解析生成的数据文件中的数据,返回可供机器学习算法使用的数据格式;
normal_distribution_probability------生成指定参数的正态分布的概率密度值
load_data------载入数据
// Return the index of the largest element of `input`.
// Precondition: `input` is non-empty (checked by assert).
// FIX: the original never updated the running maximum `tmp`, so it
// returned the index of the LAST element greater than input[0] rather
// than the index of the true maximum (e.g. {1, 5, 3} returned 2, not 1).
template<typename T>
int max_id(std::vector<T> &input)
{
	assert(input.size()>0);
	int index=0;
	T tmp=input[0];
	for (std::size_t i = 1; i < input.size(); i++)
	{
		if (input[i] > tmp)
		{
			tmp = input[i];  // track the current maximum
			index = static_cast<int>(i);
		}
	}
	return index;
}
// Parse one "<value> <label>" line produced by the data generator and
// return {value, label}.
// FIX: the original only started collecting the label after the THIRD
// space (count == 3), but the generator writes exactly one space between
// the two fields, so `label_s` stayed empty and std::stoi threw
// std::invalid_argument. Stream extraction splits on any run of
// whitespace, handling both single and repeated separators.
static std::pair<double, int> extract_data_from_string(std::string &s)
{
	std::istringstream iss(s);
	double num = 0.0;
	int label = 0;
	iss >> num >> label;
	return std::make_pair(num, label);
}
// Probability density of the normal distribution N(mu, var) at x.
// Note: `var` is the VARIANCE (sigma^2), not the standard deviation.
// FIX: use a local full-precision pi constant instead of the file-level
// `pi` of unknown precision, and fold sqrt(2*pi)*sqrt(var) into a single
// sqrt(2*pi*var).
double normal_distribution_probability(double x, double mu, double var)
{
	constexpr double kPi = 3.14159265358979323846;
	const double diff = x - mu;
	return std::exp(-diff * diff / (2.0 * var)) / std::sqrt(2.0 * kPi * var);
}
// Load the training and test sets written by generate_data_for_learning().
// Each line of the files is "<value> <label>"; parsed pairs are appended
// to the output vectors.
// FIX: read until EOF instead of assuming exactly 1350 training and 150
// test lines (magic numbers duplicated from the generator), and skip
// blank lines, so the loader works for any data-set size.
// NOTE(review): file paths are hard-coded; no error is reported if a
// file fails to open -- the output vectors are simply left empty.
void load_data(vector<std::pair<double, int>> &train_data, vector<std::pair<double, int>> &test_data)
{
	std::ifstream train_file("E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/train_data.txt");
	std::ifstream test_file("E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/test_data.txt");
	std::string s;
	while (std::getline(train_file, s))
	{
		if (s.empty())
			continue;
		train_data.push_back(extract_data_from_string(s));
	}
	while (std::getline(test_file, s))
	{
		if (s.empty())
			continue;
		test_data.push_back(extract_data_from_string(s));
	}
}
学习和预测算法实现代码如下:
void basic_generative_classifier()
{
vector<std::pair<double, int>> train_data, test_data;
load_data(train_data, test_data);
double mu1 ;
double var1 ;
double mu2 ;
double var2 ;
double mu3 ;
double var3 ;
double sum_mu1{}, sum_var1{}, sum_mu2{}, sum_var2{}, sum_mu3{}, sum_var3{};
//compute mean
for (int i = 0; i < 630; i++)
{
sum_mu1 += train_data[i].first;
}
mu1 = sum_mu1 / 630;
for (int i = 630; i < 900; i++)
{
sum_mu2 += train_data[i].first;
}
mu2 = sum_mu2 / 270;
for (int i = 900; i < 1350; i++)
{
sum_mu3 += train_data[i].first;
}
mu3 = sum_mu3 / 450;
//compute variance
for (int i = 0; i < 630; i++)
{
sum_var1 += (train_data[i].first-mu1)*(train_data[i].first - mu1);
}
var1 = sum_var1 / 630;
for (int i = 630; i < 900; i++)
{
sum_var2 += (train_data[i].first - mu2)*(train_data[i].first - mu2);
}
var2 = sum_var2 / 270;
for (int i = 900; i < 1350; i++)
{
sum_var3 += (train_data[i].first - mu3)*(train_data[i].first - mu3);
}
var3 = sum_var3 / 450;
double lambda1, lambda2, lambda3;
lambda1 = 630.0 / 1350.0;
lambda2 = 270.0 / 1350.0;
lambda3 = 450.0 / 1350.0;
vector<vector<double>> max_likelihood_probability_vector;
for (int i = 0; i < 150; i++)
{
double x = test_data[i].first;
vector<double> l;
for (int j = 0; j < 3; j++)
{
if (j == 0)
{
double p = normal_distribution_probability(x,mu1,var1);
l.push_back(p);
}
else if (j == 1)
{
double p = normal_distribution_probability(x, mu2, var2);
l.push_back(p);
}
else
{
double p = normal_distribution_probability(x, mu3, var3);
l.push_back(p);
}
}
max_likelihood_probability_vector.push_back(l);
}
vector<int> predection;
for (int i = 0; i < 150; i++)
{
vector<double> predic;
double total = 0;
for (int k = 0; k < 3; k++)
{
if (k == 0)
total += max_likelihood_probability_vector[i][k] * lambda1;
else if (k == 1)
total += max_likelihood_probability_vector[i][k] * lambda2;
else
total += max_likelihood_probability_vector[i][k] * lambda3;
}
for (int j = 0; j < 3; j++)
{
if (j == 0)
{
double q = max_likelihood_probability_vector[i][j] * lambda1;
double f = q / total;
predic.push_back((max_likelihood_probability_vector[i][j] * lambda1) / total);
}
else if (j == 1)
{
double q = max_likelihood_probability_vector[i][j] * lambda2;
double f = q / total;
predic.push_back((max_likelihood_probability_vector[i][j] * lambda2) / total);
}
else
{
double q = max_likelihood_probability_vector[i][j] * lambda3;
double f = q / total;
predic.push_back((max_likelihood_probability_vector[i][j] * lambda3) / total);
}
}
predection.push_back(max_id<double>(predic) + 1);
}
for (int i = 0; i < 150; i++)
{
cout << i + 1 << " : " << predection[i] << endl;
}
}
本篇博客代码的缺陷是产生的label数目是指定死的,受限于时间关系没来得及改成更通用的代码,可以接受任意的分布数量及其训练学习,数据生成,是为缺憾!
算法说明:如果label的分布差异较大的时候分类器的分类精度较高。原因也很简单,本篇博客使用的是一元正态分布数据,只有一个特征量,一旦label之间的分布存在较大重合,重合部分是很难正确分开的。