概述
到目前爲止,我們已經把所有Caffe的基本模塊從最基本的SyncedMemory到最上層的Solver都過了一遍。那麼Caffe最後是怎麼把他們串在一起的呢?這一篇主要講解Caffe從main函數開始是怎麼完成整個訓練過程和測試過程的。
Caffe支持的命令
- train: 訓練或者調整一個模型
- test : 在測試集上測試一個模型
- device_query : 打印GPU的調試信息
- time: 壓測一個模型的執行時間
Train函數
// Entry point for the "train" command: build a solver from FLAGS_solver,
// optionally resume/initialize from a snapshot or pretrained weights, and run
// optimization on CPU, one GPU, or several GPUs.
int train() {
  vector<string> stages = get_stages_from_flags();
  caffe::SolverParameter solver_param;
  // Parse the solver configuration from the FLAGS_solver prototxt file.
  caffe::ReadSolverParamsFromTextFileOrDie(FLAGS_solver, &solver_param);
  solver_param.mutable_train_state()->set_level(FLAGS_level);
  for (int s = 0; s < stages.size(); ++s) {
    solver_param.mutable_train_state()->add_stage(stages[s]);
  }

  vector<int> gpus;
  get_gpus(&gpus);
  if (gpus.empty()) {
    // No GPU requested/available: fall back to CPU mode.
    Caffe::set_mode(Caffe::CPU);
  } else {
    // Use the first GPU of the list as the primary device.
    solver_param.set_device_id(gpus[0]);
    Caffe::SetDevice(gpus[0]);
    Caffe::set_mode(Caffe::GPU);
    Caffe::set_solver_count(gpus.size());
  }

  // Register the actions for SIGINT (default: stop training) and
  // SIGHUP (default: take a snapshot).
  caffe::SignalHandler signal_handler(
      GetRequestedAction(FLAGS_sigint_effect),
      GetRequestedAction(FLAGS_sighup_effect));

  // Instantiate the solver through the SolverFactory registry.
  shared_ptr<caffe::Solver<float> >
      solver(caffe::SolverRegistry<float>::CreateSolver(solver_param));
  solver->SetActionFunction(signal_handler.GetActionFunction());

  if (FLAGS_snapshot.size()) {
    // A snapshot exists: resume training from it.
    solver->Restore(FLAGS_snapshot.c_str());
  } else if (FLAGS_weights.size()) {
    // Pretrained weights were given: copy them into the net before training.
    CopyLayers(solver.get(), FLAGS_weights);
  }

  if (gpus.size() > 1) {
    // More than one GPU: run data-parallel training via P2PSync.
    caffe::P2PSync<float> sync(solver, NULL, solver->param());
    sync.Run(gpus);
  } else {
    // Single device: optimize the network parameters directly.
    solver->Solve();
  }
  return 0;
}
Test函數
int test() {
vector<string> stages = get_stages_from_flags();
vector<int> gpus;
get_gpus(&gpus);
if (gpus.size() != 0) {
Caffe::SetDevice(gpus[0]); // 使用GPU gpus[0]
Caffe::set_mode(Caffe::GPU);
} else {
Caffe::set_mode(Caffe::CPU); // 使用CPU
}
Net<float> caffe_net(FLAGS_model, caffe::TEST, FLAGS_level, &stages); // 創建一個測試網絡
caffe_net.CopyTrainedLayersFrom(FLAGS_weights); // 把訓練好的參數拷貝到網絡裏
vector<int> test_score_output_id;
vector<float> test_score;
float loss = 0;
for (int i = 0; i < FLAGS_iterations; ++i) {
float iter_loss;
const vector<Blob<float>*>& result = caffe_net.Forward(&iter_loss); // 進行前向傳播
loss += iter_loss; // 累計Loss值以求平均值
int idx = 0;
for (int j = 0; j < result.size(); ++j) {
const float* result_vec = result[j]->cpu_data();
for (int k = 0; k < result[j]->count(); ++k, ++idx) {
const float score = result_vec[k];
if (i == 0) {
test_score.push_back(score);
test_score_output_id.push_back(j);
} else {
test_score[idx] += score; // 累計網絡輸出的blob的值以求平均值
}
}
}
}
loss /= FLAGS_iterations; // 除以迭代的次數計算平均值
for (int i = 0; i < test_score.size(); ++i) {
const float loss_weight = caffe_net.blob_loss_weights()[
caffe_net.output_blob_indices()[test_score_output_id[i]]];
const float mean_score = test_score[i] / FLAGS_iterations; // 除以迭代的次數求平均值
if (loss_weight) { // 對blob值進行loss_weight加權
loss_msg_stream << " (* " << loss_weight
<< " = " << loss_weight * mean_score << " loss)";
}
}
return 0;
}
device_query函數
// Entry point for the "device_query" command: print diagnostic information
// for every GPU returned by get_gpus().
int device_query() {
  // get_gpus() uses cudaGetDeviceCount(&count) and yields ids 0..count-1.
  vector<int> gpus;
  get_gpus(&gpus);
  for (int idx = 0; idx < gpus.size(); ++idx) {
    // Select each device in turn, then dump its properties.
    caffe::Caffe::SetDevice(gpus[idx]);
    caffe::Caffe::DeviceQuery();
  }
  return 0;
}
// Make device_id the current CUDA device for this thread, recreating the
// thread-local cuBLAS/cuRAND handles on the new device.
void Caffe::SetDevice(const int device_id) {
  int current_device;
  // FIX: the argument had been garbled to "¤t_device" by an HTML-entity
  // collision (&curren; + "t_device"); it must be &current_device.
  CUDA_CHECK(cudaGetDevice(&current_device));  // GPU id this thread currently uses
  if (current_device == device_id) {  // already on the requested device: done
    return;
  }
  CUDA_CHECK(cudaSetDevice(device_id));  // bind this thread to device_id
  // Get() returns the thread-local Caffe singleton; destroy any handles that
  // were created on the previous device before switching.
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  // Recreate cublas_handle_ and curand_generator_ on the new device and keep
  // them in the thread-local state.
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
// 進行設備的查詢
void Caffe::DeviceQuery() {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); // 獲取設備屬性
// 答應屬性內容,
LOG(INFO) << "Device id: " << device;
LOG(INFO) << "Major revision number: " << prop.major;
LOG(INFO) << "Minor revision number: " << prop.minor;
LOG(INFO) << "Name: " << prop.name;
LOG(INFO) << "Total global memory: " << prop.totalGlobalMem;
LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock;
LOG(INFO) << "Total registers per block: " << prop.regsPerBlock;
LOG(INFO) << "Warp size: " << prop.warpSize;
LOG(INFO) << "Maximum memory pitch: " << prop.memPitch;
LOG(INFO) << "Maximum threads per block: " << prop.maxThreadsPerBlock;
LOG(INFO) << "Maximum dimension of block: "
<< prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
<< prop.maxThreadsDim[2];
LOG(INFO) << "Maximum dimension of grid: "
<< prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
<< prop.maxGridSize[2];
LOG(INFO) << "Clock rate: " << prop.clockRate;
LOG(INFO) << "Total constant memory: " << prop.totalConstMem;
LOG(INFO) << "Texture alignment: " << prop.textureAlignment;
LOG(INFO) << "Concurrent copy and execution: "
<< (prop.deviceOverlap ? "Yes" : "No");
LOG(INFO) << "Number of multiprocessors: " << prop.multiProcessorCount;
LOG(INFO) << "Kernel execution timeout: "
<< (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
return;
}
time函數
int time() {
caffe::Phase phase = get_phase_from_flags(caffe::TRAIN);
vector<string> stages = get_stages_from_flags();
vector<int> gpus;
get_gpus(&gpus);
if (gpus.size() != 0) {
// 使用GPU gpus[0]
Caffe::SetDevice(gpus[0]);
Caffe::set_mode(Caffe::GPU);
} else {
// 使用CPU
Caffe::set_mode(Caffe::CPU);
}
Net<float> caffe_net(FLAGS_model, phase, FLAGS_level, &stages);
float initial_loss;
// 先做一次前向和反向傳播來預先分配好內存,這樣能使測試的結果更加穩定
caffe_net.Forward(&initial_loss); // 做一次前向傳播。因爲是速度測試,網絡沒有輸入
caffe_net.Backward(); //再做一次反向傳播
Timer total_timer, forward_timer, backward_timer, timer;
total_timer.Start();
std::vector<double> forward_time_per_layer(layers.size(), 0.0);
std::vector<double> backward_time_per_layer(layers.size(), 0.0);
double forward_time = 0.0;
double backward_time = 0.0;
// 做FLAGS_iterations次迭代
for (int j = 0; j < FLAGS_iterations; ++j) {
Timer iter_timer;
iter_timer.Start();
forward_timer.Start();
// 對每一層分別做一次前向傳播,並記錄每層傳播的時間
for (int i = 0; i < layers.size(); ++i) {
timer.Start();
layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
forward_time_per_layer[i] += timer.MicroSeconds();
}
forward_time += forward_timer.MicroSeconds();
backward_timer.Start();
// 對每層分別做反向傳播
for (int i = layers.size() - 1; i >= 0; --i) {
timer.Start();
layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
bottom_vecs[i]);
backward_time_per_layer[i] += timer.MicroSeconds();
}
backward_time += backward_timer.MicroSeconds();
}
// 輸出每層前向傳播和反向傳播需要的平均時間
LOG(INFO) << "Average time per layer: ";
for (int i = 0; i < layers.size(); ++i) {
const caffe::string& layername = layers[i]->layer_param().name();
LOG(INFO) << std::setfill(' ') << std::setw(10) << layername <<
"\tforward: " << forward_time_per_layer[i] / 1000 /
FLAGS_iterations << " ms."; // 前向傳播時間
LOG(INFO) << std::setfill(' ') << std::setw(10) << layername <<
"\tbackward: " << backward_time_per_layer[i] / 1000 /
FLAGS_iterations << " ms."; // 反向傳播時間
}
total_timer.Stop();
LOG(INFO) << "Average Forward pass: " << forward_time / 1000 /
FLAGS_iterations << " ms."; // 平均整個網絡的前向傳播時間
LOG(INFO) << "Average Backward pass: " << backward_time / 1000 /
FLAGS_iterations << " ms."; // 整個網絡的平均反向傳播時間
LOG(INFO) << "Average Forward-Backward: " << total_timer.MilliSeconds() /
FLAGS_iterations << " ms."; // 整個網絡的前向加反向傳播時間
LOG(INFO) << "Total Time: " << total_timer.MilliSeconds() << " ms.";
LOG(INFO) << "*** Benchmark ends ***";
return 0;
}