1. Problem description: In Caffe I am doing reinforcement-learning image captioning and want to add one extra fully-connected layer to the network (three nets need the layer: the net that trains the baseline, the scst net trained with reinforcement learning, and the decode_net the two share). When training the baseline I added the fully-connected layer; the network learns that layer's parameters and decoding produces results. Training the reinforcement-learning scst net also produces a model without reporting any error, but during decoding I found that the added fully-connected layer never receives values, which causes the failure: the loaded model parameters cannot be passed into the decode network. Printing the model's parameters shows that the added layer has only its name and no concrete values (did training never produce them, or were they simply not saved?). fc_atten is the name of the fully-connected layer I added; the printout shows only the name, with no parameter shape. All the other parameters are shared, so each of them shows a concrete tensor at its first node.
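The kind of parameter dump described above can be reproduced by parsing the snapshot directly with Caffe's protobuf bindings. A minimal sketch; the snapshot filename is a placeholder:
```python
from caffe.proto import caffe_pb2

# Placeholder name; substitute the actual scst snapshot.
model = caffe_pb2.NetParameter()
with open('scst_snapshot.caffemodel', 'rb') as f:
    model.ParseFromString(f.read())

# Each saved layer carries its learnable tensors in layer.blobs;
# a layer that prints an empty list was saved name-only.
for layer in model.layer:
    print(layer.name, [tuple(b.shape.dim) for b in layer.blobs])
```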
2. Related code.
The net used to train the baseline and the decode net are essentially identical in structure. The layer as added in the baseline net:
```
layer {
  name: "fc_atten_0"
  type: "InnerProduct"
  bottom: "lstm0_hidden0"
  top: "fc_atten_0"
  param {
    name: "fc_param_0"
  }
  inner_product_param {
    num_output: 1000
    bias_term: true
    weight_filler {
      type: "gaussian"
      std: 0.00999999977648
    }
  }
}
```
The layer is added the same way in the scst training part. What distinguishes scst from the baseline net is that scst has an extra beamsearch layer and loads the model parameters trained by the baseline net (the two structures are otherwise essentially the same). Since I am training a language model, there are 20 LSTM timesteps in total. In the author's beamsearch layer, the complete net for the first step is placed inside the layer, and the subsequent timesteps (20 in all) share the parameters inside that layer.
```
layer {
  name: "beam"
  type: "BeamSearch"
  bottom: "num_boxes"
  bottom: "spatial_features"
  bottom: "fc"
  bottom: "context"
  top: "caption"
  top: "log_prob"
  top: "log_prob_sequence"
  param {
    name: "embed_param"
  }
  param {
    name: "lstm0_param_0"
  }
  param {
    name: "lstm0_param_1"
  }
  param {
    name: "hidden_att_param_0"
  }
  param {
    name: "predict_att_param_0"
  }
  param {
    name: "lstm1_param_0"
  }
  param {
    name: "lstm1_param_1"
  }
  param {
    name: "fc_param_0"
  }
  param {
    name: "predict_param_0"
  }
  param {
    name: "predict_param_1"
  }
  beam_search_param {
    net_param {
      layer {
        name: "input"
        type: "Input"
        top: "num_boxes"
        top: "spatial_features"
        top: "fc"
        top: "context"
        top: "input"
        input_param {
          shape {
            dim: 12
            dim: 1
          }
          shape {
            dim: 12
            dim: 100
            dim: 2048
          }
          shape {
            dim: 12
            dim: 100
            dim: 512
          }
          shape {
            dim: 12
            dim: 2048
          }
          shape {
            dim: 12
            dim: 1
          }
        }
      }
      layer {
        name: "lstm0_hidden_prev"
        type: "DummyData"
        top: "lstm0_hidden_prev"
        dummy_data_param {
          shape {
            dim: 12
            dim: 1000
          }
        }
      }
      layer {
        name: "lstm0_mem_cell_prev"
        type: "DummyData"
        top: "lstm0_mem_cell_prev"
        dummy_data_param {
          shape {
            dim: 12
            dim: 1000
          }
        }
      }
      layer {
        name: "lstm1_hidden_prev"
        type: "DummyData"
        top: "lstm1_hidden_prev"
        dummy_data_param {
          shape {
            dim: 12
            dim: 1000
          }
        }
      }
      layer {
        name: "lstm1_mem_cell_prev"
        type: "DummyData"
        top: "lstm1_mem_cell_prev"
        dummy_data_param {
          shape {
            dim: 12
            dim: 1000
          }
        }
      }
      layer {
        name: "embedding"
        type: "Embed"
        bottom: "input"
        top: "embedding"
        param {
          name: "embed_param"
        }
        propagate_down: false
        embed_param {
          num_output: 1000
          input_dim: 10010
          bias_term: false
          weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
        }
      }
      layer {
        name: "concat0_t0"
        type: "Concat"
        bottom: "embedding"
        bottom: "context"
        bottom: "lstm1_hidden_prev"
        bottom: "lstm0_hidden_prev"
        top: "concat0_t0"
      }
      layer {
        name: "lstm1"
        type: "LSTMNode"
        bottom: "concat0_t0"
        bottom: "lstm0_mem_cell_prev"
        top: "lstm0_hidden0"
        top: "lstm0_mem_cell0"
        param {
          name: "lstm0_param_0"
        }
        param {
          name: "lstm0_param_1"
        }
        propagate_down: true
        propagate_down: false
        lstm_param {
          num_cells: 1000
          input_weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          input_gate_weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          forget_gate_weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          output_gate_weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          input_bias_filler {
            type: "constant"
            value: 0.0
          }
          input_gate_bias_filler {
            type: "constant"
            value: 0.0
          }
          forget_gate_bias_filler {
            type: "constant"
            value: 1.0
          }
          output_gate_bias_filler {
            type: "constant"
            value: 0.0
          }
        }
      }
      layer {
        name: "hidden_att_0"
        type: "InnerProduct"
        bottom: "lstm0_hidden0"
        top: "hidden_att_0"
        param {
          name: "hidden_att_param_0"
        }
        inner_product_param {
          num_output: 512
          bias_term: false
          weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
        }
      }
      layer {
        name: "tile_hidden_att_0"
        type: "Tile"
        bottom: "hidden_att_0"
        top: "tile_hidden_att_0"
        tile_param {
          axis: 1
          tiles: 100
        }
      }
      layer {
        name: "tile_hidden_reshape_0"
        type: "Reshape"
        bottom: "tile_hidden_att_0"
        top: "tile_hidden_reshape_0"
        reshape_param {
          shape {
            dim: 0
            dim: -1
            dim: 512
          }
        }
      }
      layer {
        name: "sum_hidden_att_0"
        type: "Eltwise"
        bottom: "fc"
        bottom: "tile_hidden_reshape_0"
        top: "sum_hidden_att_0"
        eltwise_param {
          operation: SUM
        }
      }
      layer {
        name: "hidden_tanh_0"
        type: "TanH"
        bottom: "sum_hidden_att_0"
        top: "sum_hidden_att_0"
      }
      layer {
        name: "predict_att_0"
        type: "InnerProduct"
        bottom: "sum_hidden_att_0"
        top: "predict_att_0"
        param {
          name: "predict_att_param_0"
        }
        inner_product_param {
          num_output: 1
          bias_term: false
          weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          axis: 2
        }
      }
      layer {
        name: "reshape_predict_att_0"
        type: "Reshape"
        bottom: "predict_att_0"
        top: "reshape_predict_att_0"
        reshape_param {
          shape {
            dim: 0
            dim: -1
          }
        }
      }
      layer {
        name: "att_weight_0"
        type: "Softmax"
        bottom: "reshape_predict_att_0"
        bottom: "num_boxes"
        top: "att_weight_0"
        softmax_param {
          engine: CAFFE
          axis: 1
        }
      }
      layer {
        name: "att_product_0"
        type: "Scale"
        bottom: "spatial_features"
        bottom: "att_weight_0"
        top: "att_product_0"
        scale_param {
          axis: 0
        }
      }
      layer {
        name: "permute_att_0"
        type: "Permute"
        bottom: "att_product_0"
        top: "permute_att_0"
        permute_param {
          order: 0
          order: 2
          order: 1
        }
      }
      layer {
        name: "fc8_0"
        type: "Reduction"
        bottom: "permute_att_0"
        top: "fc8_0"
        reduction_param {
          axis: 2
        }
      }
      layer {
        name: "concat1_t0"
        type: "Concat"
        bottom: "lstm0_hidden0"
        bottom: "fc8_0"
        bottom: "lstm1_hidden_prev"
        top: "concat1_t0"
      }
      layer {
        name: "lstm2"
        type: "LSTMNode"
        bottom: "concat1_t0"
        bottom: "lstm1_mem_cell_prev"
        top: "lstm1_hidden0"
        top: "lstm1_mem_cell0"
        param {
          name: "lstm1_param_0"
        }
        param {
          name: "lstm1_param_1"
        }
        propagate_down: true
        propagate_down: false
        lstm_param {
          num_cells: 1000
          input_weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          input_gate_weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          forget_gate_weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          output_gate_weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          input_bias_filler {
            type: "constant"
            value: 0.0
          }
          input_gate_bias_filler {
            type: "constant"
            value: 0.0
          }
          forget_gate_bias_filler {
            type: "constant"
            value: 1.0
          }
          output_gate_bias_filler {
            type: "constant"
            value: 0.0
          }
        }
      }
      layer {
        name: "fc_atten_0"
        type: "InnerProduct"
        bottom: "lstm0_hidden0"
        top: "fc_atten_0"
        param {
          name: "fc_param_0"
        }
        inner_product_param {
          num_output: 1000
          bias_term: true
          weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
        }
      }
      layer {
        name: "concat_lstm0_lstm1_0"
        type: "Concat"
        bottom: "lstm1_hidden0"
        bottom: "fc_atten_0"
        top: "concat_lstm0_lstm1_0"
      }
      layer {
        name: "predict"
        type: "InnerProduct"
        bottom: "concat_lstm0_lstm1_0"
        top: "predict"
        param {
          name: "predict_param_0"
          lr_mult: 1.0
          decay_mult: 1.0
        }
        param {
          name: "predict_param_1"
          lr_mult: 2.0
          decay_mult: 0.0
        }
        inner_product_param {
          num_output: 10010
          weight_filler {
            type: "gaussian"
            std: 0.00999999977648
          }
          bias_filler {
            type: "constant"
            value: 0.0
          }
          axis: 1
        }
      }
      layer {
        name: "probs_0"
        type: "Softmax"
        bottom: "predict"
        top: "probs_0"
        softmax_param {
          axis: 1
        }
      }
      layer {
        name: "logp_0"
        type: "Log"
        bottom: "probs_0"
        top: "logp_0"
      }
    }
    sequence_length: 20
    beam_size: 5
    end_of_sequence: 0
    recurrent_connection {
      src: "lstm0_hidden0"
      dest: "lstm0_hidden_prev"
    }
    recurrent_connection {
      src: "lstm0_mem_cell0"
      dest: "lstm0_mem_cell_prev"
    }
    recurrent_connection {
      src: "lstm1_hidden0"
      dest: "lstm1_hidden_prev"
    }
    recurrent_connection {
      src: "lstm1_mem_cell0"
      dest: "lstm1_mem_cell_prev"
    }
    beam_search_connection {
      src: "logp_0"
      dest: "input"
    }
    allowed_multiple: 2
    allowed_multiple: 5
    allowed_multiple: 4
    allowed_multiple: 15
    allowed_multiple: 3
    allowed_multiple: 6
    allowed_multiple: 8
    allowed_multiple: 7
    allowed_multiple: 9
    allowed_multiple: 13
    allowed_multiple: 277
    allowed_multiple: 11
    allowed_multiple: 30
    allowed_multiple: 16
    allowed_multiple: 19
    allowed_multiple: 27
    allowed_multiple: 25
    allowed_multiple: 119
    allowed_multiple: 48
  }
}
layer {
  name: "silence_bs"
  type: "Silence"
  bottom: "log_prob"
  bottom: "log_prob_sequence"
}
```
fc_atten is the layer I added by imitating this network: I set up the same parameter sharing and put the parameter name into the list at the front of the layer. The following two fragments are the only modifications I made to the original code:
```
param {
  name: "fc_param_0"
}
```
```
layer {
  name: "fc_atten_0"
  type: "InnerProduct"
  bottom: "lstm0_hidden0"
  top: "fc_atten_0"
  param {
    name: "fc_param_0"
  }
  inner_product_param {
    num_output: 1000
    bias_term: true
    weight_filler {
      type: "gaussian"
      std: 0.00999999977648
    }
  }
}
```
Nothing else was changed. The current state is that training completes without errors, but the model contains only the fc_atten layer's name, with no corresponding tensor.
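A complementary check is to load the snapshot into the decode net through pycaffe and inspect what fc_atten_0 actually received. A minimal sketch; the file names are placeholders:
```python
import caffe

# Placeholder file names; substitute the real decode prototxt and scst snapshot.
net = caffe.Net('decode.prototxt', 'scst_snapshot.caffemodel', caffe.TEST)

# net.params maps each parameterized layer name to its list of blobs.
# Blobs present in the snapshot overwrite the filler initialization, so
# comparing these values against a net built without the snapshot shows
# whether fc_atten_0 was actually populated.
if 'fc_atten_0' in net.params:
    for blob in net.params['fc_atten_0']:
        print(blob.data.shape, blob.data.mean())
else:
    print('fc_atten_0 carries no parameter blobs in this net')
```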
3. What I have tried: I read through the author's beam_search.cpp and .hpp, and also read and modified caffe.proto while experimenting for a fix, but the author's code may simply be beyond my current level.
Here is a link to the source code: https://github.com/peteanderson80/caffe/tree/631806541c68658248ffdbbbde659f478fac4113
My thinking so far: 1. the author may have hard-coded the parameters that beam_search accepts, allowing only the original nine to be passed in, so my new one cannot get through (though so far I have found no such limit); 2. since both the baseline net and decode produce results, the problem probably lies in the author's beamsearch.cpp: https://github.com/peteanderson80/caffe/blob/631806541c68658248ffdbbbde659f478fac4113/src/caffe/layers/beam_search_layer.cpp.
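Hypothesis 1 can be checked without reading the C++: parse the scst prototxt and compare the param names declared on the outer BeamSearch layer with the shared param names declared inside its net_param. A sketch, assuming caffe_pb2 was generated from this fork's caffe.proto (so it knows the beam_search_param field) and using a placeholder path:
```python
from caffe.proto import caffe_pb2
from google.protobuf import text_format

# Placeholder path; substitute the scst prototxt.
net = caffe_pb2.NetParameter()
with open('scst.prototxt') as f:
    text_format.Merge(f.read(), net)

beam = next(l for l in net.layer if l.type == 'BeamSearch')
outer = [p.name for p in beam.param]
inner = [p.name for l in beam.beam_search_param.net_param.layer for p in l.param]

print('outer params:', outer)
print('inner params:', inner)
# A shared name used inside the wrapped net but absent from the outer list
# (or the reverse) would point at a blob that never gets exported.
print('inner-only:', sorted(set(inner) - set(outer)))
print('outer-only:', sorted(set(outer) - set(inner)))
```
One case a name comparison cannot catch: Caffe matches param entries to a layer's blobs by position, so an InnerProduct with bias_term: true owns two blobs and needs two param entries before both the weight and the bias are shared.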
4. Question: why does scst training fail to produce this layer's parameters, and how can I get the model to train them and then decode results?