| @@ -695,13 +695,7 @@ int GRU_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c | |||
| int GRU_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| if (bottom_blobs.size() != 2 || top_blobs.size() != 2) | |||
| { | |||
| return forward(bottom_blobs[0], top_blobs[0], opt); | |||
| } | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int elembits = bottom_blob.elembits(); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| @@ -720,24 +714,72 @@ int GRU_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| Mat& hidden_state = top_blobs[1]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| //Copy previous states | |||
| hidden_state = bottom_blobs[1].clone(opt.blob_allocator); | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| hidden = bottom_blobs[1].clone(hidden_allocator); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 4u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| int ret = gru(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden_state, opt); | |||
| int ret = gru(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = gru(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const float* pf = top_blob_forward.row(i); | |||
| const float* pr = top_blob_reverse.row(i); | |||
| float* ptr = top_blob.row(i); | |||
| memcpy(ptr, pf, num_output * sizeof(float)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(float)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1625,16 +1667,29 @@ int GRU_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_allocator; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // copy previous states | |||
| Mat hidden; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt); | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| @@ -1643,7 +1698,42 @@ int GRU_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| return ret; | |||
| } | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const __fp16* pf = top_blob_forward.row<const __fp16>(i); | |||
| const __fp16* pr = top_blob_reverse.row<const __fp16>(i); | |||
| __fp16* ptr = top_blob.row<__fp16>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(__fp16)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1711,16 +1801,29 @@ int GRU_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_allocator; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // copy previous states | |||
| Mat hidden; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt); | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| @@ -1729,7 +1832,42 @@ int GRU_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| return ret; | |||
| } | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const __fp16* pf = top_blob_forward.row<const __fp16>(i); | |||
| const __fp16* pr = top_blob_reverse.row<const __fp16>(i); | |||
| __fp16* ptr = top_blob.row<__fp16>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(__fp16)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -2365,16 +2503,29 @@ int GRU_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_allocator; | |||
| cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // copy previous states | |||
| Mat hidden; | |||
| cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt); | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| @@ -2383,7 +2534,42 @@ int GRU_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| return ret; | |||
| } | |||
| cast_float32_to_bfloat16(hidden, top_blobs[1], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = gru_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = gru_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const unsigned short* pf = top_blob_forward.row<const unsigned short>(i); | |||
| const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i); | |||
| unsigned short* ptr = top_blob.row<unsigned short>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(unsigned short)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| cast_float32_to_bfloat16(hidden, top_blobs[1], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -423,13 +423,7 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| int LSTM_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| if (bottom_blobs.size() != 3 || top_blobs.size() != 3) | |||
| { | |||
| return forward(bottom_blobs[0], top_blobs[0], opt); | |||
| } | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int elembits = bottom_blob.elembits(); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| @@ -448,26 +442,82 @@ int LSTM_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| Mat& hidden_state = top_blobs[1]; | |||
| Mat& cell_state = top_blobs[2]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| Mat hidden; | |||
| Mat cell; | |||
| Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 3) | |||
| { | |||
| hidden = bottom_blobs[1].clone(hidden_cell_allocator); | |||
| cell = bottom_blobs[2].clone(hidden_cell_allocator); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| //Copy previous states | |||
| hidden_state = bottom_blobs[1].clone(opt.blob_allocator); | |||
| cell_state = bottom_blobs[2].clone(opt.blob_allocator); | |||
| cell.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (cell.empty()) | |||
| return -100; | |||
| cell.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 4u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden_state, cell_state, opt); | |||
| int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| Mat cell0 = cell.row_range(0, 1); | |||
| int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| Mat cell1 = cell.row_range(1, 1); | |||
| int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const float* pf = top_blob_forward.row(i); | |||
| const float* pr = top_blob_reverse.row(i); | |||
| float* ptr = top_blob.row(i); | |||
| memcpy(ptr, pf, num_output * sizeof(float)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(float)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 3) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| top_blobs[2] = cell; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1182,17 +1232,35 @@ int LSTM_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| // copy previous states | |||
| Mat hidden; | |||
| Mat cell; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt); | |||
| cast_float16_to_float32(bottom_blobs[2], cell, opt); | |||
| Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 3) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_cell_allocator; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| cast_float16_to_float32(bottom_blobs[2], cell, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| cell.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (cell.empty()) | |||
| return -100; | |||
| cell.fill(0.f); | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| @@ -1202,8 +1270,45 @@ int LSTM_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| return ret; | |||
| } | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| cast_float32_to_float16(cell, top_blobs[2], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| Mat cell0 = cell.row_range(0, 1); | |||
| int ret0 = lstm_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| Mat cell1 = cell.row_range(1, 1); | |||
| int ret1 = lstm_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const __fp16* pf = top_blob_forward.row<const __fp16>(i); | |||
| const __fp16* pr = top_blob_reverse.row<const __fp16>(i); | |||
| __fp16* ptr = top_blob.row<__fp16>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(__fp16)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 3) | |||
| { | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| cast_float32_to_float16(cell, top_blobs[2], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1277,17 +1382,35 @@ int LSTM_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| // copy previous states | |||
| Mat hidden; | |||
| Mat cell; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt); | |||
| cast_float16_to_float32(bottom_blobs[2], cell, opt); | |||
| Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 3) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_cell_allocator; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| cast_float16_to_float32(bottom_blobs[2], cell, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| cell.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (cell.empty()) | |||
| return -100; | |||
| cell.fill(0.f); | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| @@ -1297,8 +1420,45 @@ int LSTM_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| return ret; | |||
| } | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| cast_float32_to_float16(cell, top_blobs[2], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| Mat cell0 = cell.row_range(0, 1); | |||
| int ret0 = lstm_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| Mat cell1 = cell.row_range(1, 1); | |||
| int ret1 = lstm_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const __fp16* pf = top_blob_forward.row<const __fp16>(i); | |||
| const __fp16* pr = top_blob_reverse.row<const __fp16>(i); | |||
| __fp16* ptr = top_blob.row<__fp16>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(__fp16)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 3) | |||
| { | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| cast_float32_to_float16(cell, top_blobs[2], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1664,17 +1824,35 @@ int LSTM_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| // copy previous states | |||
| Mat hidden; | |||
| Mat cell; | |||
| cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt); | |||
| cast_bfloat16_to_float32(bottom_blobs[2], cell, opt); | |||
| Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 3) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_cell_allocator; | |||
| cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| cast_bfloat16_to_float32(bottom_blobs[2], cell, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| cell.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (cell.empty()) | |||
| return -100; | |||
| cell.fill(0.f); | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| @@ -1684,8 +1862,45 @@ int LSTM_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| return ret; | |||
| } | |||
| cast_float32_to_bfloat16(hidden, top_blobs[1], opt); | |||
| cast_float32_to_bfloat16(cell, top_blobs[2], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| Mat cell0 = cell.row_range(0, 1); | |||
| int ret0 = lstm_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| Mat cell1 = cell.row_range(1, 1); | |||
| int ret1 = lstm_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const unsigned short* pf = top_blob_forward.row<const unsigned short>(i); | |||
| const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i); | |||
| unsigned short* ptr = top_blob.row<unsigned short>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(unsigned short)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 3) | |||
| { | |||
| cast_float32_to_bfloat16(hidden, top_blobs[1], opt); | |||
| cast_float32_to_bfloat16(cell, top_blobs[2], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -377,13 +377,7 @@ int RNN_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c | |||
| int RNN_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| if (bottom_blobs.size() != 2 || top_blobs.size() != 2) | |||
| { | |||
| return forward(bottom_blobs[0], top_blobs[0], opt); | |||
| } | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int elembits = bottom_blob.elembits(); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| @@ -402,24 +396,72 @@ int RNN_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| Mat& hidden_state = top_blobs[1]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| //Copy previous states | |||
| hidden_state = bottom_blobs[1].clone(opt.blob_allocator); | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| hidden = bottom_blobs[1].clone(hidden_allocator); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 4u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden_state, opt); | |||
| int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = rnn(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = rnn(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const float* pf = top_blob_forward.row(i); | |||
| const float* pr = top_blob_reverse.row(i); | |||
| float* ptr = top_blob.row(i); | |||
| memcpy(ptr, pf, num_output * sizeof(float)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(float)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -965,16 +1007,29 @@ int RNN_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_allocator; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // copy previous states | |||
| Mat hidden; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt); | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| @@ -983,7 +1038,42 @@ int RNN_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| return ret; | |||
| } | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = rnn_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = rnn_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const __fp16* pf = top_blob_forward.row<const __fp16>(i); | |||
| const __fp16* pr = top_blob_reverse.row<const __fp16>(i); | |||
| __fp16* ptr = top_blob.row<__fp16>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(__fp16)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1051,16 +1141,29 @@ int RNN_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_allocator; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // copy previous states | |||
| Mat hidden; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt); | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| @@ -1069,7 +1172,42 @@ int RNN_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma | |||
| return ret; | |||
| } | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = rnn_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = rnn_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const __fp16* pf = top_blob_forward.row<const __fp16>(i); | |||
| const __fp16* pr = top_blob_reverse.row<const __fp16>(i); | |||
| __fp16* ptr = top_blob.row<__fp16>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(__fp16)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1387,16 +1525,29 @@ int RNN_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_allocator; | |||
| cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // copy previous states | |||
| Mat hidden; | |||
| cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt); | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| @@ -1405,7 +1556,42 @@ int RNN_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat | |||
| return ret; | |||
| } | |||
| cast_float32_to_bfloat16(hidden, top_blobs[1], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = rnn_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = rnn_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const unsigned short* pf = top_blob_forward.row<const unsigned short>(i); | |||
| const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i); | |||
| unsigned short* ptr = top_blob.row<unsigned short>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(unsigned short)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| cast_float32_to_bfloat16(hidden, top_blobs[1], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -29,8 +29,6 @@ int GRU::load_param(const ParamDict& pd) | |||
| num_output = pd.get(0, 0); | |||
| weight_data_size = pd.get(1, 0); | |||
| direction = pd.get(2, 0); | |||
| if (direction == 2) | |||
| one_blob_only = true; | |||
| return 0; | |||
| } | |||
| @@ -223,30 +221,74 @@ int GRU::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
// GRU::forward, multi-blob overload. bottom_blobs[0] is the input handed to gru() (T is its h);
// bottom_blobs[1], when present, seeds the hidden state, and top_blobs[1], when present,
// receives the hidden state after the run.
// NOTE(review): this listing appears to interleave pre-change and post-change lines of a patch
// without +/- markers (top_blob is declared twice, and gru() is called once with hidden_state
// and once with hidden) -- confirm against the real file before treating it as compilable.
// Return values visible here: -100 when a created Mat is empty, the non-zero code of a failed
// gru() call, 0 otherwise.
| int GRU::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
// NOTE(review): presumably the old single-blob fallback that the patch removes; the code
// further down handles a missing second input/output itself.
| if (bottom_blobs.size() != 2 || top_blobs.size() != 2) | |||
| { | |||
| return forward(bottom_blobs[0], top_blobs[0], opt); | |||
| } | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| Mat& hidden_state = top_blobs[1]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| //Copy previous states | |||
| hidden_state = bottom_blobs[1].clone(opt.blob_allocator); | |||
// Working hidden state. It is taken from blob_allocator only when it will be handed out as
// top_blobs[1]; otherwise it is scratch memory from workspace_allocator.
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
// NOTE(review): the clone result is not checked with empty(), unlike the create() path below.
| hidden = bottom_blobs[1].clone(hidden_allocator); | |||
| } | |||
| else | |||
| { | |||
// No initial state supplied: create a num_output x num_directions Mat of 4-byte elements and zero it.
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 4u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
// The output width is num_output * num_directions so that, in the bidirectional case, the
// forward and reverse results fit side by side in each row (see the concat loop below).
| top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, opt); | |||
| int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| if (direction == 2) | |||
| { | |||
// Bidirectional: each pass writes into its own workspace Mat and updates its own row_range
// view of hidden; direction 0 uses the channel(0) weights, direction 1 the channel(1) weights.
| Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = gru(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
// Row i of top_blob = num_output floats from the forward pass followed by num_output floats
// from the reverse pass.
| for (int i = 0; i < T; i++) | |||
| { | |||
| const float* pf = top_blob_forward.row(i); | |||
| const float* pr = top_blob_reverse.row(i); | |||
| float* ptr = top_blob.row(i); | |||
| memcpy(ptr, pf, num_output * sizeof(float)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(float)); | |||
| } | |||
| } | |||
// Export the hidden state only when the caller asked for a second output blob.
| if (top_blobs.size() == 2) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -29,8 +29,6 @@ int LSTM::load_param(const ParamDict& pd) | |||
| num_output = pd.get(0, 0); | |||
| weight_data_size = pd.get(1, 0); | |||
| direction = pd.get(2, 0); | |||
| if (direction == 2) | |||
| one_blob_only = true; | |||
| return 0; | |||
| } | |||
| @@ -232,32 +230,84 @@ int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons | |||
// LSTM::forward, multi-blob overload. bottom_blobs[0] is the input handed to lstm() (T is its h);
// bottom_blobs[1]/[2], when three inputs are given, seed the hidden and cell states, and
// top_blobs[1]/[2], when three outputs are requested, receive them after the run.
// NOTE(review): this listing appears to interleave pre-change and post-change lines of a patch
// without +/- markers (top_blob is declared twice, lstm() is called once with
// hidden_state/cell_state and once with hidden/cell, and the old "Copy previous states" lines
// sit inside the new else branch) -- confirm against the real file before compiling.
// Return values visible here: -100 when a created Mat is empty, the non-zero code of a failed
// lstm() call, 0 otherwise.
| int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
// NOTE(review): presumably the old single-blob fallback that the patch removes.
| if (bottom_blobs.size() != 3 || top_blobs.size() != 3) | |||
| { | |||
| return forward(bottom_blobs[0], top_blobs[0], opt); | |||
| } | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| Mat& hidden_state = top_blobs[1]; | |||
| Mat& cell_state = top_blobs[2]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
// Working hidden and cell states. They are taken from blob_allocator only when they will be
// handed out as top_blobs[1]/[2]; otherwise they are scratch memory from workspace_allocator.
| Mat hidden; | |||
| Mat cell; | |||
| Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 3) | |||
| { | |||
// NOTE(review): neither clone result is checked with empty(), unlike the create() path below.
| hidden = bottom_blobs[1].clone(hidden_cell_allocator); | |||
| cell = bottom_blobs[2].clone(hidden_cell_allocator); | |||
| } | |||
| else | |||
| { | |||
// No initial states supplied: create num_output x num_directions Mats of 4-byte elements and zero them.
| hidden.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| //Copy previous states | |||
| hidden_state = bottom_blobs[1].clone(opt.blob_allocator); | |||
| cell_state = bottom_blobs[2].clone(opt.blob_allocator); | |||
| cell.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (cell.empty()) | |||
| return -100; | |||
| cell.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 4u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
// The output width is num_output * num_directions so that, in the bidirectional case, the
// forward and reverse results fit side by side in each row (see the concat loop below).
| top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, cell_state, opt); | |||
| int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| if (direction == 2) | |||
| { | |||
// Bidirectional: each pass writes into its own workspace Mat and updates its own row_range
// views of hidden and cell; direction 0 uses the channel(0) weights, direction 1 channel(1).
| Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| Mat cell0 = cell.row_range(0, 1); | |||
| int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, cell0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| Mat cell1 = cell.row_range(1, 1); | |||
| int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, cell1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
// Row i of top_blob = num_output floats from the forward pass followed by num_output floats
// from the reverse pass.
| for (int i = 0; i < T; i++) | |||
| { | |||
| const float* pf = top_blob_forward.row(i); | |||
| const float* pr = top_blob_reverse.row(i); | |||
| float* ptr = top_blob.row(i); | |||
| memcpy(ptr, pf, num_output * sizeof(float)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(float)); | |||
| } | |||
| } | |||
// Export the hidden and cell states only when the caller asked for three output blobs.
| if (top_blobs.size() == 3) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| top_blobs[2] = cell; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -301,11 +301,6 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| if (bottom_blobs.size() != 2 || top_blobs.size() != 2) | |||
| { | |||
| return forward(bottom_blobs[0], top_blobs[0], opt); | |||
| } | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int elembits = bottom_blob.elembits(); | |||
| @@ -321,24 +316,73 @@ int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t | |||
| #endif | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| Mat& hidden_state = top_blobs[1]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| //Copy previous states | |||
| hidden_state = bottom_blobs[1].clone(opt.blob_allocator); | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| hidden = bottom_blobs[1].clone(hidden_allocator); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 4u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, opt); | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = gru(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const float* pf = top_blob_forward.row(i); | |||
| const float* pr = top_blob_reverse.row(i); | |||
| float* ptr = top_blob.row(i); | |||
| memcpy(ptr, pf, num_output * sizeof(float)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(float)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| } | |||
| return 0; | |||
| #endif | |||
| return GRU::forward(bottom_blobs, top_blobs, opt); | |||
| @@ -587,24 +631,75 @@ int GRU_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<M | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_allocator; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| //Copy previous states | |||
| Mat hidden; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt); | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt); | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const __fp16* pf = top_blob_forward.row<const __fp16>(i); | |||
| const __fp16* pr = top_blob_reverse.row<const __fp16>(i); | |||
| __fp16* ptr = top_blob.row<__fp16>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(__fp16)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -853,15 +948,29 @@ int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector< | |||
| { | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
| Option opt_cast = opt; | |||
| opt_cast.blob_allocator = hidden_allocator; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output, T, 2u, opt.blob_allocator); | |||
| top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| //Copy previous states | |||
| Mat hidden; | |||
| cast_float16_to_float32(bottom_blobs[1], hidden, opt); | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| @@ -870,11 +979,46 @@ int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector< | |||
| return ret; | |||
| } | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const __fp16* pf = top_blob_forward.row<const __fp16>(i); | |||
| const __fp16* pr = top_blob_reverse.row<const __fp16>(i); | |||
| __fp16* ptr = top_blob.row<__fp16>(i); | |||
| memcpy(ptr, pf, num_output * sizeof(__fp16)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 2) | |||
| { | |||
| cast_float32_to_float16(hidden, top_blobs[1], opt); | |||
| } | |||
| return 0; | |||
| } | |||
| #endif | |||
| } // namespace ncnn | |||
| } // namespace ncnn | |||
| @@ -29,8 +29,6 @@ int RNN::load_param(const ParamDict& pd) | |||
| num_output = pd.get(0, 0); | |||
| weight_data_size = pd.get(1, 0); | |||
| direction = pd.get(2, 0); | |||
| if (direction == 2) | |||
| one_blob_only = true; | |||
| return 0; | |||
| } | |||
| @@ -172,30 +170,74 @@ int RNN::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
// RNN::forward, multi-blob overload. bottom_blobs[0] is the input handed to rnn() (T is its h);
// bottom_blobs[1], when present, seeds the hidden state, and top_blobs[1], when present,
// receives the hidden state after the run.
// NOTE(review): this listing appears to interleave pre-change and post-change lines of a patch
// without +/- markers (top_blob is declared twice, and rnn() is called once with hidden_state
// and once with hidden) -- confirm against the real file before treating it as compilable.
// Return values visible here: -100 when a created Mat is empty, the non-zero code of a failed
// rnn() call, 0 otherwise.
| int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
// NOTE(review): presumably the old single-blob fallback that the patch removes; the code
// further down handles a missing second input/output itself.
| if (bottom_blobs.size() != 2 || top_blobs.size() != 2) | |||
| { | |||
| return forward(bottom_blobs[0], top_blobs[0], opt); | |||
| } | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| Mat& hidden_state = top_blobs[1]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| //Copy previous states | |||
| hidden_state = bottom_blobs[1].clone(opt.blob_allocator); | |||
// Working hidden state. It is taken from blob_allocator only when it will be handed out as
// top_blobs[1]; otherwise it is scratch memory from workspace_allocator.
| Mat hidden; | |||
| Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 2) | |||
| { | |||
// NOTE(review): the clone result is not checked with empty(), unlike the create() path below.
| hidden = bottom_blobs[1].clone(hidden_allocator); | |||
| } | |||
| else | |||
| { | |||
// No initial state supplied: create a num_output x num_directions Mat of 4-byte elements and zero it.
| hidden.create(num_output, num_directions, 4u, hidden_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| } | |||
| top_blob.create(num_output, T, 4u, opt.blob_allocator); | |||
| Mat& top_blob = top_blobs[0]; | |||
// The output width is num_output * num_directions so that, in the bidirectional case, the
// forward and reverse results fit side by side in each row (see the concat loop below).
| top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, opt); | |||
| int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| if (direction == 2) | |||
| { | |||
// Bidirectional: each pass writes into its own workspace Mat and updates its own row_range
// view of hidden; direction 0 uses the channel(0) weights, direction 1 the channel(1) weights.
| Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| int ret0 = rnn(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| int ret1 = rnn(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| // concat w | |||
// Row i of top_blob = num_output floats from the forward pass followed by num_output floats
// from the reverse pass.
| for (int i = 0; i < T; i++) | |||
| { | |||
| const float* pf = top_blob_forward.row(i); | |||
| const float* pr = top_blob_reverse.row(i); | |||
| float* ptr = top_blob.row(i); | |||
| memcpy(ptr, pf, num_output * sizeof(float)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(float)); | |||
| } | |||
| } | |||
// Export the hidden state only when the caller asked for a second output blob.
| if (top_blobs.size() == 2) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -910,42 +910,123 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) | |||
| int LSTM_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const | |||
| { | |||
| #if __AVX__ | |||
| if (bottom_blobs.size() != 3 || top_blobs.size() != 3) | |||
| { | |||
| return forward(bottom_blobs[0], top_blobs[0], opt); | |||
| } | |||
| const Mat& bottom_blob = bottom_blobs[0]; | |||
| int T = bottom_blob.h; | |||
| Mat& top_blob = top_blobs[0]; | |||
| Mat& hidden_state = top_blobs[1]; | |||
| Mat& cell_state = top_blobs[2]; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| //Copy previous states | |||
| hidden_state = bottom_blobs[1].clone(opt.blob_allocator); | |||
| cell_state = bottom_blobs[2].clone(opt.blob_allocator); | |||
| Mat hidden; | |||
| Mat cell; | |||
| Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator; | |||
| if (bottom_blobs.size() == 3) | |||
| { | |||
| hidden = bottom_blobs[1].clone(hidden_cell_allocator); | |||
| cell = bottom_blobs[2].clone(hidden_cell_allocator); | |||
| } | |||
| else | |||
| { | |||
| hidden.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (hidden.empty()) | |||
| return -100; | |||
| hidden.fill(0.f); | |||
| top_blob.create(num_output, T, 4u, opt.blob_allocator); | |||
| cell.create(num_output, num_directions, 4u, hidden_cell_allocator); | |||
| if (cell.empty()) | |||
| return -100; | |||
| cell.fill(0.f); | |||
| } | |||
| Mat& top_blob = top_blobs[0]; | |||
| top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #if __AVX2__ | |||
| if (opt.use_weight_fp16_storage) | |||
| // Uni directional | |||
| if (direction == 0 || direction == 1) | |||
| { | |||
| // Uni directional | |||
| int ret = lstm_fp16(bottom_blob, top_blob, direction, weight_xc_data_fp16.channel(0), bias_c_data.channel(0), weight_hc_data_fp16.channel(0), hidden_state, cell_state, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| #if __AVX2__ | |||
| if (opt.use_weight_fp16_storage) | |||
| { | |||
| int ret = lstm_fp16(bottom_blob, top_blob, direction, weight_xc_data_fp16.channel(0), bias_c_data.channel(0), weight_hc_data_fp16.channel(0), hidden, cell, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| else | |||
| { | |||
| #endif | |||
| int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| #if __AVX2__ | |||
| } | |||
| #endif | |||
| } | |||
| else | |||
| if (direction == 2) | |||
| { | |||
| Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_forward.empty()) | |||
| return -100; | |||
| Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator); | |||
| if (top_blob_reverse.empty()) | |||
| return -100; | |||
| Mat hidden0 = hidden.row_range(0, 1); | |||
| Mat cell0 = cell.row_range(0, 1); | |||
| #if __AVX2__ | |||
| if (opt.use_weight_fp16_storage) | |||
| { | |||
| int ret = lstm_fp16(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16.channel(0), bias_c_data.channel(0), weight_hc_data_fp16.channel(0), hidden0, cell0, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| else | |||
| { | |||
| #endif | |||
| // Uni directional | |||
| int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, cell_state, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, cell0, opt); | |||
| if (ret0 != 0) | |||
| return ret0; | |||
| #if __AVX2__ | |||
| } | |||
| } | |||
| #endif | |||
| Mat hidden1 = hidden.row_range(1, 1); | |||
| Mat cell1 = cell.row_range(1, 1); | |||
| #if __AVX2__ | |||
| if (opt.use_weight_fp16_storage) | |||
| { | |||
| int ret = lstm_fp16(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16.channel(1), bias_c_data.channel(1), weight_hc_data_fp16.channel(1), hidden1, cell1, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| else | |||
| { | |||
| #endif | |||
| int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, cell1, opt); | |||
| if (ret1 != 0) | |||
| return ret1; | |||
| #if __AVX2__ | |||
| } | |||
| #endif | |||
| // concat w | |||
| for (int i = 0; i < T; i++) | |||
| { | |||
| const float* pf = top_blob_forward.row(i); | |||
| const float* pr = top_blob_reverse.row(i); | |||
| float* ptr = top_blob.row(i); | |||
| memcpy(ptr, pf, num_output * sizeof(float)); | |||
| memcpy(ptr + num_output, pr, num_output * sizeof(float)); | |||
| } | |||
| } | |||
| if (top_blobs.size() == 3) | |||
| { | |||
| top_blobs[1] = hidden; | |||
| top_blobs[2] = cell; | |||
| } | |||
| return 0; | |||
| #else | |||
| return LSTM::forward(bottom_blobs, top_blobs, opt); | |||
| @@ -42,19 +42,20 @@ static int test_gru(const ncnn::Mat& a, int outch, int direction) | |||
| int test_gru_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size * 3); | |||
| pd.set(1, outch * input_size * 3 * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size * 3); | |||
| weights[1] = RandomMat(outch * 4); | |||
| weights[2] = RandomMat(outch * outch * 3); | |||
| weights[0] = RandomMat(outch * input_size * 3 * num_directions); | |||
| weights[1] = RandomMat(outch * 4 * num_directions); | |||
| weights[2] = RandomMat(outch * outch * 3 * num_directions); | |||
| // initial hidden state | |||
| ncnn::Mat hidden = RandomMat(outch); | |||
| ncnn::Mat hidden = RandomMat(outch, num_directions); | |||
| std::vector<ncnn::Mat> as(2); | |||
| as[0] = a; | |||
| @@ -69,6 +70,64 @@ int test_gru_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) | |||
| return ret; | |||
| } | |||
// Runs test_layer on a GRU layer that is given two inputs: the data blob `a` and a random
// initial hidden state of size outch x num_directions.
// Params 0/1/2 are set to outch, outch * input_size * 3 * num_directions and direction.
// The three weight blobs are all scaled by num_directions (2 when direction == 2, else 1).
// Returns test_layer's result; a non-zero result is also logged to stderr with the input shape.
// NOTE(review): the trailing argument 1 is presumably the number of output blobs requested
// (i.e. no hidden-state output) -- confirm against test_layer's signature.
| int test_gru_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size * 3 * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size * 3 * num_directions); | |||
| weights[1] = RandomMat(outch * 4 * num_directions); | |||
| weights[2] = RandomMat(outch * outch * 3 * num_directions); | |||
| // initial hidden state | |||
| ncnn::Mat hidden = RandomMat(outch, num_directions); | |||
| std::vector<ncnn::Mat> as(2); | |||
| as[0] = a; | |||
| as[1] = hidden; | |||
| int ret = test_layer<ncnn::GRU>("GRU", pd, weights, as, 1); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_gru_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); | |||
| } | |||
| return ret; | |||
| } | |||
| int test_gru_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size * 3 * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size * 3 * num_directions); | |||
| weights[1] = RandomMat(outch * 4 * num_directions); | |||
| weights[2] = RandomMat(outch * outch * 3 * num_directions); | |||
| std::vector<ncnn::Mat> as(1); | |||
| as[0] = a; | |||
| int ret = test_layer<ncnn::GRU>("GRU", pd, weights, as, 2); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_gru_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); | |||
| } | |||
| return ret; | |||
| } | |||
| static int test_gru_0() | |||
| { | |||
| return 0 | |||
| @@ -86,6 +145,14 @@ static int test_gru_0() | |||
| static int test_gru_1() | |||
| { | |||
| return 0 | |||
| || test_gru_layer_with_hidden(RandomMat(4, 4), 1, 2) | |||
| || test_gru_layer_with_hidden(RandomMat(8, 2), 2, 2) | |||
| || test_gru_layer_with_hidden(RandomMat(16, 8), 7, 2) | |||
| || test_gru_layer_with_hidden(RandomMat(17, 8), 8, 2) | |||
| || test_gru_layer_with_hidden(RandomMat(19, 15), 8, 2) | |||
| || test_gru_layer_with_hidden(RandomMat(5, 16), 16, 2) | |||
| || test_gru_layer_with_hidden(RandomMat(3, 16), 8, 2) | |||
| || test_gru_layer_with_hidden(RandomMat(2, 5), 99, 2) | |||
| || test_gru_layer_with_hidden(RandomMat(4, 4), 1, 1) | |||
| || test_gru_layer_with_hidden(RandomMat(8, 2), 2, 1) | |||
| || test_gru_layer_with_hidden(RandomMat(16, 8), 7, 1) | |||
| @@ -101,7 +168,57 @@ static int test_gru_1() | |||
| || test_gru_layer_with_hidden(RandomMat(19, 15), 8, 0) | |||
| || test_gru_layer_with_hidden(RandomMat(5, 16), 16, 0) | |||
| || test_gru_layer_with_hidden(RandomMat(3, 16), 8, 0) | |||
| || test_gru_layer_with_hidden(RandomMat(2, 5), 17, 0); | |||
| || test_gru_layer_with_hidden(RandomMat(2, 5), 17, 0) | |||
| || test_gru_layer_with_hidden_input(RandomMat(4, 4), 1, 2) | |||
| || test_gru_layer_with_hidden_input(RandomMat(8, 2), 2, 2) | |||
| || test_gru_layer_with_hidden_input(RandomMat(16, 8), 7, 2) | |||
| || test_gru_layer_with_hidden_input(RandomMat(17, 8), 8, 2) | |||
| || test_gru_layer_with_hidden_input(RandomMat(19, 15), 8, 2) | |||
| || test_gru_layer_with_hidden_input(RandomMat(5, 16), 16, 2) | |||
| || test_gru_layer_with_hidden_input(RandomMat(3, 16), 8, 2) | |||
| || test_gru_layer_with_hidden_input(RandomMat(2, 5), 99, 2) | |||
| || test_gru_layer_with_hidden_input(RandomMat(4, 4), 1, 1) | |||
| || test_gru_layer_with_hidden_input(RandomMat(8, 2), 2, 1) | |||
| || test_gru_layer_with_hidden_input(RandomMat(16, 8), 7, 1) | |||
| || test_gru_layer_with_hidden_input(RandomMat(17, 8), 8, 1) | |||
| || test_gru_layer_with_hidden_input(RandomMat(19, 15), 8, 1) | |||
| || test_gru_layer_with_hidden_input(RandomMat(5, 16), 16, 1) | |||
| || test_gru_layer_with_hidden_input(RandomMat(3, 16), 8, 1) | |||
| || test_gru_layer_with_hidden_input(RandomMat(2, 5), 99, 1) | |||
| || test_gru_layer_with_hidden_input(RandomMat(4, 2), 1, 0) | |||
| || test_gru_layer_with_hidden_input(RandomMat(8, 2), 2, 0) | |||
| || test_gru_layer_with_hidden_input(RandomMat(16, 8), 7, 0) | |||
| || test_gru_layer_with_hidden_input(RandomMat(17, 8), 8, 0) | |||
| || test_gru_layer_with_hidden_input(RandomMat(19, 15), 8, 0) | |||
| || test_gru_layer_with_hidden_input(RandomMat(5, 16), 16, 0) | |||
| || test_gru_layer_with_hidden_input(RandomMat(3, 16), 8, 0) | |||
| || test_gru_layer_with_hidden_input(RandomMat(2, 5), 17, 0) | |||
| || test_gru_layer_with_hidden_output(RandomMat(4, 4), 1, 2) | |||
| || test_gru_layer_with_hidden_output(RandomMat(8, 2), 2, 2) | |||
| || test_gru_layer_with_hidden_output(RandomMat(16, 8), 7, 2) | |||
| || test_gru_layer_with_hidden_output(RandomMat(17, 8), 8, 2) | |||
| || test_gru_layer_with_hidden_output(RandomMat(19, 15), 8, 2) | |||
| || test_gru_layer_with_hidden_output(RandomMat(5, 16), 16, 2) | |||
| || test_gru_layer_with_hidden_output(RandomMat(3, 16), 8, 2) | |||
| || test_gru_layer_with_hidden_output(RandomMat(2, 5), 99, 2) | |||
| || test_gru_layer_with_hidden_output(RandomMat(4, 4), 1, 1) | |||
| || test_gru_layer_with_hidden_output(RandomMat(8, 2), 2, 1) | |||
| || test_gru_layer_with_hidden_output(RandomMat(16, 8), 7, 1) | |||
| || test_gru_layer_with_hidden_output(RandomMat(17, 8), 8, 1) | |||
| || test_gru_layer_with_hidden_output(RandomMat(19, 15), 8, 1) | |||
| || test_gru_layer_with_hidden_output(RandomMat(5, 16), 16, 1) | |||
| || test_gru_layer_with_hidden_output(RandomMat(3, 16), 8, 1) | |||
| || test_gru_layer_with_hidden_output(RandomMat(2, 5), 99, 1) | |||
| || test_gru_layer_with_hidden_output(RandomMat(4, 2), 1, 0) | |||
| || test_gru_layer_with_hidden_output(RandomMat(8, 2), 2, 0) | |||
| || test_gru_layer_with_hidden_output(RandomMat(16, 8), 7, 0) | |||
| || test_gru_layer_with_hidden_output(RandomMat(17, 8), 8, 0) | |||
| || test_gru_layer_with_hidden_output(RandomMat(19, 15), 8, 0) | |||
| || test_gru_layer_with_hidden_output(RandomMat(5, 16), 16, 0) | |||
| || test_gru_layer_with_hidden_output(RandomMat(3, 16), 8, 0) | |||
| || test_gru_layer_with_hidden_output(RandomMat(2, 5), 17, 0); | |||
| } | |||
| static int test_gru_2() | |||
| @@ -42,22 +42,23 @@ static int test_lstm(const ncnn::Mat& a, int outch, int direction) | |||
| int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size * 4); | |||
| pd.set(1, outch * input_size * 4 * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size * 4); | |||
| weights[1] = RandomMat(outch * 4); | |||
| weights[2] = RandomMat(outch * outch * 4); | |||
| weights[0] = RandomMat(outch * input_size * 4 * num_directions); | |||
| weights[1] = RandomMat(outch * 4 * num_directions); | |||
| weights[2] = RandomMat(outch * outch * 4 * num_directions); | |||
| // initial hidden state | |||
| ncnn::Mat hidden = RandomMat(outch); | |||
| ncnn::Mat hidden = RandomMat(outch, num_directions); | |||
| // initial cell state | |||
| ncnn::Mat cell = RandomMat(outch); | |||
| ncnn::Mat cell = RandomMat(outch, num_directions); | |||
| std::vector<ncnn::Mat> as(3); | |||
| as[0] = a; | |||
| @@ -73,6 +74,68 @@ int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) | |||
| return ret; | |||
| } | |||
| int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size * 4 * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size * 4 * num_directions); | |||
| weights[1] = RandomMat(outch * 4 * num_directions); | |||
| weights[2] = RandomMat(outch * outch * 4 * num_directions); | |||
| // initial hidden state | |||
| ncnn::Mat hidden = RandomMat(outch, num_directions); | |||
| // initial cell state | |||
| ncnn::Mat cell = RandomMat(outch, num_directions); | |||
| std::vector<ncnn::Mat> as(3); | |||
| as[0] = a; | |||
| as[1] = hidden; | |||
| as[2] = cell; | |||
| int ret = test_layer<ncnn::LSTM>("LSTM", pd, weights, as, 1); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_lstm_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); | |||
| } | |||
| return ret; | |||
| } | |||
| int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size * 4 * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size * 4 * num_directions); | |||
| weights[1] = RandomMat(outch * 4 * num_directions); | |||
| weights[2] = RandomMat(outch * outch * 4 * num_directions); | |||
| std::vector<ncnn::Mat> as(1); | |||
| as[0] = a; | |||
| int ret = test_layer<ncnn::LSTM>("LSTM", pd, weights, as, 3); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_lstm_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); | |||
| } | |||
| return ret; | |||
| } | |||
| static int test_lstm_0() | |||
| { | |||
| return 0 | |||
| @@ -90,6 +153,14 @@ static int test_lstm_0() | |||
| static int test_lstm_1() | |||
| { | |||
| return 0 | |||
| || test_lstm_layer_with_hidden(RandomMat(4, 4), 1, 2) | |||
| || test_lstm_layer_with_hidden(RandomMat(8, 2), 2, 2) | |||
| || test_lstm_layer_with_hidden(RandomMat(16, 8), 7, 2) | |||
| || test_lstm_layer_with_hidden(RandomMat(17, 8), 8, 2) | |||
| || test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 2) | |||
| || test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 2) | |||
| || test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 2) | |||
| || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 2) | |||
| || test_lstm_layer_with_hidden(RandomMat(4, 4), 1, 1) | |||
| || test_lstm_layer_with_hidden(RandomMat(8, 2), 2, 1) | |||
| || test_lstm_layer_with_hidden(RandomMat(16, 8), 7, 1) | |||
| @@ -105,7 +176,57 @@ static int test_lstm_1() | |||
| || test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 0) | |||
| || test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 0) | |||
| || test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 0) | |||
| || test_lstm_layer_with_hidden(RandomMat(2, 5), 17, 0); | |||
| || test_lstm_layer_with_hidden(RandomMat(2, 5), 17, 0) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(4, 4), 1, 2) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 2) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 2) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(17, 8), 8, 2) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 2) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 2) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 2) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 2) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(4, 4), 1, 1) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 1) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 1) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(17, 8), 8, 1) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 1) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 1) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 1) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 1) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(4, 2), 1, 0) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 0) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 0) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(17, 8), 8, 0) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 0) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 0) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 0) | |||
| || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 17, 0) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(4, 4), 1, 2) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 2) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 2) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(17, 8), 8, 2) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 2) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 2) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 2) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 2) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(4, 4), 1, 1) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 1) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 1) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(17, 8), 8, 1) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 1) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 1) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 1) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 1) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(4, 2), 1, 0) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 0) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 0) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(17, 8), 8, 0) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 0) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 0) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 0) | |||
| || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 17, 0); | |||
| } | |||
| static int test_lstm_2() | |||
| @@ -42,19 +42,20 @@ static int test_rnn(const ncnn::Mat& a, int outch, int direction) | |||
| int test_rnn_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size); | |||
| pd.set(1, outch * input_size * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size); | |||
| weights[1] = RandomMat(outch); | |||
| weights[2] = RandomMat(outch * outch); | |||
| weights[0] = RandomMat(outch * input_size * num_directions); | |||
| weights[1] = RandomMat(outch * num_directions); | |||
| weights[2] = RandomMat(outch * outch * num_directions); | |||
| // initial hidden state | |||
| ncnn::Mat hidden = RandomMat(outch); | |||
| ncnn::Mat hidden = RandomMat(outch, num_directions); | |||
| std::vector<ncnn::Mat> as(2); | |||
| as[0] = a; | |||
| @@ -69,6 +70,64 @@ int test_rnn_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) | |||
| return ret; | |||
| } | |||
| int test_rnn_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size * num_directions); | |||
| weights[1] = RandomMat(outch * num_directions); | |||
| weights[2] = RandomMat(outch * outch * num_directions); | |||
| // initial hidden state | |||
| ncnn::Mat hidden = RandomMat(outch, num_directions); | |||
| std::vector<ncnn::Mat> as(2); | |||
| as[0] = a; | |||
| as[1] = hidden; | |||
| int ret = test_layer<ncnn::RNN>("RNN", pd, weights, as, 1); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_rnn_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); | |||
| } | |||
| return ret; | |||
| } | |||
| int test_rnn_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction) | |||
| { | |||
| int input_size = a.w; | |||
| int num_directions = direction == 2 ? 2 : 1; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, outch); | |||
| pd.set(1, outch * input_size * num_directions); | |||
| pd.set(2, direction); | |||
| std::vector<ncnn::Mat> weights(3); | |||
| weights[0] = RandomMat(outch * input_size * num_directions); | |||
| weights[1] = RandomMat(outch * num_directions); | |||
| weights[2] = RandomMat(outch * outch * num_directions); | |||
| std::vector<ncnn::Mat> as(1); | |||
| as[0] = a; | |||
| int ret = test_layer<ncnn::RNN>("RNN", pd, weights, as, 2); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_rnn_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); | |||
| } | |||
| return ret; | |||
| } | |||
| static int test_rnn_0() | |||
| { | |||
| return 0 | |||
| @@ -86,6 +145,14 @@ static int test_rnn_0() | |||
| static int test_rnn_1() | |||
| { | |||
| return 0 | |||
| || test_rnn_layer_with_hidden(RandomMat(4, 4), 1, 2) | |||
| || test_rnn_layer_with_hidden(RandomMat(8, 2), 2, 2) | |||
| || test_rnn_layer_with_hidden(RandomMat(16, 8), 7, 2) | |||
| || test_rnn_layer_with_hidden(RandomMat(17, 8), 8, 2) | |||
| || test_rnn_layer_with_hidden(RandomMat(19, 15), 8, 2) | |||
| || test_rnn_layer_with_hidden(RandomMat(5, 16), 16, 2) | |||
| || test_rnn_layer_with_hidden(RandomMat(3, 16), 8, 2) | |||
| || test_rnn_layer_with_hidden(RandomMat(2, 5), 99, 2) | |||
| || test_rnn_layer_with_hidden(RandomMat(4, 4), 1, 1) | |||
| || test_rnn_layer_with_hidden(RandomMat(8, 2), 2, 1) | |||
| || test_rnn_layer_with_hidden(RandomMat(16, 8), 7, 1) | |||
| @@ -101,7 +168,57 @@ static int test_rnn_1() | |||
| || test_rnn_layer_with_hidden(RandomMat(19, 15), 8, 0) | |||
| || test_rnn_layer_with_hidden(RandomMat(5, 16), 16, 0) | |||
| || test_rnn_layer_with_hidden(RandomMat(3, 16), 8, 0) | |||
| || test_rnn_layer_with_hidden(RandomMat(2, 5), 17, 0); | |||
| || test_rnn_layer_with_hidden(RandomMat(2, 5), 17, 0) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(4, 4), 1, 2) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(8, 2), 2, 2) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(16, 8), 7, 2) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(17, 8), 8, 2) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(19, 15), 8, 2) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(5, 16), 16, 2) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(3, 16), 8, 2) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(2, 5), 99, 2) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(4, 4), 1, 1) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(8, 2), 2, 1) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(16, 8), 7, 1) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(17, 8), 8, 1) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(19, 15), 8, 1) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(5, 16), 16, 1) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(3, 16), 8, 1) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(2, 5), 99, 1) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(4, 2), 1, 0) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(8, 2), 2, 0) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(16, 8), 7, 0) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(17, 8), 8, 0) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(19, 15), 8, 0) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(5, 16), 16, 0) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(3, 16), 8, 0) | |||
| || test_rnn_layer_with_hidden_input(RandomMat(2, 5), 17, 0) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(4, 4), 1, 2) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(8, 2), 2, 2) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(16, 8), 7, 2) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(17, 8), 8, 2) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(19, 15), 8, 2) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(5, 16), 16, 2) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(3, 16), 8, 2) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(2, 5), 99, 2) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(4, 4), 1, 1) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(8, 2), 2, 1) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(16, 8), 7, 1) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(17, 8), 8, 1) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(19, 15), 8, 1) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(5, 16), 16, 1) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(3, 16), 8, 1) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(2, 5), 99, 1) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(4, 2), 1, 0) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(8, 2), 2, 0) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(16, 8), 7, 0) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(17, 8), 8, 0) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(19, 15), 8, 0) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(5, 16), 16, 0) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(3, 16), 8, 0) | |||
| || test_rnn_layer_with_hidden_output(RandomMat(2, 5), 17, 0); | |||
| } | |||
| static int test_rnn_2() | |||