Browse Source

rnn/lstm/gru with unequal input output (#3352)

tags/20211122
nihui GitHub 4 years ago
parent
commit
525df8bcc5
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 1517 additions and 216 deletions
  1. +219
    -33
      src/layer/arm/gru_arm.cpp
  2. +259
    -44
      src/layer/arm/lstm_arm.cpp
  3. +219
    -33
      src/layer/arm/rnn_arm.cpp
  4. +54
    -12
      src/layer/gru.cpp
  5. +64
    -14
      src/layer/lstm.cpp
  6. +169
    -25
      src/layer/riscv/gru_riscv.cpp
  7. +54
    -12
      src/layer/rnn.cpp
  8. +105
    -24
      src/layer/x86/lstm_x86.cpp
  9. +123
    -6
      tests/test_gru.cpp
  10. +128
    -7
      tests/test_lstm.cpp
  11. +123
    -6
      tests/test_rnn.cpp

+ 219
- 33
src/layer/arm/gru_arm.cpp View File

@@ -695,13 +695,7 @@ int GRU_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c

int GRU_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() != 2 || top_blobs.size() != 2)
{
return forward(bottom_blobs[0], top_blobs[0], opt);
}

const Mat& bottom_blob = bottom_blobs[0];

int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -720,24 +714,72 @@ int GRU_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
#endif

int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
Mat& hidden_state = top_blobs[1];
int num_directions = direction == 2 ? 2 : 1;

//Copy previous states
hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
hidden = bottom_blobs[1].clone(hidden_allocator);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 4u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
{
int ret = gru(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden_state, opt);
int ret = gru(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
if (ret != 0)
return ret;
}

if (direction == 2)
{
Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = gru(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const float* pf = top_blob_forward.row(i);
const float* pr = top_blob_reverse.row(i);
float* ptr = top_blob.row(i);

memcpy(ptr, pf, num_output * sizeof(float));
memcpy(ptr + num_output, pr, num_output * sizeof(float));
}
}

if (top_blobs.size() == 2)
{
top_blobs[1] = hidden;
}

return 0;
}

@@ -1625,16 +1667,29 @@ int GRU_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
int num_directions = direction == 2 ? 2 : 1;

Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_allocator;
cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 2u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// copy previous states
Mat hidden;
cast_float16_to_float32(bottom_blobs[1], hidden, opt);

// Uni directional
if (direction == 0 || direction == 1)
{
@@ -1643,7 +1698,42 @@ int GRU_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat
return ret;
}

cast_float32_to_float16(hidden, top_blobs[1], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const __fp16* pf = top_blob_forward.row<const __fp16>(i);
const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
__fp16* ptr = top_blob.row<__fp16>(i);

memcpy(ptr, pf, num_output * sizeof(__fp16));
memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
}
}

if (top_blobs.size() == 2)
{
cast_float32_to_float16(hidden, top_blobs[1], opt);
}

return 0;
}
@@ -1711,16 +1801,29 @@ int GRU_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
int num_directions = direction == 2 ? 2 : 1;

Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_allocator;
cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 2u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// copy previous states
Mat hidden;
cast_float16_to_float32(bottom_blobs[1], hidden, opt);

// Uni directional
if (direction == 0 || direction == 1)
{
@@ -1729,7 +1832,42 @@ int GRU_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma
return ret;
}

cast_float32_to_float16(hidden, top_blobs[1], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const __fp16* pf = top_blob_forward.row<const __fp16>(i);
const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
__fp16* ptr = top_blob.row<__fp16>(i);

memcpy(ptr, pf, num_output * sizeof(__fp16));
memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
}
}

if (top_blobs.size() == 2)
{
// The recurrent state is held as fp32 in this function (it comes from
// cast_float16_to_float32 or is created with 4-byte elements), but this
// path reads its state input as fp16. Convert back to fp16 for the state
// output so it matches the input precision, as the fp16s path does.
cast_float32_to_float16(hidden, top_blobs[1], opt);
}

return 0;
}
@@ -2365,16 +2503,29 @@ int GRU_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
int num_directions = direction == 2 ? 2 : 1;

top_blob.create(num_output, T, 2u, opt.blob_allocator);
Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_allocator;
cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// copy previous states
Mat hidden;
cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt);

// Uni directional
if (direction == 0 || direction == 1)
{
@@ -2383,7 +2534,42 @@ int GRU_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat
return ret;
}

cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = gru_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = gru_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
unsigned short* ptr = top_blob.row<unsigned short>(i);

memcpy(ptr, pf, num_output * sizeof(unsigned short));
memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
}
}

if (top_blobs.size() == 2)
{
cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
}

return 0;
}


+ 259
- 44
src/layer/arm/lstm_arm.cpp View File

@@ -423,13 +423,7 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)

int LSTM_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() != 3 || top_blobs.size() != 3)
{
return forward(bottom_blobs[0], top_blobs[0], opt);
}

const Mat& bottom_blob = bottom_blobs[0];

int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -448,26 +442,82 @@ int LSTM_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
#endif

int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
Mat& hidden_state = top_blobs[1];
Mat& cell_state = top_blobs[2];
int num_directions = direction == 2 ? 2 : 1;

Mat hidden;
Mat cell;
Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 3)
{
hidden = bottom_blobs[1].clone(hidden_cell_allocator);
cell = bottom_blobs[2].clone(hidden_cell_allocator);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);

//Copy previous states
hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
cell_state = bottom_blobs[2].clone(opt.blob_allocator);
cell.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (cell.empty())
return -100;
cell.fill(0.f);
}

top_blob.create(num_output, T, 4u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
{
int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden_state, cell_state, opt);
int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt);
if (ret != 0)
return ret;
}

if (direction == 2)
{
Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
Mat cell0 = cell.row_range(0, 1);
int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
Mat cell1 = cell.row_range(1, 1);
int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const float* pf = top_blob_forward.row(i);
const float* pr = top_blob_reverse.row(i);
float* ptr = top_blob.row(i);

memcpy(ptr, pf, num_output * sizeof(float));
memcpy(ptr + num_output, pr, num_output * sizeof(float));
}
}

if (top_blobs.size() == 3)
{
top_blobs[1] = hidden;
top_blobs[2] = cell;
}

return 0;
}

@@ -1182,17 +1232,35 @@ int LSTM_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];

top_blob.create(num_output, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;
int num_directions = direction == 2 ? 2 : 1;

// copy previous states
Mat hidden;
Mat cell;
cast_float16_to_float32(bottom_blobs[1], hidden, opt);
cast_float16_to_float32(bottom_blobs[2], cell, opt);
Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 3)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_cell_allocator;
cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
cast_float16_to_float32(bottom_blobs[2], cell, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);

cell.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (cell.empty())
return -100;
cell.fill(0.f);
}

Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
@@ -1202,8 +1270,45 @@ int LSTM_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma
return ret;
}

cast_float32_to_float16(hidden, top_blobs[1], opt);
cast_float32_to_float16(cell, top_blobs[2], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
Mat cell0 = cell.row_range(0, 1);
int ret0 = lstm_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
Mat cell1 = cell.row_range(1, 1);
int ret1 = lstm_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const __fp16* pf = top_blob_forward.row<const __fp16>(i);
const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
__fp16* ptr = top_blob.row<__fp16>(i);

memcpy(ptr, pf, num_output * sizeof(__fp16));
memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
}
}

if (top_blobs.size() == 3)
{
cast_float32_to_float16(hidden, top_blobs[1], opt);
cast_float32_to_float16(cell, top_blobs[2], opt);
}

return 0;
}
@@ -1277,17 +1382,35 @@ int LSTM_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];

top_blob.create(num_output, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;
int num_directions = direction == 2 ? 2 : 1;

// copy previous states
Mat hidden;
Mat cell;
cast_float16_to_float32(bottom_blobs[1], hidden, opt);
cast_float16_to_float32(bottom_blobs[2], cell, opt);
Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 3)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_cell_allocator;
cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
cast_float16_to_float32(bottom_blobs[2], cell, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);

cell.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (cell.empty())
return -100;
cell.fill(0.f);
}

Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
@@ -1297,8 +1420,45 @@ int LSTM_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<M
return ret;
}

cast_float32_to_float16(hidden, top_blobs[1], opt);
cast_float32_to_float16(cell, top_blobs[2], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
Mat cell0 = cell.row_range(0, 1);
int ret0 = lstm_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
Mat cell1 = cell.row_range(1, 1);
int ret1 = lstm_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const __fp16* pf = top_blob_forward.row<const __fp16>(i);
const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
__fp16* ptr = top_blob.row<__fp16>(i);

memcpy(ptr, pf, num_output * sizeof(__fp16));
memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
}
}

if (top_blobs.size() == 3)
{
cast_float32_to_float16(hidden, top_blobs[1], opt);
cast_float32_to_float16(cell, top_blobs[2], opt);
}

return 0;
}
@@ -1664,17 +1824,35 @@ int LSTM_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];

top_blob.create(num_output, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;
int num_directions = direction == 2 ? 2 : 1;

// copy previous states
Mat hidden;
Mat cell;
cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt);
cast_bfloat16_to_float32(bottom_blobs[2], cell, opt);
Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 3)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_cell_allocator;
cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
cast_bfloat16_to_float32(bottom_blobs[2], cell, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);

cell.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (cell.empty())
return -100;
cell.fill(0.f);
}

Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
@@ -1684,8 +1862,45 @@ int LSTM_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Ma
return ret;
}

cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
cast_float32_to_bfloat16(cell, top_blobs[2], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
Mat cell0 = cell.row_range(0, 1);
int ret0 = lstm_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
Mat cell1 = cell.row_range(1, 1);
int ret1 = lstm_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
unsigned short* ptr = top_blob.row<unsigned short>(i);

memcpy(ptr, pf, num_output * sizeof(unsigned short));
memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
}
}

if (top_blobs.size() == 3)
{
cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
cast_float32_to_bfloat16(cell, top_blobs[2], opt);
}

return 0;
}


+ 219
- 33
src/layer/arm/rnn_arm.cpp View File

@@ -377,13 +377,7 @@ int RNN_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c

int RNN_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() != 2 || top_blobs.size() != 2)
{
return forward(bottom_blobs[0], top_blobs[0], opt);
}

const Mat& bottom_blob = bottom_blobs[0];

int elembits = bottom_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -402,24 +396,72 @@ int RNN_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top
#endif

int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
Mat& hidden_state = top_blobs[1];
int num_directions = direction == 2 ? 2 : 1;

//Copy previous states
hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
hidden = bottom_blobs[1].clone(hidden_allocator);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 4u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
{
int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden_state, opt);
int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
if (ret != 0)
return ret;
}

if (direction == 2)
{
Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = rnn(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = rnn(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const float* pf = top_blob_forward.row(i);
const float* pr = top_blob_reverse.row(i);
float* ptr = top_blob.row(i);

memcpy(ptr, pf, num_output * sizeof(float));
memcpy(ptr + num_output, pr, num_output * sizeof(float));
}
}

if (top_blobs.size() == 2)
{
top_blobs[1] = hidden;
}

return 0;
}

@@ -965,16 +1007,29 @@ int RNN_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
int num_directions = direction == 2 ? 2 : 1;

top_blob.create(num_output, T, 2u, opt.blob_allocator);
Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_allocator;
cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// copy previous states
Mat hidden;
cast_float16_to_float32(bottom_blobs[1], hidden, opt);

// Uni directional
if (direction == 0 || direction == 1)
{
@@ -983,7 +1038,42 @@ int RNN_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat
return ret;
}

cast_float32_to_float16(hidden, top_blobs[1], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = rnn_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = rnn_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const __fp16* pf = top_blob_forward.row<const __fp16>(i);
const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
__fp16* ptr = top_blob.row<__fp16>(i);

memcpy(ptr, pf, num_output * sizeof(__fp16));
memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
}
}

if (top_blobs.size() == 2)
{
cast_float32_to_float16(hidden, top_blobs[1], opt);
}

return 0;
}
@@ -1051,16 +1141,29 @@ int RNN_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
int num_directions = direction == 2 ? 2 : 1;

Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_allocator;
cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 2u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// copy previous states
Mat hidden;
cast_float16_to_float32(bottom_blobs[1], hidden, opt);

// Uni directional
if (direction == 0 || direction == 1)
{
@@ -1069,7 +1172,42 @@ int RNN_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Ma
return ret;
}

cast_float32_to_float16(hidden, top_blobs[1], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = rnn_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = rnn_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const __fp16* pf = top_blob_forward.row<const __fp16>(i);
const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
__fp16* ptr = top_blob.row<__fp16>(i);

memcpy(ptr, pf, num_output * sizeof(__fp16));
memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
}
}

if (top_blobs.size() == 2)
{
cast_float32_to_float16(hidden, top_blobs[1], opt);
}

return 0;
}
@@ -1387,16 +1525,29 @@ int RNN_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
int num_directions = direction == 2 ? 2 : 1;

Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_allocator;
cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 2u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// copy previous states
Mat hidden;
cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt);

// Uni directional
if (direction == 0 || direction == 1)
{
@@ -1405,7 +1556,42 @@ int RNN_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat
return ret;
}

cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = rnn_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = rnn_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
unsigned short* ptr = top_blob.row<unsigned short>(i);

memcpy(ptr, pf, num_output * sizeof(unsigned short));
memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
}
}

if (top_blobs.size() == 2)
{
cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
}

return 0;
}


+ 54
- 12
src/layer/gru.cpp View File

@@ -29,8 +29,6 @@ int GRU::load_param(const ParamDict& pd)
num_output = pd.get(0, 0);
weight_data_size = pd.get(1, 0);
direction = pd.get(2, 0);
if (direction == 2)
one_blob_only = true;
return 0;
}

@@ -223,30 +221,74 @@ int GRU::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const

// Multi-blob GRU forward pass.
// Inputs:  bottom_blobs[0] is the input sequence (T = bottom_blob.h rows);
//          bottom_blobs[1], when exactly two inputs are given, is cloned as the initial hidden state.
//          Otherwise the hidden state is created with num_directions rows of num_output floats and zero-filled.
// Outputs: top_blobs[0] gets T rows of num_output * num_directions 4-byte elements;
//          top_blobs[1], when exactly two outputs are requested, is set to the final hidden state.
// Returns 0 on success, -100 when a Mat is still empty after create(), or the non-zero code returned by gru().
// NOTE(review): this listing looks like a commit diff rendered without +/- markers, so superseded lines
// (the early single-blob fallback, the hidden_state variants, the duplicate top_blob/ret declarations)
// appear next to their replacements -- confirm against the real file before relying on any single line.
int GRU::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() != 2 || top_blobs.size() != 2)
{
return forward(bottom_blobs[0], top_blobs[0], opt);
}
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
Mat& hidden_state = top_blobs[1];
// two state rows are needed only when direction == 2
int num_directions = direction == 2 ? 2 : 1;

//Copy previous states
hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
Mat hidden;
// the state uses the blob allocator only when it will be handed out as top_blobs[1]; otherwise it is workspace memory
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
// caller supplied an initial state: work on a private copy so the input blob is not modified
hidden = bottom_blobs[1].clone(hidden_allocator);
}
else
{
// no initial state supplied: start from zeros
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 4u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
// each output row holds num_output values per direction
top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
{
int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, opt);
// single pass writes straight into top_blob using weight channel 0 and the whole hidden Mat
int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
if (ret != 0)
return ret;
}

// direction == 2: run gru() twice into temporary workspace Mats, then interleave the results
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

// pass with direction argument 0 uses weight channel 0; row_range(0, 1) is presumably row 0 of the state -- TODO confirm Mat::row_range semantics
Mat hidden0 = hidden.row_range(0, 1);
int ret0 = gru(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

// pass with direction argument 1 uses weight channel 1 and (presumably) row 1 of the state
Mat hidden1 = hidden.row_range(1, 1);
int ret1 = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
// for every row i: first num_output floats come from top_blob_forward, the next num_output from top_blob_reverse
for (int i = 0; i < T; i++)
{
const float* pf = top_blob_forward.row(i);
const float* pr = top_blob_reverse.row(i);
float* ptr = top_blob.row(i);

memcpy(ptr, pf, num_output * sizeof(float));
memcpy(ptr + num_output, pr, num_output * sizeof(float));
}
}

// expose the final hidden state only when a second output blob was requested
if (top_blobs.size() == 2)
{
top_blobs[1] = hidden;
}

return 0;
}



+ 64
- 14
src/layer/lstm.cpp View File

@@ -29,8 +29,6 @@ int LSTM::load_param(const ParamDict& pd)
num_output = pd.get(0, 0);
weight_data_size = pd.get(1, 0);
direction = pd.get(2, 0);
if (direction == 2)
one_blob_only = true;
return 0;
}

@@ -232,32 +230,84 @@ int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons

// Multi-blob LSTM forward pass.
// Inputs:  bottom_blobs[0] is the input sequence (T = bottom_blob.h rows);
//          bottom_blobs[1] / bottom_blobs[2], when exactly three inputs are given, are cloned as the
//          initial hidden and cell states. Otherwise both states are created with num_directions rows
//          of num_output floats and zero-filled.
// Outputs: top_blobs[0] gets T rows of num_output * num_directions 4-byte elements;
//          top_blobs[1] / top_blobs[2], when exactly three outputs are requested, are set to the final
//          hidden and cell states.
// Returns 0 on success, -100 when a Mat is still empty after create(), or the non-zero code returned by lstm().
// NOTE(review): this listing looks like a commit diff rendered without +/- markers, so superseded lines
// (the early single-blob fallback, the hidden_state/cell_state variants, the duplicate top_blob/ret
// declarations) appear next to their replacements -- confirm against the real file.
int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() != 3 || top_blobs.size() != 3)
{
return forward(bottom_blobs[0], top_blobs[0], opt);
}
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
Mat& hidden_state = top_blobs[1];
Mat& cell_state = top_blobs[2];
// two state rows are needed only when direction == 2
int num_directions = direction == 2 ? 2 : 1;

Mat hidden;
Mat cell;
// states use the blob allocator only when they will be handed out as top_blobs[1] / top_blobs[2]
Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 3)
{
// caller supplied initial states: work on private copies so the input blobs are not modified
hidden = bottom_blobs[1].clone(hidden_cell_allocator);
cell = bottom_blobs[2].clone(hidden_cell_allocator);
}
else
{
// no initial states supplied: start both from zeros
hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);

//Copy previous states
hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
cell_state = bottom_blobs[2].clone(opt.blob_allocator);
cell.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (cell.empty())
return -100;
cell.fill(0.f);
}

top_blob.create(num_output, T, 4u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
// each output row holds num_output values per direction
top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
{
int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, cell_state, opt);
// single pass writes straight into top_blob using weight channel 0 and the whole hidden/cell Mats
int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt);
if (ret != 0)
return ret;
}

// direction == 2: run lstm() twice into temporary workspace Mats, then interleave the results
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

// pass with direction argument 0 uses weight channel 0; row_range(0, 1) is presumably row 0 of each state -- TODO confirm Mat::row_range semantics
Mat hidden0 = hidden.row_range(0, 1);
Mat cell0 = cell.row_range(0, 1);
int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, cell0, opt);
if (ret0 != 0)
return ret0;

// pass with direction argument 1 uses weight channel 1 and (presumably) row 1 of each state
Mat hidden1 = hidden.row_range(1, 1);
Mat cell1 = cell.row_range(1, 1);
int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, cell1, opt);
if (ret1 != 0)
return ret1;

// concat w
// for every row i: first num_output floats come from top_blob_forward, the next num_output from top_blob_reverse
for (int i = 0; i < T; i++)
{
const float* pf = top_blob_forward.row(i);
const float* pr = top_blob_reverse.row(i);
float* ptr = top_blob.row(i);

memcpy(ptr, pf, num_output * sizeof(float));
memcpy(ptr + num_output, pr, num_output * sizeof(float));
}
}

// expose the final states only when three output blobs were requested
if (top_blobs.size() == 3)
{
top_blobs[1] = hidden;
top_blobs[2] = cell;
}

return 0;
}



+ 169
- 25
src/layer/riscv/gru_riscv.cpp View File

@@ -301,11 +301,6 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)

int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() != 2 || top_blobs.size() != 2)
{
return forward(bottom_blobs[0], top_blobs[0], opt);
}

const Mat& bottom_blob = bottom_blobs[0];
int elembits = bottom_blob.elembits();

@@ -321,24 +316,73 @@ int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
#endif

int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
Mat& hidden_state = top_blobs[1];
int num_directions = direction == 2 ? 2 : 1;

//Copy previous states
hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
hidden = bottom_blobs[1].clone(hidden_allocator);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 4u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
{
int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, opt);
Mat hidden0 = hidden.row_range(0, 1);
int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
if (ret != 0)
return ret;
}

if (direction == 2)
{
Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = gru(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const float* pf = top_blob_forward.row(i);
const float* pr = top_blob_reverse.row(i);
float* ptr = top_blob.row(i);

memcpy(ptr, pf, num_output * sizeof(float));
memcpy(ptr + num_output, pr, num_output * sizeof(float));
}
}

if (top_blobs.size() == 2)
{
top_blobs[1] = hidden;
}

return 0;
#endif
return GRU::forward(bottom_blobs, top_blobs, opt);
@@ -587,24 +631,75 @@ int GRU_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<M
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
int num_directions = direction == 2 ? 2 : 1;

Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_allocator;
cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

Mat& top_blob = top_blobs[0];
top_blob.create(num_output, T, 2u, opt.blob_allocator);
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

//Copy previous states
Mat hidden;
cast_float16_to_float32(bottom_blobs[1], hidden, opt);

// Uni directional
if (direction == 0 || direction == 1)
{
int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
Mat hidden0 = hidden.row_range(0, 1);
int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
if (ret != 0)
return ret;
}

cast_float32_to_float16(hidden, top_blobs[1], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const __fp16* pf = top_blob_forward.row<const __fp16>(i);
const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
__fp16* ptr = top_blob.row<__fp16>(i);

memcpy(ptr, pf, num_output * sizeof(__fp16));
memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
}
}

if (top_blobs.size() == 2)
{
cast_float32_to_float16(hidden, top_blobs[1], opt);
}

return 0;
}

@@ -853,15 +948,29 @@ int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<
{
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
int num_directions = direction == 2 ? 2 : 1;

Mat hidden;
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
Option opt_cast = opt;
opt_cast.blob_allocator = hidden_allocator;
cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

Mat& top_blob = top_blobs[0];
top_blob.create(num_output, T, 2u, opt.blob_allocator);
top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
if (top_blob.empty())
return -100;

//Copy previous states
Mat hidden;
cast_float16_to_float32(bottom_blobs[1], hidden, opt);

// Uni directional
if (direction == 0 || direction == 1)
{
@@ -870,11 +979,46 @@ int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<
return ret;
}

cast_float32_to_float16(hidden, top_blobs[1], opt);
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

Mat hidden1 = hidden.row_range(1, 1);
int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
for (int i = 0; i < T; i++)
{
const __fp16* pf = top_blob_forward.row<const __fp16>(i);
const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
__fp16* ptr = top_blob.row<__fp16>(i);

memcpy(ptr, pf, num_output * sizeof(__fp16));
memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
}
}

if (top_blobs.size() == 2)
{
cast_float32_to_float16(hidden, top_blobs[1], opt);
}

return 0;
}

#endif

} // namespace ncnn
} // namespace ncnn

+ 54
- 12
src/layer/rnn.cpp View File

@@ -29,8 +29,6 @@ int RNN::load_param(const ParamDict& pd)
num_output = pd.get(0, 0);
weight_data_size = pd.get(1, 0);
direction = pd.get(2, 0);
if (direction == 2)
one_blob_only = true;
return 0;
}

@@ -172,30 +170,74 @@ int RNN::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const

// Multi-blob RNN forward pass.
// Inputs:  bottom_blobs[0] is the input sequence (T = bottom_blob.h rows);
//          bottom_blobs[1], when exactly two inputs are given, is cloned as the initial hidden state.
//          Otherwise the hidden state is created with num_directions rows of num_output floats and zero-filled.
// Outputs: top_blobs[0] gets T rows of num_output * num_directions 4-byte elements;
//          top_blobs[1], when exactly two outputs are requested, is set to the final hidden state.
// Returns 0 on success, -100 when a Mat is still empty after create(), or the non-zero code returned by rnn().
// NOTE(review): this listing looks like a commit diff rendered without +/- markers, so superseded lines
// (the early single-blob fallback, the hidden_state variants, the duplicate top_blob/ret declarations)
// appear next to their replacements -- confirm against the real file before relying on any single line.
int RNN::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() != 2 || top_blobs.size() != 2)
{
return forward(bottom_blobs[0], top_blobs[0], opt);
}
const Mat& bottom_blob = bottom_blobs[0];
int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
Mat& hidden_state = top_blobs[1];
// two state rows are needed only when direction == 2
int num_directions = direction == 2 ? 2 : 1;

//Copy previous states
hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
Mat hidden;
// the state uses the blob allocator only when it will be handed out as top_blobs[1]; otherwise it is workspace memory
Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 2)
{
// caller supplied an initial state: work on a private copy so the input blob is not modified
hidden = bottom_blobs[1].clone(hidden_allocator);
}
else
{
// no initial state supplied: start from zeros
hidden.create(num_output, num_directions, 4u, hidden_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);
}

top_blob.create(num_output, T, 4u, opt.blob_allocator);
Mat& top_blob = top_blobs[0];
// each output row holds num_output values per direction
top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;

// Uni directional
if (direction == 0 || direction == 1)
{
int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, opt);
// single pass writes straight into top_blob using weight channel 0 and the whole hidden Mat
int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
if (ret != 0)
return ret;
}

// direction == 2: run rnn() twice into temporary workspace Mats, then interleave the results
if (direction == 2)
{
Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

// pass with direction argument 0 uses weight channel 0; row_range(0, 1) is presumably row 0 of the state -- TODO confirm Mat::row_range semantics
Mat hidden0 = hidden.row_range(0, 1);
int ret0 = rnn(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
if (ret0 != 0)
return ret0;

// pass with direction argument 1 uses weight channel 1 and (presumably) row 1 of the state
Mat hidden1 = hidden.row_range(1, 1);
int ret1 = rnn(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
if (ret1 != 0)
return ret1;

// concat w
// for every row i: first num_output floats come from top_blob_forward, the next num_output from top_blob_reverse
for (int i = 0; i < T; i++)
{
const float* pf = top_blob_forward.row(i);
const float* pr = top_blob_reverse.row(i);
float* ptr = top_blob.row(i);

memcpy(ptr, pf, num_output * sizeof(float));
memcpy(ptr + num_output, pr, num_output * sizeof(float));
}
}

// expose the final hidden state only when a second output blob was requested
if (top_blobs.size() == 2)
{
top_blobs[1] = hidden;
}

return 0;
}



+ 105
- 24
src/layer/x86/lstm_x86.cpp View File

@@ -910,42 +910,123 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
int LSTM_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
#if __AVX__
if (bottom_blobs.size() != 3 || top_blobs.size() != 3)
{
return forward(bottom_blobs[0], top_blobs[0], opt);
}
const Mat& bottom_blob = bottom_blobs[0];

int T = bottom_blob.h;
Mat& top_blob = top_blobs[0];
Mat& hidden_state = top_blobs[1];
Mat& cell_state = top_blobs[2];
int num_directions = direction == 2 ? 2 : 1;

//Copy previous states
hidden_state = bottom_blobs[1].clone(opt.blob_allocator);
cell_state = bottom_blobs[2].clone(opt.blob_allocator);
Mat hidden;
Mat cell;
Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
if (bottom_blobs.size() == 3)
{
hidden = bottom_blobs[1].clone(hidden_cell_allocator);
cell = bottom_blobs[2].clone(hidden_cell_allocator);
}
else
{
hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (hidden.empty())
return -100;
hidden.fill(0.f);

top_blob.create(num_output, T, 4u, opt.blob_allocator);
cell.create(num_output, num_directions, 4u, hidden_cell_allocator);
if (cell.empty())
return -100;
cell.fill(0.f);
}

Mat& top_blob = top_blobs[0];
top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
if (top_blob.empty())
return -100;
#if __AVX2__
if (opt.use_weight_fp16_storage)

// Uni directional
if (direction == 0 || direction == 1)
{
// Uni directional
int ret = lstm_fp16(bottom_blob, top_blob, direction, weight_xc_data_fp16.channel(0), bias_c_data.channel(0), weight_hc_data_fp16.channel(0), hidden_state, cell_state, opt);
if (ret != 0)
return ret;
#if __AVX2__
if (opt.use_weight_fp16_storage)
{
int ret = lstm_fp16(bottom_blob, top_blob, direction, weight_xc_data_fp16.channel(0), bias_c_data.channel(0), weight_hc_data_fp16.channel(0), hidden, cell, opt);
if (ret != 0)
return ret;
}
else
{
#endif
int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt);
if (ret != 0)
return ret;
#if __AVX2__
}
#endif
}
else

if (direction == 2)
{
Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_forward.empty())
return -100;

Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
if (top_blob_reverse.empty())
return -100;

Mat hidden0 = hidden.row_range(0, 1);
Mat cell0 = cell.row_range(0, 1);
#if __AVX2__
if (opt.use_weight_fp16_storage)
{
int ret = lstm_fp16(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16.channel(0), bias_c_data.channel(0), weight_hc_data_fp16.channel(0), hidden0, cell0, opt);
if (ret != 0)
return ret;
}
else
{
#endif
// Uni directional
int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden_state, cell_state, opt);
if (ret != 0)
return ret;
int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, cell0, opt);
if (ret0 != 0)
return ret0;
#if __AVX2__
}
}
#endif

Mat hidden1 = hidden.row_range(1, 1);
Mat cell1 = cell.row_range(1, 1);
#if __AVX2__
if (opt.use_weight_fp16_storage)
{
int ret = lstm_fp16(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16.channel(1), bias_c_data.channel(1), weight_hc_data_fp16.channel(1), hidden1, cell1, opt);
if (ret != 0)
return ret;
}
else
{
#endif
int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, cell1, opt);
if (ret1 != 0)
return ret1;
#if __AVX2__
}
#endif

// concat w
for (int i = 0; i < T; i++)
{
const float* pf = top_blob_forward.row(i);
const float* pr = top_blob_reverse.row(i);
float* ptr = top_blob.row(i);

memcpy(ptr, pf, num_output * sizeof(float));
memcpy(ptr + num_output, pr, num_output * sizeof(float));
}
}

if (top_blobs.size() == 3)
{
top_blobs[1] = hidden;
top_blobs[2] = cell;
}

return 0;
#else
return LSTM::forward(bottom_blobs, top_blobs, opt);


+ 123
- 6
tests/test_gru.cpp View File

@@ -42,19 +42,20 @@ static int test_gru(const ncnn::Mat& a, int outch, int direction)
int test_gru_layer_with_hidden(const ncnn::Mat& a, int outch, int direction)
{
int input_size = a.w;
int num_directions = direction == 2 ? 2 : 1;

ncnn::ParamDict pd;
pd.set(0, outch);
pd.set(1, outch * input_size * 3);
pd.set(1, outch * input_size * 3 * num_directions);
pd.set(2, direction);

std::vector<ncnn::Mat> weights(3);
weights[0] = RandomMat(outch * input_size * 3);
weights[1] = RandomMat(outch * 4);
weights[2] = RandomMat(outch * outch * 3);
weights[0] = RandomMat(outch * input_size * 3 * num_directions);
weights[1] = RandomMat(outch * 4 * num_directions);
weights[2] = RandomMat(outch * outch * 3 * num_directions);

// initial hidden state
ncnn::Mat hidden = RandomMat(outch);
ncnn::Mat hidden = RandomMat(outch, num_directions);

std::vector<ncnn::Mat> as(2);
as[0] = a;
@@ -69,6 +70,64 @@ int test_gru_layer_with_hidden(const ncnn::Mat& a, int outch, int direction)
return ret;
}

// Runs the GRU layer test with two input blobs (sequence + random initial
// hidden state). The trailing test_layer argument is 1 here, presumably the
// number of output blobs requested -- confirm against test_layer's signature.
// Param ids: 0 = num_output, 1 = weight_data_size, 2 = direction.
// Returns test_layer's result; a non-zero result is also reported on stderr.
int test_gru_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction)
{
    const int in_w = a.w;
    const int dirs = (direction == 2) ? 2 : 1;
    const int xc_count = outch * in_w * 3 * dirs;

    ncnn::ParamDict pd;
    pd.set(0, outch);
    pd.set(1, xc_count);
    pd.set(2, direction);

    // random weights, generated in the same order as the blobs are indexed
    std::vector<ncnn::Mat> weights(3);
    weights[0] = RandomMat(xc_count);
    weights[1] = RandomMat(outch * 4 * dirs);
    weights[2] = RandomMat(outch * outch * 3 * dirs);

    // inputs: the sequence followed by a random initial hidden state
    std::vector<ncnn::Mat> as(2);
    as[0] = a;
    as[1] = RandomMat(outch, dirs);

    const int ret = test_layer<ncnn::GRU>("GRU", pd, weights, as, 1);
    if (ret == 0)
        return 0;

    fprintf(stderr, "test_gru_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction);
    return ret;
}

// Runs the GRU layer test with a single input blob (no initial hidden state).
// The trailing test_layer argument is 2 here, presumably the number of output
// blobs requested -- confirm against test_layer's signature.
// Param ids: 0 = num_output, 1 = weight_data_size, 2 = direction.
// Returns test_layer's result; a non-zero result is also reported on stderr.
int test_gru_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction)
{
    const int in_w = a.w;
    const int dirs = (direction == 2) ? 2 : 1;
    const int xc_count = outch * in_w * 3 * dirs;

    ncnn::ParamDict pd;
    pd.set(0, outch);
    pd.set(1, xc_count);
    pd.set(2, direction);

    // random weights, generated in the same order as the blobs are indexed
    std::vector<ncnn::Mat> weights(3);
    weights[0] = RandomMat(xc_count);
    weights[1] = RandomMat(outch * 4 * dirs);
    weights[2] = RandomMat(outch * outch * 3 * dirs);

    // only the sequence is fed in
    std::vector<ncnn::Mat> as(1);
    as[0] = a;

    const int ret = test_layer<ncnn::GRU>("GRU", pd, weights, as, 2);
    if (ret == 0)
        return 0;

    fprintf(stderr, "test_gru_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction);
    return ret;
}

static int test_gru_0()
{
return 0
@@ -86,6 +145,14 @@ static int test_gru_0()
static int test_gru_1()
{
return 0
|| test_gru_layer_with_hidden(RandomMat(4, 4), 1, 2)
|| test_gru_layer_with_hidden(RandomMat(8, 2), 2, 2)
|| test_gru_layer_with_hidden(RandomMat(16, 8), 7, 2)
|| test_gru_layer_with_hidden(RandomMat(17, 8), 8, 2)
|| test_gru_layer_with_hidden(RandomMat(19, 15), 8, 2)
|| test_gru_layer_with_hidden(RandomMat(5, 16), 16, 2)
|| test_gru_layer_with_hidden(RandomMat(3, 16), 8, 2)
|| test_gru_layer_with_hidden(RandomMat(2, 5), 99, 2)
|| test_gru_layer_with_hidden(RandomMat(4, 4), 1, 1)
|| test_gru_layer_with_hidden(RandomMat(8, 2), 2, 1)
|| test_gru_layer_with_hidden(RandomMat(16, 8), 7, 1)
@@ -101,7 +168,57 @@ static int test_gru_1()
|| test_gru_layer_with_hidden(RandomMat(19, 15), 8, 0)
|| test_gru_layer_with_hidden(RandomMat(5, 16), 16, 0)
|| test_gru_layer_with_hidden(RandomMat(3, 16), 8, 0)
|| test_gru_layer_with_hidden(RandomMat(2, 5), 17, 0);
|| test_gru_layer_with_hidden(RandomMat(2, 5), 17, 0)

|| test_gru_layer_with_hidden_input(RandomMat(4, 4), 1, 2)
|| test_gru_layer_with_hidden_input(RandomMat(8, 2), 2, 2)
|| test_gru_layer_with_hidden_input(RandomMat(16, 8), 7, 2)
|| test_gru_layer_with_hidden_input(RandomMat(17, 8), 8, 2)
|| test_gru_layer_with_hidden_input(RandomMat(19, 15), 8, 2)
|| test_gru_layer_with_hidden_input(RandomMat(5, 16), 16, 2)
|| test_gru_layer_with_hidden_input(RandomMat(3, 16), 8, 2)
|| test_gru_layer_with_hidden_input(RandomMat(2, 5), 99, 2)
|| test_gru_layer_with_hidden_input(RandomMat(4, 4), 1, 1)
|| test_gru_layer_with_hidden_input(RandomMat(8, 2), 2, 1)
|| test_gru_layer_with_hidden_input(RandomMat(16, 8), 7, 1)
|| test_gru_layer_with_hidden_input(RandomMat(17, 8), 8, 1)
|| test_gru_layer_with_hidden_input(RandomMat(19, 15), 8, 1)
|| test_gru_layer_with_hidden_input(RandomMat(5, 16), 16, 1)
|| test_gru_layer_with_hidden_input(RandomMat(3, 16), 8, 1)
|| test_gru_layer_with_hidden_input(RandomMat(2, 5), 99, 1)
|| test_gru_layer_with_hidden_input(RandomMat(4, 2), 1, 0)
|| test_gru_layer_with_hidden_input(RandomMat(8, 2), 2, 0)
|| test_gru_layer_with_hidden_input(RandomMat(16, 8), 7, 0)
|| test_gru_layer_with_hidden_input(RandomMat(17, 8), 8, 0)
|| test_gru_layer_with_hidden_input(RandomMat(19, 15), 8, 0)
|| test_gru_layer_with_hidden_input(RandomMat(5, 16), 16, 0)
|| test_gru_layer_with_hidden_input(RandomMat(3, 16), 8, 0)
|| test_gru_layer_with_hidden_input(RandomMat(2, 5), 17, 0)

|| test_gru_layer_with_hidden_output(RandomMat(4, 4), 1, 2)
|| test_gru_layer_with_hidden_output(RandomMat(8, 2), 2, 2)
|| test_gru_layer_with_hidden_output(RandomMat(16, 8), 7, 2)
|| test_gru_layer_with_hidden_output(RandomMat(17, 8), 8, 2)
|| test_gru_layer_with_hidden_output(RandomMat(19, 15), 8, 2)
|| test_gru_layer_with_hidden_output(RandomMat(5, 16), 16, 2)
|| test_gru_layer_with_hidden_output(RandomMat(3, 16), 8, 2)
|| test_gru_layer_with_hidden_output(RandomMat(2, 5), 99, 2)
|| test_gru_layer_with_hidden_output(RandomMat(4, 4), 1, 1)
|| test_gru_layer_with_hidden_output(RandomMat(8, 2), 2, 1)
|| test_gru_layer_with_hidden_output(RandomMat(16, 8), 7, 1)
|| test_gru_layer_with_hidden_output(RandomMat(17, 8), 8, 1)
|| test_gru_layer_with_hidden_output(RandomMat(19, 15), 8, 1)
|| test_gru_layer_with_hidden_output(RandomMat(5, 16), 16, 1)
|| test_gru_layer_with_hidden_output(RandomMat(3, 16), 8, 1)
|| test_gru_layer_with_hidden_output(RandomMat(2, 5), 99, 1)
|| test_gru_layer_with_hidden_output(RandomMat(4, 2), 1, 0)
|| test_gru_layer_with_hidden_output(RandomMat(8, 2), 2, 0)
|| test_gru_layer_with_hidden_output(RandomMat(16, 8), 7, 0)
|| test_gru_layer_with_hidden_output(RandomMat(17, 8), 8, 0)
|| test_gru_layer_with_hidden_output(RandomMat(19, 15), 8, 0)
|| test_gru_layer_with_hidden_output(RandomMat(5, 16), 16, 0)
|| test_gru_layer_with_hidden_output(RandomMat(3, 16), 8, 0)
|| test_gru_layer_with_hidden_output(RandomMat(2, 5), 17, 0);
}

static int test_gru_2()


+ 128
- 7
tests/test_lstm.cpp View File

@@ -42,22 +42,23 @@ static int test_lstm(const ncnn::Mat& a, int outch, int direction)
int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction)
{
int input_size = a.w;
int num_directions = direction == 2 ? 2 : 1;

ncnn::ParamDict pd;
pd.set(0, outch);
pd.set(1, outch * input_size * 4);
pd.set(1, outch * input_size * 4 * num_directions);
pd.set(2, direction);

std::vector<ncnn::Mat> weights(3);
weights[0] = RandomMat(outch * input_size * 4);
weights[1] = RandomMat(outch * 4);
weights[2] = RandomMat(outch * outch * 4);
weights[0] = RandomMat(outch * input_size * 4 * num_directions);
weights[1] = RandomMat(outch * 4 * num_directions);
weights[2] = RandomMat(outch * outch * 4 * num_directions);

// initial hidden state
ncnn::Mat hidden = RandomMat(outch);
ncnn::Mat hidden = RandomMat(outch, num_directions);

// initial cell state
ncnn::Mat cell = RandomMat(outch);
ncnn::Mat cell = RandomMat(outch, num_directions);

std::vector<ncnn::Mat> as(3);
as[0] = a;
@@ -73,6 +74,68 @@ int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction)
return ret;
}

// Runs the LSTM layer test with three input blobs (sequence + random initial
// hidden state + random initial cell state). The trailing test_layer argument
// is 1 here, presumably the number of output blobs requested -- confirm
// against test_layer's signature.
// Param ids: 0 = num_output, 1 = weight_data_size, 2 = direction.
// Returns test_layer's result; a non-zero result is also reported on stderr.
int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction)
{
    const int in_w = a.w;
    const int dirs = (direction == 2) ? 2 : 1;
    const int xc_count = outch * in_w * 4 * dirs;

    ncnn::ParamDict pd;
    pd.set(0, outch);
    pd.set(1, xc_count);
    pd.set(2, direction);

    // random weights, generated in the same order as the blobs are indexed
    std::vector<ncnn::Mat> weights(3);
    weights[0] = RandomMat(xc_count);
    weights[1] = RandomMat(outch * 4 * dirs);
    weights[2] = RandomMat(outch * outch * 4 * dirs);

    // inputs: the sequence, then the initial hidden state, then the initial cell state
    std::vector<ncnn::Mat> as(3);
    as[0] = a;
    as[1] = RandomMat(outch, dirs);
    as[2] = RandomMat(outch, dirs);

    const int ret = test_layer<ncnn::LSTM>("LSTM", pd, weights, as, 1);
    if (ret == 0)
        return 0;

    fprintf(stderr, "test_lstm_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction);
    return ret;
}

// Runs the LSTM layer test with a single input blob (no initial states).
// The trailing test_layer argument is 3 here, presumably the number of output
// blobs requested -- confirm against test_layer's signature.
// Param ids: 0 = num_output, 1 = weight_data_size, 2 = direction.
// Returns test_layer's result; a non-zero result is also reported on stderr.
int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction)
{
    const int in_w = a.w;
    const int dirs = (direction == 2) ? 2 : 1;
    const int xc_count = outch * in_w * 4 * dirs;

    ncnn::ParamDict pd;
    pd.set(0, outch);
    pd.set(1, xc_count);
    pd.set(2, direction);

    // random weights, generated in the same order as the blobs are indexed
    std::vector<ncnn::Mat> weights(3);
    weights[0] = RandomMat(xc_count);
    weights[1] = RandomMat(outch * 4 * dirs);
    weights[2] = RandomMat(outch * outch * 4 * dirs);

    // only the sequence is fed in
    std::vector<ncnn::Mat> as(1);
    as[0] = a;

    const int ret = test_layer<ncnn::LSTM>("LSTM", pd, weights, as, 3);
    if (ret == 0)
        return 0;

    fprintf(stderr, "test_lstm_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction);
    return ret;
}

static int test_lstm_0()
{
return 0
@@ -90,6 +153,14 @@ static int test_lstm_0()
static int test_lstm_1()
{
return 0
|| test_lstm_layer_with_hidden(RandomMat(4, 4), 1, 2)
|| test_lstm_layer_with_hidden(RandomMat(8, 2), 2, 2)
|| test_lstm_layer_with_hidden(RandomMat(16, 8), 7, 2)
|| test_lstm_layer_with_hidden(RandomMat(17, 8), 8, 2)
|| test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 2)
|| test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 2)
|| test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 2)
|| test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 2)
|| test_lstm_layer_with_hidden(RandomMat(4, 4), 1, 1)
|| test_lstm_layer_with_hidden(RandomMat(8, 2), 2, 1)
|| test_lstm_layer_with_hidden(RandomMat(16, 8), 7, 1)
@@ -105,7 +176,57 @@ static int test_lstm_1()
|| test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 0)
|| test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 0)
|| test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 0)
|| test_lstm_layer_with_hidden(RandomMat(2, 5), 17, 0);
|| test_lstm_layer_with_hidden(RandomMat(2, 5), 17, 0)

|| test_lstm_layer_with_hidden_input(RandomMat(4, 4), 1, 2)
|| test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 2)
|| test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 2)
|| test_lstm_layer_with_hidden_input(RandomMat(17, 8), 8, 2)
|| test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 2)
|| test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 2)
|| test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 2)
|| test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 2)
|| test_lstm_layer_with_hidden_input(RandomMat(4, 4), 1, 1)
|| test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 1)
|| test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 1)
|| test_lstm_layer_with_hidden_input(RandomMat(17, 8), 8, 1)
|| test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 1)
|| test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 1)
|| test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 1)
|| test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 1)
|| test_lstm_layer_with_hidden_input(RandomMat(4, 2), 1, 0)
|| test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 0)
|| test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 0)
|| test_lstm_layer_with_hidden_input(RandomMat(17, 8), 8, 0)
|| test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 0)
|| test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 0)
|| test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 0)
|| test_lstm_layer_with_hidden_input(RandomMat(2, 5), 17, 0)

|| test_lstm_layer_with_hidden_output(RandomMat(4, 4), 1, 2)
|| test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 2)
|| test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 2)
|| test_lstm_layer_with_hidden_output(RandomMat(17, 8), 8, 2)
|| test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 2)
|| test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 2)
|| test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 2)
|| test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 2)
|| test_lstm_layer_with_hidden_output(RandomMat(4, 4), 1, 1)
|| test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 1)
|| test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 1)
|| test_lstm_layer_with_hidden_output(RandomMat(17, 8), 8, 1)
|| test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 1)
|| test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 1)
|| test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 1)
|| test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 1)
|| test_lstm_layer_with_hidden_output(RandomMat(4, 2), 1, 0)
|| test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 0)
|| test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 0)
|| test_lstm_layer_with_hidden_output(RandomMat(17, 8), 8, 0)
|| test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 0)
|| test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 0)
|| test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 0)
|| test_lstm_layer_with_hidden_output(RandomMat(2, 5), 17, 0);
}

static int test_lstm_2()


+ 123
- 6
tests/test_rnn.cpp View File

@@ -42,19 +42,20 @@ static int test_rnn(const ncnn::Mat& a, int outch, int direction)
int test_rnn_layer_with_hidden(const ncnn::Mat& a, int outch, int direction)
{
int input_size = a.w;
int num_directions = direction == 2 ? 2 : 1;

ncnn::ParamDict pd;
pd.set(0, outch);
pd.set(1, outch * input_size);
pd.set(1, outch * input_size * num_directions);
pd.set(2, direction);

std::vector<ncnn::Mat> weights(3);
weights[0] = RandomMat(outch * input_size);
weights[1] = RandomMat(outch);
weights[2] = RandomMat(outch * outch);
weights[0] = RandomMat(outch * input_size * num_directions);
weights[1] = RandomMat(outch * num_directions);
weights[2] = RandomMat(outch * outch * num_directions);

// initial hidden state
ncnn::Mat hidden = RandomMat(outch);
ncnn::Mat hidden = RandomMat(outch, num_directions);

std::vector<ncnn::Mat> as(2);
as[0] = a;
@@ -69,6 +70,64 @@ int test_rnn_layer_with_hidden(const ncnn::Mat& a, int outch, int direction)
return ret;
}

// Runs the RNN layer test with two input blobs: the sequence `a` and a
// randomly filled initial hidden state. Returns the value reported by
// test_layer and logs the failing configuration to stderr when it is non-zero.
int test_rnn_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction)
{
    // direction == 2 selects two directions; every other value selects one.
    int num_directions = 1;
    if (direction == 2)
        num_directions = 2;

    const int input_size = a.w;

    // Param 1 is set to the same element count that weight_blobs[0] is created with.
    ncnn::ParamDict params;
    params.set(0, outch);
    params.set(1, outch * input_size * num_directions);
    params.set(2, direction);

    // Keep the RandomMat calls in this exact order so the random sequence
    // consumed by the test stays the same.
    std::vector<ncnn::Mat> weight_blobs(3);
    weight_blobs[0] = RandomMat(outch * input_size * num_directions);
    weight_blobs[1] = RandomMat(outch * num_directions);
    weight_blobs[2] = RandomMat(outch * outch * num_directions);

    // initial hidden state: outch values per direction
    ncnn::Mat initial_hidden = RandomMat(outch, num_directions);

    std::vector<ncnn::Mat> inputs(2);
    inputs[0] = a;
    inputs[1] = initial_hidden;

    // NOTE(review): the trailing 1 is presumably the number of output blobs -- confirm against test_layer.
    int status = test_layer<ncnn::RNN>("RNN", params, weight_blobs, inputs, 1);
    if (status == 0)
        return status;

    fprintf(stderr, "test_rnn_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction);
    return status;
}

// Runs the RNN layer test with only the sequence `a` as input (no initial
// hidden state blob is supplied). Returns the value reported by test_layer
// and logs the failing configuration to stderr when it is non-zero.
int test_rnn_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction)
{
    // direction == 2 selects two directions; every other value selects one.
    int num_directions = 1;
    if (direction == 2)
        num_directions = 2;

    const int input_size = a.w;

    // Param 1 is set to the same element count that weight_blobs[0] is created with.
    ncnn::ParamDict params;
    params.set(0, outch);
    params.set(1, outch * input_size * num_directions);
    params.set(2, direction);

    // Keep the RandomMat calls in this exact order so the random sequence
    // consumed by the test stays the same.
    std::vector<ncnn::Mat> weight_blobs(3);
    weight_blobs[0] = RandomMat(outch * input_size * num_directions);
    weight_blobs[1] = RandomMat(outch * num_directions);
    weight_blobs[2] = RandomMat(outch * outch * num_directions);

    std::vector<ncnn::Mat> inputs(1);
    inputs[0] = a;

    // NOTE(review): the trailing 2 is presumably the number of output blobs
    // (sequence output plus hidden state) -- confirm against test_layer.
    int status = test_layer<ncnn::RNN>("RNN", params, weight_blobs, inputs, 2);
    if (status == 0)
        return status;

    fprintf(stderr, "test_rnn_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction);
    return status;
}

static int test_rnn_0()
{
return 0
@@ -86,6 +145,14 @@ static int test_rnn_0()
static int test_rnn_1()
{
return 0
|| test_rnn_layer_with_hidden(RandomMat(4, 4), 1, 2)
|| test_rnn_layer_with_hidden(RandomMat(8, 2), 2, 2)
|| test_rnn_layer_with_hidden(RandomMat(16, 8), 7, 2)
|| test_rnn_layer_with_hidden(RandomMat(17, 8), 8, 2)
|| test_rnn_layer_with_hidden(RandomMat(19, 15), 8, 2)
|| test_rnn_layer_with_hidden(RandomMat(5, 16), 16, 2)
|| test_rnn_layer_with_hidden(RandomMat(3, 16), 8, 2)
|| test_rnn_layer_with_hidden(RandomMat(2, 5), 99, 2)
|| test_rnn_layer_with_hidden(RandomMat(4, 4), 1, 1)
|| test_rnn_layer_with_hidden(RandomMat(8, 2), 2, 1)
|| test_rnn_layer_with_hidden(RandomMat(16, 8), 7, 1)
@@ -101,7 +168,57 @@ static int test_rnn_1()
|| test_rnn_layer_with_hidden(RandomMat(19, 15), 8, 0)
|| test_rnn_layer_with_hidden(RandomMat(5, 16), 16, 0)
|| test_rnn_layer_with_hidden(RandomMat(3, 16), 8, 0)
|| test_rnn_layer_with_hidden(RandomMat(2, 5), 17, 0);
|| test_rnn_layer_with_hidden(RandomMat(2, 5), 17, 0)

|| test_rnn_layer_with_hidden_input(RandomMat(4, 4), 1, 2)
|| test_rnn_layer_with_hidden_input(RandomMat(8, 2), 2, 2)
|| test_rnn_layer_with_hidden_input(RandomMat(16, 8), 7, 2)
|| test_rnn_layer_with_hidden_input(RandomMat(17, 8), 8, 2)
|| test_rnn_layer_with_hidden_input(RandomMat(19, 15), 8, 2)
|| test_rnn_layer_with_hidden_input(RandomMat(5, 16), 16, 2)
|| test_rnn_layer_with_hidden_input(RandomMat(3, 16), 8, 2)
|| test_rnn_layer_with_hidden_input(RandomMat(2, 5), 99, 2)
|| test_rnn_layer_with_hidden_input(RandomMat(4, 4), 1, 1)
|| test_rnn_layer_with_hidden_input(RandomMat(8, 2), 2, 1)
|| test_rnn_layer_with_hidden_input(RandomMat(16, 8), 7, 1)
|| test_rnn_layer_with_hidden_input(RandomMat(17, 8), 8, 1)
|| test_rnn_layer_with_hidden_input(RandomMat(19, 15), 8, 1)
|| test_rnn_layer_with_hidden_input(RandomMat(5, 16), 16, 1)
|| test_rnn_layer_with_hidden_input(RandomMat(3, 16), 8, 1)
|| test_rnn_layer_with_hidden_input(RandomMat(2, 5), 99, 1)
|| test_rnn_layer_with_hidden_input(RandomMat(4, 2), 1, 0)
|| test_rnn_layer_with_hidden_input(RandomMat(8, 2), 2, 0)
|| test_rnn_layer_with_hidden_input(RandomMat(16, 8), 7, 0)
|| test_rnn_layer_with_hidden_input(RandomMat(17, 8), 8, 0)
|| test_rnn_layer_with_hidden_input(RandomMat(19, 15), 8, 0)
|| test_rnn_layer_with_hidden_input(RandomMat(5, 16), 16, 0)
|| test_rnn_layer_with_hidden_input(RandomMat(3, 16), 8, 0)
|| test_rnn_layer_with_hidden_input(RandomMat(2, 5), 17, 0)

|| test_rnn_layer_with_hidden_output(RandomMat(4, 4), 1, 2)
|| test_rnn_layer_with_hidden_output(RandomMat(8, 2), 2, 2)
|| test_rnn_layer_with_hidden_output(RandomMat(16, 8), 7, 2)
|| test_rnn_layer_with_hidden_output(RandomMat(17, 8), 8, 2)
|| test_rnn_layer_with_hidden_output(RandomMat(19, 15), 8, 2)
|| test_rnn_layer_with_hidden_output(RandomMat(5, 16), 16, 2)
|| test_rnn_layer_with_hidden_output(RandomMat(3, 16), 8, 2)
|| test_rnn_layer_with_hidden_output(RandomMat(2, 5), 99, 2)
|| test_rnn_layer_with_hidden_output(RandomMat(4, 4), 1, 1)
|| test_rnn_layer_with_hidden_output(RandomMat(8, 2), 2, 1)
|| test_rnn_layer_with_hidden_output(RandomMat(16, 8), 7, 1)
|| test_rnn_layer_with_hidden_output(RandomMat(17, 8), 8, 1)
|| test_rnn_layer_with_hidden_output(RandomMat(19, 15), 8, 1)
|| test_rnn_layer_with_hidden_output(RandomMat(5, 16), 16, 1)
|| test_rnn_layer_with_hidden_output(RandomMat(3, 16), 8, 1)
|| test_rnn_layer_with_hidden_output(RandomMat(2, 5), 99, 1)
|| test_rnn_layer_with_hidden_output(RandomMat(4, 2), 1, 0)
|| test_rnn_layer_with_hidden_output(RandomMat(8, 2), 2, 0)
|| test_rnn_layer_with_hidden_output(RandomMat(16, 8), 7, 0)
|| test_rnn_layer_with_hidden_output(RandomMat(17, 8), 8, 0)
|| test_rnn_layer_with_hidden_output(RandomMat(19, 15), 8, 0)
|| test_rnn_layer_with_hidden_output(RandomMat(5, 16), 16, 0)
|| test_rnn_layer_with_hidden_output(RandomMat(3, 16), 8, 0)
|| test_rnn_layer_with_hidden_output(RandomMat(2, 5), 17, 0);
}

static int test_rnn_2()


Loading…
Cancel
Save