| @@ -83,6 +83,12 @@ | |||
| cuda_check(cudaGetLastError()); \ | |||
| } while (0) | |||
| #if MEGDNN_TEGRA_X2 | |||
| //! tx2 only has 256 cuda cores | |||
| #define NR_THREADS 256 | |||
| #define NR_THREADS_X 32 | |||
| #define NR_THREADS_Y 8 | |||
| #else | |||
| #if MEGDNN_THREADS_512 | |||
| #define NR_THREADS 512 | |||
| #define NR_THREADS_X 32 | |||
| @@ -92,6 +98,7 @@ | |||
| #define NR_THREADS_X 32 | |||
| #define NR_THREADS_Y 32 | |||
| #endif | |||
| #endif | |||
| #define DIVUP(x, y) (((x) + (y)-1) / (y)) | |||
| #define ROUNDUP(x, y) (DIVUP(x, y) * (y)) | |||
| @@ -22,6 +22,8 @@ | |||
| #include "test/cuda/fixture.h" | |||
| #include "test/cuda/utils.h" | |||
| #include <cudnn.h> | |||
| #define V1(x) #x | |||
| #define V(x) V1(x) | |||
| #define CUDNN_VERSION_STRING \ | |||
| @@ -161,23 +163,6 @@ TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) { | |||
| } | |||
| } | |||
| TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) { | |||
| using namespace convolution; | |||
| std::vector<TestArg> args = get_1x1_args(); | |||
| Benchmarker<ConvolutionForward> marker(handle_cuda()); | |||
| NormalRNG default_rng; | |||
| for (auto&& arg : args) { | |||
| float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); | |||
| UniformFloatRNG rng(scale, 2 * scale); | |||
| marker.set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_rng(0, &default_rng) | |||
| .set_rng(1, &default_rng) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, {}}); | |||
| } | |||
| } | |||
| TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { | |||
| using namespace convolution; | |||
| std::vector<TestArg> args = get_args_cuda_conv_bwd_data(); | |||
| @@ -767,6 +752,23 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) { | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) { | |||
| using namespace convolution; | |||
| std::vector<TestArg> args = get_1x1_args(); | |||
| Benchmarker<ConvolutionForward> marker(handle_cuda()); | |||
| NormalRNG default_rng; | |||
| for (auto&& arg : args) { | |||
| float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); | |||
| UniformFloatRNG rng(scale, 2 * scale); | |||
| marker.set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_rng(0, &default_rng) | |||
| .set_rng(1, &default_rng) | |||
| .set_param(arg.param) | |||
| .execs({arg.src, arg.filter, {}}); | |||
| } | |||
| } | |||
| TEST_F(CUDA, CONV_FWD_BENCHMARK) { | |||
| auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1, | |||
| size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0, | |||
| @@ -44,6 +44,7 @@ TEST_F(CUDA, FLIP) { | |||
| } | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, FLIP_BENCHMARK) { | |||
| auto run = [&](const TensorShapeArray& shapes) { | |||
| Benchmarker<Flip> benchmarker(handle_cuda()); | |||
| @@ -75,6 +76,7 @@ TEST_F(CUDA, FLIP_BENCHMARK) { | |||
| run(shapes); | |||
| } | |||
| #endif | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| @@ -14,6 +14,7 @@ | |||
| #include "test/common/images2neibs.h" | |||
| #include "test/common/rng.h" | |||
| #include "test/cuda/benchmark.h" | |||
| #include "test/cuda/utils.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
| @@ -44,6 +45,7 @@ TEST_F(CUDA, BENCHMARK_IMAGES2NEIBS_FORWARD) { | |||
| #endif | |||
| TEST_F(CUDA, IMAGES2NEIBS_BACKWARD) { | |||
| require_compute_capability(6, 1); | |||
| UniformFloatRNG rng(0, 1); | |||
| auto args = images2neibs::get_args(); | |||
| for (auto&& arg : args) { | |||
| @@ -39,6 +39,11 @@ TEST_F(CUDA_ERROR_INFO, INDEXING_ONE_HOT) { | |||
| ASSERT_TRUE(failed); | |||
| } | |||
| TEST_F(CUDA, INDEXING_SET_ONE_HOT) { | |||
| run_indexing_set_one_hot_test(handle_cuda()); | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) { | |||
| Benchmarker<IndexingOneHot> bench{handle_cuda()}; | |||
| bench.set_times(1); | |||
| @@ -53,9 +58,6 @@ TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) { | |||
| printf("bandwidth: %.2fGiB/s\n", | |||
| A * B * D * sizeof(float) / 1024.0 / 1024 / 1024 / time); | |||
| } | |||
| TEST_F(CUDA, INDEXING_SET_ONE_HOT) { | |||
| run_indexing_set_one_hot_test(handle_cuda()); | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -14,13 +14,12 @@ | |||
| #include "test/common/benchmarker.h" | |||
| #include "test/common/checker.h" | |||
| #include "test/common/matrix_mul.h" | |||
| #include "test/cuda/utils.h" | |||
| #if defined(cuda_check) | |||
| #undef cuda_check | |||
| #endif | |||
| #include "test/cuda/utils.h" | |||
| #include <cuda.h> | |||
| #include "src/cuda/utils.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
| @@ -47,13 +46,7 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) { | |||
| } | |||
| TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) { | |||
| if (cuda::current_device_prop().major < 7 || | |||
| (cuda::current_device_prop().major == 7 && | |||
| cuda::current_device_prop().minor < 5)) { | |||
| printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device " | |||
| "doesn't support\n"); | |||
| return; | |||
| } | |||
| require_compute_capability(7, 5); | |||
| Checker<MatrixMul> checker(handle_cuda(), false); | |||
| using Param = MatrixMul::Param; | |||
| Param param; | |||
| @@ -65,21 +58,15 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) { | |||
| checker.exec({{256, 256}, {256, 256}, {256, 256}}); | |||
| auto args = matrix_mul::get_matmul_args(); | |||
| for (auto arg : args) { | |||
| size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8, | |||
| k = DIVUP(arg.k, 32) * 32; | |||
| size_t m = (arg.m + 7) / 8 * 8, n = (arg.n + 7) / 8 * 8, | |||
| k = (arg.k + 31) / 32 * 32; | |||
| checker.exec({{m, k}, {n, k}, {m, n}}); | |||
| } | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { | |||
| if (cuda::current_device_prop().major < 7 || | |||
| (cuda::current_device_prop().major == 7 && | |||
| cuda::current_device_prop().minor < 5)) { | |||
| printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current " | |||
| "device doesn't support\n"); | |||
| return; | |||
| } | |||
| require_compute_capability(7, 5); | |||
| Benchmarker<MatrixMul> bencher(handle_cuda()); | |||
| using Param = MatrixMul::Param; | |||
| Param param; | |||
| @@ -102,14 +89,7 @@ TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { | |||
| } | |||
| TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { | |||
| if (cuda::current_device_prop().major < 7 || | |||
| (cuda::current_device_prop().major == 7 && | |||
| cuda::current_device_prop().minor < 5)) { | |||
| printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as " | |||
| "current " | |||
| "device doesn't support\n"); | |||
| return; | |||
| } | |||
| require_compute_capability(7, 5); | |||
| Benchmarker<MatrixMul> bencher(handle_cuda()); | |||
| using Param = MatrixMul::Param; | |||
| Param param; | |||
| @@ -188,8 +188,7 @@ TEST_F(CUDA, PADDING_REPLICATE2) { | |||
| 6, 7, 7, 8, 9, 9, 9, 9})}); | |||
| } | |||
| // #if MEGDNN_WITH_BENCHMARK | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) { | |||
| using Param = Padding::Param; | |||
| @@ -240,5 +239,4 @@ TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) { | |||
| run(shapes, param); | |||
| } | |||
| } | |||
| // #endif | |||
| #endif | |||
| @@ -40,6 +40,7 @@ TEST_F(CUDA, ROTATE) { | |||
| } | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, BENCHMARK_ROTATE) { | |||
| auto run = [&](const TensorShapeArray& shapes) { | |||
| Benchmarker<Rotate> benchmarker(handle_cuda()); | |||
| @@ -74,6 +75,7 @@ TEST_F(CUDA, BENCHMARK_ROTATE) { | |||
| run(shapes); | |||
| } | |||
| #endif | |||
| } // namespace rotate | |||
| } // namespace test | |||
| @@ -42,18 +42,6 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_FORWARD) { | |||
| } | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) { | |||
| auto args = sliding_window_transpose::get_benchmark_args(); | |||
| for (auto&& arg : args) { | |||
| CUBenchmarker<SlidingWindowTransposeForward> bencher(handle_cuda()); | |||
| bencher.set_param(arg.param) | |||
| .set_dtype(0, dtype::Float32()) | |||
| .exec(TensorShapeArray{arg.ishape, {}}); | |||
| } | |||
| } | |||
| #endif | |||
| TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) { | |||
| UniformFloatRNG rng(0, 1); | |||
| auto args = sliding_window_transpose::get_args(); | |||
| @@ -78,6 +66,18 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) { | |||
| } | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) { | |||
| auto args = sliding_window_transpose::get_benchmark_args(); | |||
| for (auto&& arg : args) { | |||
| CUBenchmarker<SlidingWindowTransposeForward> bencher(handle_cuda()); | |||
| bencher.set_param(arg.param) | |||
| .set_dtype(0, dtype::Float32()) | |||
| .exec(TensorShapeArray{arg.ishape, {}}); | |||
| } | |||
| } | |||
| #endif | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| @@ -33,25 +33,6 @@ TEST_F(CUDA, TYPE_CVT) { | |||
| } | |||
| } | |||
| TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) { | |||
| const size_t RUNS = 3; | |||
| auto run = [&](TensorLayout src, TensorLayout dst) { | |||
| Benchmarker<TypeCvt> benchmarker(handle_cuda()); | |||
| auto&& layout = src; | |||
| benchmarker.set_times(RUNS); | |||
| dst.init_contiguous_stride(); | |||
| auto used = benchmarker.execl({src, dst}); | |||
| printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(), | |||
| 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 / | |||
| (1024 * 1024 * 1024)); | |||
| }; | |||
| TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()), | |||
| dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32()); | |||
| run(src, dst); | |||
| } | |||
| TEST_F(CUDA, QUANTIZED_TYPECVT) { | |||
| UniformIntRNG int_rng{-66, 66}; | |||
| Checker<TypeCvt> checker(handle_cuda()); | |||
| @@ -162,6 +143,25 @@ TEST_F(CUDA, TYPE_CVT_BFLOAT16) { | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) { | |||
| const size_t RUNS = 3; | |||
| auto run = [&](TensorLayout src, TensorLayout dst) { | |||
| Benchmarker<TypeCvt> benchmarker(handle_cuda()); | |||
| auto&& layout = src; | |||
| benchmarker.set_times(RUNS); | |||
| dst.init_contiguous_stride(); | |||
| auto used = benchmarker.execl({src, dst}); | |||
| printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(), | |||
| 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 / | |||
| (1024 * 1024 * 1024)); | |||
| }; | |||
| TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()), | |||
| dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32()); | |||
| run(src, dst); | |||
| } | |||
| TEST_F(CUDA, BENCHMARK_TYPE_CVT) { | |||
| UniformIntRNG rng{-128, 127}; | |||
| auto run = [&](TensorLayout src, TensorLayout dst) { | |||