 new int8 implement,better accuracy (#749)
* add the armv7a conv3x3s1 implement without overflow,remove old codes
* fix the bug of conv3x3s2 packed int8
* new int8 implementation, per-channel weight quantization, better accuracy
* fix the bug of conv3x3s1 packed int8 neon
* add the naive c fp32 and int8 winograd F(2,3)
* add the neon intrinsic int8 winograd F(2,3)
* optimize the armv7a int8 winograd F(2,3) with neon assembly
* optimize the armv7a int8 winograd F(2,3) input transform with assembly.
* add the requantize layer and int8 relu implement.
* add graph optimize conv1x1s2 -> conv1x1s1,begin optimize int8 aarch64.
* fix int8 bugs
* add the c naive im2col with sgemm
* add aarch64 int8 winograd f23, conv3x3s2 naive implement
* add the int8 sgemm conv7x7s2 on x86/armv7a platform
* optimize the int8 sgemm by neon intrinsic and packed kernel
* optimize the int8 sgemm with packed data
* optimize the int8 sgemm with armv7a neon assembly
* add the int8 sgemm on arm64-v8a platform
* prepare to merge latest code from master
* add the int8 param files
* In the Class Net,add the fuse_network method
7 years ago  new int8 implement,better accuracy (#749)
* add the armv7a conv3x3s1 implement without overflow,remove old codes
* fix the bug of conv3x3s2 packed int8
* new int8 implementation, per-channel weight quantization, better accuracy
* fix the bug of conv3x3s1 packed int8 neon
* add the naive c fp32 and int8 winograd F(2,3)
* add the neon intrinsic int8 winograd F(2,3)
* optimize the armv7a int8 winograd F(2,3) with neon assembly
* optimize the armv7a int8 winograd F(2,3) input transform with assembly.
* add the requantize layer and int8 relu implement.
* add graph optimize conv1x1s2 -> conv1x1s1,begin optimize int8 aarch64.
* fix int8 bugs
* add the c naive im2col with sgemm
* add aarch64 int8 winograd f23, conv3x3s2 naive implement
* add the int8 sgemm conv7x7s2 on x86/armv7a platform
* optimize the int8 sgemm by neon intrinsic and packed kernel
* optimize the int8 sgemm with packed data
* optimize the int8 sgemm with armv7a neon assembly
* add the int8 sgemm on arm64-v8a platform
* prepare to merge latest code from master
* add the int8 param files
* In the Class Net,add the fuse_network method
7 years ago |
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- // Tencent is pleased to support the open source community by making ncnn available.
- //
- // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
- //
- // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
- // in compliance with the License. You may obtain a copy of the License at
- //
- // https://opensource.org/licenses/BSD-3-Clause
- //
- // Unless required by applicable law or agreed to in writing, software distributed
- // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
- // CONDITIONS OF ANY KIND, either express or implied. See the License for the
- // specific language governing permissions and limitations under the License.
-
- #ifdef _WIN32
- #define WIN32_LEAN_AND_MEAN
- #include <windows.h>
- #else // _WIN32
- #include <sys/time.h>
- #endif // _WIN32
-
- #include "benchmark.h"
-
- #if NCNN_BENCHMARK
- #include <stdio.h>
- #include "layer/convolution.h"
- #include "layer/convolutiondepthwise.h"
- #include "layer/deconvolution.h"
- #include "layer/deconvolutiondepthwise.h"
- #endif // NCNN_BENCHMARK
-
- namespace ncnn {
-
- #ifdef _WIN32
// Windows build: returns the current QueryPerformanceCounter reading scaled to
// milliseconds using the frequency reported by QueryPerformanceFrequency.
double get_current_time()
{
    LARGE_INTEGER ticks_per_second;
    LARGE_INTEGER ticks_now;

    // Frequency is queried first, then the counter itself.
    QueryPerformanceFrequency(&ticks_per_second);
    QueryPerformanceCounter(&ticks_now);

    // ticks * 1000 / (ticks per second) gives milliseconds.
    return ticks_now.QuadPart * 1000.0 / ticks_per_second.QuadPart;
}
- #else // _WIN32
// Non-Windows build: returns the time reported by gettimeofday() expressed in
// milliseconds (seconds * 1000 plus microseconds / 1000).
double get_current_time()
{
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_ms = now.tv_sec * 1000.0;
    const double fractional_ms = now.tv_usec / 1000.0;

    return whole_ms + fractional_ms;
}
- #endif // _WIN32
-
- #if NCNN_BENCHMARK
-
- void benchmark(const Layer* layer, double start, double end)
- {
- fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
- fprintf(stderr, " |");
- fprintf(stderr, "\n");
- }
-
- void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end)
- {
- fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
- fprintf(stderr, " | feature_map: %4d x %-4d inch: %4d outch: %4d", bottom_blob.w, bottom_blob.h, bottom_blob.c, top_blob.c);
- if (layer->type == "Convolution")
- {
- fprintf(stderr, " kernel: %1d x %1d stride: %1d x %1d",
- ((Convolution*)layer)->kernel_w,
- ((Convolution*)layer)->kernel_h,
- ((Convolution*)layer)->stride_w,
- ((Convolution*)layer)->stride_h
- );
- }
- else if (layer->type == "ConvolutionDepthWise")
- {
- fprintf(stderr, " kernel: %1d x %1d stride: %1d x %1d",
- ((ConvolutionDepthWise*)layer)->kernel_w,
- ((ConvolutionDepthWise*)layer)->kernel_h,
- ((ConvolutionDepthWise*)layer)->stride_w,
- ((ConvolutionDepthWise*)layer)->stride_h
- );
- }
- else if (layer->type == "Deconvolution")
- {
- fprintf(stderr, " kernel: %1d x %1d stride: %1d x %1d",
- ((Deconvolution*)layer)->kernel_w,
- ((Deconvolution*)layer)->kernel_h,
- ((Deconvolution*)layer)->stride_w,
- ((Deconvolution*)layer)->stride_h
- );
- }
- else if (layer->type == "DeconvolutionDepthWise")
- {
- fprintf(stderr, " kernel: %1d x %1d stride: %1d x %1d",
- ((DeconvolutionDepthWise*)layer)->kernel_w,
- ((DeconvolutionDepthWise*)layer)->kernel_h,
- ((DeconvolutionDepthWise*)layer)->stride_w,
- ((DeconvolutionDepthWise*)layer)->stride_h
- );
- }
- fprintf(stderr, "\n");
- }
-
- #endif // NCNN_BENCHMARK
-
- } // namespace ncnn
|