Merge pull request !2097 from chenzhongming/master (tags/v0.5.0-beta)
| @@ -39,12 +39,10 @@ class BatchNormFold2GpuKernel : public GpuKernel { | |||||
| ~BatchNormFold2GpuKernel() override { DestroyResource(); } | ~BatchNormFold2GpuKernel() override { DestroyResource(); } | ||||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | ||||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | ||||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | ||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | ||||
| if (is_null_input_) { | if (is_null_input_) { | ||||
| return true; | return true; | ||||
| @@ -111,10 +109,7 @@ class BatchNormFold2GpuKernel : public GpuKernel { | |||||
| input_size_list_.push_back(weight_size); // running_std | input_size_list_.push_back(weight_size); // running_std | ||||
| input_size_list_.push_back(weight_size); // running_mean | input_size_list_.push_back(weight_size); // running_mean | ||||
| input_size_list_.push_back(sizeof(int32_t)); // global_step | input_size_list_.push_back(sizeof(int32_t)); // global_step | ||||
| output_size_list_.push_back(input_size); | output_size_list_.push_back(input_size); | ||||
| workspace_size_list_.push_back(sizeof(int32_t)); | |||||
| } | } | ||||
| private: | private: | ||||
| @@ -39,9 +39,7 @@ class BatchNormFold2GradGpuKernel : public GpuKernel { | |||||
| ~BatchNormFold2GradGpuKernel() override { DestroyResource(); } | ~BatchNormFold2GradGpuKernel() override { DestroyResource(); } | ||||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | ||||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | ||||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | ||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | ||||
| @@ -47,9 +47,7 @@ class BatchNormFoldGpuKernel : public GpuKernel { | |||||
| ~BatchNormFoldGpuKernel() override { DestroyResource(); } | ~BatchNormFoldGpuKernel() override { DestroyResource(); } | ||||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | ||||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | ||||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | ||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | ||||
| @@ -46,9 +46,8 @@ class BatchNormFoldGradGpuKernel : public GpuKernel { | |||||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | ||||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | ||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | ||||
| (void)workspace; | |||||
| // 'd_batch_mean', 'd_batch_std', 'x', 'batch_mean', 'batch_std', 'current_step' | // 'd_batch_mean', 'd_batch_std', 'x', 'batch_mean', 'batch_std', 'current_step' | ||||
| T *d_batch_mean = GetDeviceAddress<T>(inputs, 0); | T *d_batch_mean = GetDeviceAddress<T>(inputs, 0); | ||||
| T *d_batch_std = GetDeviceAddress<T>(inputs, 1); | T *d_batch_std = GetDeviceAddress<T>(inputs, 1); | ||||
| @@ -139,11 +138,8 @@ class BatchNormFoldGradGpuKernel : public GpuKernel { | |||||
| input_size_list_.push_back(channel_size_); | input_size_list_.push_back(channel_size_); | ||||
| input_size_list_.push_back(channel_size_); | input_size_list_.push_back(channel_size_); | ||||
| input_size_list_.push_back(sizeof(int)); | input_size_list_.push_back(sizeof(int)); | ||||
| // 'dx' | // 'dx' | ||||
| output_size_list_.push_back(input_size_); | output_size_list_.push_back(input_size_); | ||||
| workspace_size_list_.push_back(workspace_size_); | |||||
| } | } | ||||
| private: | private: | ||||
| @@ -33,7 +33,8 @@ class CorrectionMulGpuKernel : public GpuKernel { | |||||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | ||||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | ||||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | ||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | ||||
| auto *weight = GetDeviceAddress<T>(inputs, 0); | auto *weight = GetDeviceAddress<T>(inputs, 0); | ||||
| auto *gamma = GetDeviceAddress<T>(inputs, 1); | auto *gamma = GetDeviceAddress<T>(inputs, 1); | ||||
| @@ -74,10 +75,9 @@ class CorrectionMulGpuKernel : public GpuKernel { | |||||
| input_size_list_.push_back(input_size); // weight | input_size_list_.push_back(input_size); // weight | ||||
| input_size_list_.push_back(weight_size); // gamma | input_size_list_.push_back(weight_size); // gamma | ||||
| input_size_list_.push_back(weight_size); // running_std | input_size_list_.push_back(weight_size); // running_std | ||||
| size_t workspace_size = 0; | |||||
| output_size_list_.push_back(input_size); | output_size_list_.push_back(input_size); | ||||
| workspace_size_list_.push_back(workspace_size); | |||||
| } | } | ||||
| void InitResource() override {} | void InitResource() override {} | ||||
| private: | private: | ||||
| @@ -101,10 +101,9 @@ void FakeQuantGradGpuKernel::InitSizeLists() { | |||||
| input_size_list_.push_back(min_size_); // min | input_size_list_.push_back(min_size_); // min | ||||
| input_size_list_.push_back(max_size_); // max | input_size_list_.push_back(max_size_); // max | ||||
| output_size_list_.push_back(output_size_); | output_size_list_.push_back(output_size_); | ||||
| workspace_size_list_.push_back(workspace_size_); | |||||
| } | } | ||||
| bool FakeQuantGradGpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||||
| bool FakeQuantGradGpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | const std::vector<AddressPtr> &outputs, void *stream_ptr) { | ||||
| float *output = GetDeviceAddress<float>(outputs, 0); | float *output = GetDeviceAddress<float>(outputs, 0); | ||||
| float *gradient = GetDeviceAddress<float>(inputs, 0); | float *gradient = GetDeviceAddress<float>(inputs, 0); | ||||