From c6ab155a6790d7434b71ead5ffa97083372e85ab Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Sat, 12 Apr 2025 17:36:58 +0800 Subject: [PATCH 1/3] conv based speed stft --- src/layer/spectrogram.cpp | 202 +++++++++++++++++++++++-------------- src/layer/spectrogram.h | 6 ++ tests/test_spectrogram.cpp | 72 ++++++++++++- 3 files changed, 202 insertions(+), 78 deletions(-) diff --git a/src/layer/spectrogram.cpp b/src/layer/spectrogram.cpp index f61613157..4b717bf0c 100644 --- a/src/layer/spectrogram.cpp +++ b/src/layer/spectrogram.cpp @@ -22,6 +22,11 @@ Spectrogram::Spectrogram() support_inplace = false; } +Spectrogram::~Spectrogram() +{ + delete conv_transpose; +} + int Spectrogram::load_param(const ParamDict& pd) { n_fft = pd.get(0, 0); @@ -36,7 +41,7 @@ int Spectrogram::load_param(const ParamDict& pd) // assert winlen <= n_fft // generate window - window_data.create(normalized == 2 ? n_fft + 1 : n_fft); + window_data.create(n_fft); { float* p = window_data; for (int i = 0; i < (n_fft - winlen) / 2; i++) @@ -80,10 +85,90 @@ int Spectrogram::load_param(const ParamDict& pd) { sqsum += window_data[i] * window_data[i]; } - window_data[n_fft] = 1.f / sqrt(sqsum); + float scale = 1.f / sqrt(sqsum); + + for (int i = 0; i < n_fft; i++) + { + window_data[i] *= scale; + } + } + } + + Mat theta; + if (onesided) + { + n_freq = n_fft / 2 + 1; + } else + { + n_freq = n_fft; + } + theta.create(n_fft,n_freq,size_t(8)); + + for (int i = 0; i(i)[j] = 2 * 3.14159265358979323846 * i * j / n_fft; + } + } + + Mat real_basis, imag_basis; + real_basis.create(n_fft,n_freq,size_t(8)); + imag_basis.create(n_fft,n_freq,size_t(8)); + + for (int i = 0; i(i)[j] = cos(theta.row(i)[j]); + imag_basis.row(i)[j] = -sin(theta.row(i)[j]); + } + } + + // multiply window + for (int i = 0; i(i)[j] *= window_data[j]; + imag_basis.row(i)[j] *= window_data[j]; + } + } + + if (normalized == 1) + { + double scale = 1.f / sqrt(n_fft); + for (int i = 0; i(i)[j] *= scale; + imag_basis.row(i)[j] *= scale; + } } } + conv_data.create(n_fft,1,n_freq * 2); + + for (int i = 0; i(0)[j]= (float)real_basis.row(i)[j]; + conv_data.channel(i+n_freq).row(0)[j] = (float)imag_basis.row(i)[j]; + } + } + + conv_transpose = ncnn::create_layer("Convolution1D"); + ncnn::ParamDict conv_transpose_pd; + + conv_transpose_pd.set(0,2 * n_freq); // num_output + conv_transpose_pd.set(1,n_fft); // kernel_w + conv_transpose_pd.set(3,hoplen); // stride_w + conv_transpose_pd.set(19,1); // dynamic_weight + + conv_transpose->load_param(conv_transpose_pd); + return 0; } @@ -110,107 +195,70 @@ int Spectrogram::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op // const int frames = size / hoplen + 1; const int frames = (size - n_fft) / hoplen + 1; - const int freqs_onesided = n_fft / 2 + 1; - const int freqs = onesided ? freqs_onesided : n_fft; const size_t elemsize = bottom_blob_bordered.elemsize; + if (elemsize != sizeof(float)) + { + return -100; + } + if (power == 0) { - top_blob.create(2, frames, freqs, elemsize, opt.blob_allocator); + top_blob.create(2, frames, n_freq, elemsize, opt.blob_allocator); } else { - top_blob.create(frames, freqs, elemsize, opt.blob_allocator); + top_blob.create(frames, n_freq, elemsize, opt.blob_allocator); } if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < freqs_onesided; i++) - { - const float* ptr = bottom_blob_bordered; - float* outptr = power == 0 ? top_blob.channel(i) : top_blob.row(i); + std::vector inputs = {bottom_blob_bordered,conv_data}; + std::vector outputs = {Mat()}; - for (int j = 0; j < frames; j++) - { - float re = 0.f; - float im = 0.f; - for (int k = 0; k < n_fft; k++) - { - float v = ptr[k]; + Option opt_conv = opt; + opt_conv.use_packing_layout = false; - // apply window - v *= window_data[k]; + conv_transpose->create_pipeline(opt_conv); + conv_transpose->forward(inputs,outputs,opt_conv); + conv_transpose->destroy_pipeline(opt_conv); - // dft - double angle = 2 * 3.14159265358979323846 * i * k / n_fft; + Mat conv_top_blob = outputs[0]; // (2 * n_freq, frames) + float* conv_top_data = conv_top_blob; - re += v * cosf(angle); // + imag * sinf(angle); - im -= v * sinf(angle); // + imag * cosf(angle); - } - - if (normalized == 1) - { - float norm = 1.f / sqrt(n_fft); - re *= norm; - im *= norm; - } - if (normalized == 2) - { - float norm = window_data[n_fft]; - re *= norm; - im *= norm; - } - - if (power == 0) - { - // complex as real - outptr[0] = re; - outptr[1] = im; - outptr += 2; - } - if (power == 1) - { - // magnitude - outptr[0] = sqrt(re * re + im * im); - outptr += 1; - } - if (power == 2) + if (power == 0) // as complex + { + // copy + for (int i = 0; i(i)[0] = conv_top_data[j * frames + i]; + top_blob.channel(j).row(i)[1] = conv_top_data[(j + n_freq) * frames + i]; } - - ptr += hoplen; } - } - - if (!onesided) + } else { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = freqs_onesided; i < n_fft; i++) + if (power == 1) // magnitude sqrt(re * re + im * im); { - if (power == 0) + // copy + for (int i = 0; i < frames; i++) { - const float* ptr = top_blob.channel(n_fft - i); - float* outptr = top_blob.channel(i); - - for (int j = 0; j < frames; j++) + for (int j = 0; j < n_freq; j++) { - // complex as real - outptr[0] = ptr[0]; - outptr[1] = -ptr[1]; - ptr += 2; - outptr += 2; + top_blob.row(j)[i] = sqrtf(conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]); } } - else // if (power == 1 || power == 2) + } else if (power == 2) // power re * re + im * im; + { + // copy + for (int i = 0; i < frames; i++) { - const float* ptr = top_blob.row(n_fft - i); - float* outptr = top_blob.row(i); - - memcpy(outptr, ptr, frames * sizeof(float)); + for (int j = 0; j< n_freq; j++) + { + top_blob.row(j)[i] = conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]; + } } } } diff --git a/src/layer/spectrogram.h b/src/layer/spectrogram.h index 712dadafd..0c4dde4fc 100644 --- a/src/layer/spectrogram.h +++ b/src/layer/spectrogram.h @@ -23,6 +23,7 @@ class Spectrogram : public Layer { public: Spectrogram(); + ~Spectrogram(); virtual int load_param(const ParamDict& pd); @@ -39,7 +40,12 @@ public: int normalized; // 0=disabled 1=sqrt(n_fft) 2=window-l2-energy int onesided; + int n_freq; + Mat window_data; + Mat conv_data; + + Layer* conv_transpose = 0; }; } // namespace ncnn diff --git a/tests/test_spectrogram.cpp b/tests/test_spectrogram.cpp index b58ddd3cf..a1a7b3925 100644 --- a/tests/test_spectrogram.cpp +++ b/tests/test_spectrogram.cpp @@ -50,9 +50,79 @@ static int test_spectrogram_0() || test_spectrogram(124, 55, 2, 12, 55, 1, 1, 2, 2, 0); } +static int test_spectrogram_eval(int size, int n_fft, int power, int hoplen, int winlen, int window_type, int center, int pad_type, int normalized, int onesided,float * in,float * std) +{ + ncnn::Layer * layer = ncnn::create_layer("Spectrogram"); + + ncnn::ParamDict pd; + pd.set(0, n_fft); + pd.set(1, power); + pd.set(2, hoplen); + pd.set(3, winlen); + pd.set(4, window_type); + pd.set(5, center); + pd.set(6, pad_type); + pd.set(7, normalized); + pd.set(8, onesided); + + ncnn::Mat input = ncnn::Mat(size); + memcpy(input, in, size * sizeof(float)); + + ncnn::Mat output; + + ncnn::Option opt; + opt.num_threads = 2; + + layer->load_param(pd); + layer->create_pipeline(opt); + layer->forward(input, output, opt); + layer->destroy_pipeline(opt); + + const float epsilon = 1e-6; + + for (int i = 0; i < output.c; i++) + { + float * output_data = output.channel(i); + for (int j = 0; j < output.h; j++) + { + for (int k = 0; k < output.w; k++) + { + if (fabs(output_data[j * output.w + k] - std[i * output.h * output.w + j * output.w + k]) > epsilon) + { + fprintf(stderr, "test_spectrogram failed size=%d n_fft=%d power=%d hoplen=%d winlen=%d window_type=%d center=%d pad_type=%d normalized=%d onesided=%d\n", size, n_fft, power, hoplen, winlen, window_type, center, pad_type, normalized, onesided); + return 1; + } + } + } + } + + delete layer; + return 0; +} + +static int test_spectrogram_1() +{ + float input_0[16] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; + float std_0[] = { + 0.05000000f, 0.40000001f, 0.80000001f, 1.20000005f, 1.59999990f, 2.00000000f, 2.40000010f, 2.79999995f, 0.75000000f, 0.05000000f, 0.22360681f, 0.41231057f, 0.60827625f, 0.80622578f, 1.00498760f, 1.20415950f, 1.40356684f, 0.75000000f, 0.05000000f, 0.00000000f, 0.00000000f, 0.00000000f, 0.00000006f, 0.00000000f, 0.00000000f, 0.00000000f, 0.75000000f + }; + float std_1[] = { + 0.80000001f, 1.20000005f, 1.59999990f, 2.00000000f, 2.40000010f, 0.68649411f, 1.02670193f, 1.36751485f, 1.70857072f, 2.04974818f, 0.41231057f, 0.60827625f, 0.80622578f, 1.00498760f, 1.20415950f, 0.13684234f, 0.18942842f, 0.24475159f, 0.30130789f, 0.35851428f, 0.00000000f, 0.00000000f, 0.00000006f, 0.00000000f, 0.00000000f + }; + float std_2[] = { + 0.28284273f, 0.49497476f, 0.70710677f, 0.24271232f, 0.42322639f, 0.60407096f, 0.14577380f, 0.25000000f, 0.35531676f, 0.04838108f, 0.07667736f, 0.10652842f, 0.00000000f, 0.00000002f, 0.00000000f, 0.04838108f, 0.07667736f, 0.10652842f, 0.14577380f, 0.25000000f, 0.35531676f, 0.24271232f, 0.42322639f, 0.60407096f + }; + + return + test_spectrogram_eval(16, 4, 1, 2, 4, 1, 1, 0, 0, 1, input_0, std_0) + || test_spectrogram_eval(16, 8, 1, 2, 4, 1, 0, 0, 0, 1, input_0, std_1) + || test_spectrogram_eval(16, 8, 1, 3, 4, 1, 0, 0, 1, 0, input_0, std_2); + +} + int main() { SRAND(7767517); - return test_spectrogram_0(); + return test_spectrogram_0() || test_spectrogram_1(); } From 18bac4c13754cac966990a23149653fae6d8f57a Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Thu, 29 May 2025 20:38:24 +0800 Subject: [PATCH 2/3] fix error in non-x86 platform --- src/layer/spectrogram.cpp | 204 +++++++++------------- src/layer/spectrogram.h | 8 +- src/layer/x86/spectrogram_x86.cpp | 274 ++++++++++++++++++++++++++++++ src/layer/x86/spectrogram_x86.h | 53 ++++++ 4 files changed, 406 insertions(+), 133 deletions(-) create mode 100644 src/layer/x86/spectrogram_x86.cpp create mode 100644 src/layer/x86/spectrogram_x86.h diff --git a/src/layer/spectrogram.cpp b/src/layer/spectrogram.cpp index 4b717bf0c..af3313727 100644 --- a/src/layer/spectrogram.cpp +++ b/src/layer/spectrogram.cpp @@ -22,11 +22,6 @@ Spectrogram::Spectrogram() support_inplace = false; } -Spectrogram::~Spectrogram() -{ - delete conv_transpose; -} - int Spectrogram::load_param(const ParamDict& pd) { n_fft = pd.get(0, 0); @@ -41,7 +36,7 @@ int Spectrogram::load_param(const ParamDict& pd) // assert winlen <= n_fft // generate window - window_data.create(n_fft); + window_data.create(normalized == 2 ? n_fft + 1 : n_fft); { float* p = window_data; for (int i = 0; i < (n_fft - winlen) / 2; i++) @@ -85,90 +80,10 @@ int Spectrogram::load_param(const ParamDict& pd) { sqsum += window_data[i] * window_data[i]; } - float scale = 1.f / sqrt(sqsum); - - for (int i = 0; i < n_fft; i++) - { - window_data[i] *= scale; - } - } - } - - Mat theta; - if (onesided) - { - n_freq = n_fft / 2 + 1; - } else - { - n_freq = n_fft; - } - theta.create(n_fft,n_freq,size_t(8)); - - for (int i = 0; i(i)[j] = 2 * 3.14159265358979323846 * i * j / n_fft; - } - } - - Mat real_basis, imag_basis; - real_basis.create(n_fft,n_freq,size_t(8)); - imag_basis.create(n_fft,n_freq,size_t(8)); - - for (int i = 0; i(i)[j] = cos(theta.row(i)[j]); - imag_basis.row(i)[j] = -sin(theta.row(i)[j]); - } - } - - // multiply window - for (int i = 0; i(i)[j] *= window_data[j]; - imag_basis.row(i)[j] *= window_data[j]; - } - } - - if (normalized == 1) - { - double scale = 1.f / sqrt(n_fft); - for (int i = 0; i(i)[j] *= scale; - imag_basis.row(i)[j] *= scale; - } + window_data[n_fft] = 1.f / sqrt(sqsum); } } - conv_data.create(n_fft,1,n_freq * 2); - - for (int i = 0; i(0)[j]= (float)real_basis.row(i)[j]; - conv_data.channel(i+n_freq).row(0)[j] = (float)imag_basis.row(i)[j]; - } - } - - conv_transpose = ncnn::create_layer("Convolution1D"); - ncnn::ParamDict conv_transpose_pd; - - conv_transpose_pd.set(0,2 * n_freq); // num_output - conv_transpose_pd.set(1,n_fft); // kernel_w - conv_transpose_pd.set(3,hoplen); // stride_w - conv_transpose_pd.set(19,1); // dynamic_weight - - conv_transpose->load_param(conv_transpose_pd); - return 0; } @@ -195,70 +110,107 @@ int Spectrogram::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op // const int frames = size / hoplen + 1; const int frames = (size - n_fft) / hoplen + 1; + const int freqs_onesided = n_fft / 2 + 1; + const int freqs = onesided ? freqs_onesided : n_fft; const size_t elemsize = bottom_blob_bordered.elemsize; - if (elemsize != sizeof(float)) - { - return -100; - } - if (power == 0) { - top_blob.create(2, frames, n_freq, elemsize, opt.blob_allocator); + top_blob.create(2, frames, freqs, elemsize, opt.blob_allocator); } else { - top_blob.create(frames, n_freq, elemsize, opt.blob_allocator); + top_blob.create(frames, freqs, elemsize, opt.blob_allocator); } if (top_blob.empty()) return -100; - std::vector inputs = {bottom_blob_bordered,conv_data}; - std::vector outputs = {Mat()}; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < freqs_onesided; i++) + { + const float* ptr = bottom_blob_bordered; + float* outptr = power == 0 ? top_blob.channel(i) : top_blob.row(i); - Option opt_conv = opt; - opt_conv.use_packing_layout = false; + for (int j = 0; j < frames; j++) + { + float re = 0.f; + float im = 0.f; + for (int k = 0; k < n_fft; k++) + { + float v = ptr[k]; - conv_transpose->create_pipeline(opt_conv); - conv_transpose->forward(inputs,outputs,opt_conv); - conv_transpose->destroy_pipeline(opt_conv); + // apply window + v *= window_data[k]; - Mat conv_top_blob = outputs[0]; // (2 * n_freq, frames) - float* conv_top_data = conv_top_blob; + // dft + double angle = 2 * 3.14159265358979323846 * i * k / n_fft; - if (power == 0) // as complex - { - // copy - for (int i = 0; i(i)[0] = conv_top_data[j * frames + i]; - top_blob.channel(j).row(i)[1] = conv_top_data[(j + n_freq) * frames + i]; + float norm = 1.f / sqrt(n_fft); + re *= norm; + im *= norm; } + if (normalized == 2) + { + float norm = window_data[n_fft]; + re *= norm; + im *= norm; + } + + if (power == 0) + { + // complex as real + outptr[0] = re; + outptr[1] = im; + outptr += 2; + } + if (power == 1) + { + // magnitude + outptr[0] = sqrt(re * re + im * im); + outptr += 1; + } + if (power == 2) + { + outptr[0] = re * re + im * im; + outptr += 1; + } + + ptr += hoplen; } - } else + } + + if (!onesided) { - if (power == 1) // magnitude sqrt(re * re + im * im); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = freqs_onesided; i < n_fft; i++) { - // copy - for (int i = 0; i < frames; i++) + if (power == 0) { - for (int j = 0; j < n_freq; j++) + const float* ptr = top_blob.channel(n_fft - i); + float* outptr = top_blob.channel(i); + + for (int j = 0; j < frames; j++) { - top_blob.row(j)[i] = sqrtf(conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]); + // complex as real + outptr[0] = ptr[0]; + outptr[1] = -ptr[1]; + ptr += 2; + outptr += 2; } } - } else if (power == 2) // power re * re + im * im; - { - // copy - for (int i = 0; i < frames; i++) + else // if (power == 1 || power == 2) { - for (int j = 0; j< n_freq; j++) - { - top_blob.row(j)[i] = conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]; - } + const float* ptr = top_blob.row(n_fft - i); + float* outptr = top_blob.row(i); + + memcpy(outptr, ptr, frames * sizeof(float)); } } } @@ -266,4 +218,4 @@ int Spectrogram::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op return 0; } -} // namespace ncnn +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/spectrogram.h b/src/layer/spectrogram.h index 0c4dde4fc..4e63fbc13 100644 --- a/src/layer/spectrogram.h +++ b/src/layer/spectrogram.h @@ -23,7 +23,6 @@ class Spectrogram : public Layer { public: Spectrogram(); - ~Spectrogram(); virtual int load_param(const ParamDict& pd); @@ -40,14 +39,9 @@ public: int normalized; // 0=disabled 1=sqrt(n_fft) 2=window-l2-energy int onesided; - int n_freq; - Mat window_data; - Mat conv_data; - - Layer* conv_transpose = 0; }; } // namespace ncnn -#endif // LAYER_SPECTROGRAM_H +#endif // LAYER_SPECTROGRAM_H \ No newline at end of file diff --git a/src/layer/x86/spectrogram_x86.cpp b/src/layer/x86/spectrogram_x86.cpp new file mode 100644 index 000000000..bf16d9e56 --- /dev/null +++ b/src/layer/x86/spectrogram_x86.cpp @@ -0,0 +1,274 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "spectrogram_x86.h" + +namespace ncnn { + +Spectrogram_x86::Spectrogram_x86() + : conv_transpose(0) +{ + one_blob_only = true; + support_inplace = false; +} + +Spectrogram_x86::~Spectrogram_x86() +{ + delete conv_transpose; +} + +int Spectrogram_x86::load_param(const ParamDict& pd) +{ + n_fft = pd.get(0, 0); + power = pd.get(1, 0); + hoplen = pd.get(2, n_fft / 4); + winlen = pd.get(3, n_fft); + window_type = pd.get(4, 0); + center = pd.get(5, 1); + pad_type = pd.get(6, 2); + normalized = pd.get(7, 0); + onesided = pd.get(8, 1); + + // assert winlen <= n_fft + // generate window + window_data.create(n_fft); + { + float* p = window_data; + for (int i = 0; i < (n_fft - winlen) / 2; i++) + { + *p++ = 0.f; + } + if (window_type == 0) + { + // all ones + for (int i = 0; i < winlen; i++) + { + *p++ = 1.f; + } + } + if (window_type == 1) + { + // hann window + for (int i = 0; i < winlen; i++) + { + *p++ = 0.5f * (1 - cosf(2 * 3.14159265358979323846 * i / winlen)); + } + } + if (window_type == 2) + { + // hamming window + for (int i = 0; i < winlen; i++) + { + *p++ = 0.54f - 0.46f * cosf(2 * 3.14159265358979323846 * i / winlen); + } + } + for (int i = 0; i < n_fft - winlen - (n_fft - winlen) / 2; i++) + { + *p++ = 0.f; + } + + // pre-calculated window norm factor + if (normalized == 2) + { + float sqsum = 0.f; + for (int i = 0; i < n_fft; i++) + { + sqsum += window_data[i] * window_data[i]; + } + float scale = 1.f / sqrt(sqsum); + + for (int i = 0; i < n_fft; i++) + { + window_data[i] *= scale; + } + } + } + + Mat theta; + if (onesided) + { + n_freq = n_fft / 2 + 1; + } else + { + n_freq = n_fft; + } + theta.create(n_fft,n_freq,size_t(8)); + + for (int i = 0; i(i)[j] = 2 * 3.14159265358979323846 * i * j / n_fft; + } + } + + Mat real_basis, imag_basis; + real_basis.create(n_fft,n_freq,size_t(8)); + imag_basis.create(n_fft,n_freq,size_t(8)); + + for (int i = 0; i(i)[j] = cos(theta.row(i)[j]); + imag_basis.row(i)[j] = -sin(theta.row(i)[j]); + } + } + + // multiply window + for (int i = 0; i(i)[j] *= window_data[j]; + imag_basis.row(i)[j] *= window_data[j]; + } + } + + if (normalized == 1) + { + double scale = 1.f / sqrt(n_fft); + for (int i = 0; i(i)[j] *= scale; + imag_basis.row(i)[j] *= scale; + } + } + } + + conv_data.create(n_fft,1,n_freq * 2); + + for (int i = 0; i(0)[j]= (float)real_basis.row(i)[j]; + conv_data.channel(i+n_freq).row(0)[j] = (float)imag_basis.row(i)[j]; + } + } + + conv_transpose = ncnn::create_layer("Convolution1D"); + ncnn::ParamDict conv_transpose_pd; + + conv_transpose_pd.set(0,2 * n_freq); // num_output + conv_transpose_pd.set(1,n_fft); // kernel_w + conv_transpose_pd.set(3,hoplen); // stride_w + conv_transpose_pd.set(19,1); // dynamic_weight + + conv_transpose->load_param(conv_transpose_pd); + + return 0; +} + +int Spectrogram_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // https://pytorch.org/audio/stable/generated/torchaudio.functional.spectrogram.html + + // TODO custom window + + Mat bottom_blob_bordered = bottom_blob; + if (center == 1) + { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + if (pad_type == 0) + copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_CONSTANT, 0.f, opt_b); + if (pad_type == 1) + copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_REPLICATE, 0.f, opt_b); + if (pad_type == 2) + copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_REFLECT, 0.f, opt_b); + } + + const int size = bottom_blob_bordered.w; + + // const int frames = size / hoplen + 1; + const int frames = (size - n_fft) / hoplen + 1; + + const size_t elemsize = bottom_blob_bordered.elemsize; + + if (elemsize != sizeof(float)) + { + return -100; + } + + if (power == 0) + { + top_blob.create(2, frames, n_freq, elemsize, opt.blob_allocator); + } + else + { + top_blob.create(frames, n_freq, elemsize, opt.blob_allocator); + } + if (top_blob.empty()) + return -100; + + std::vector inputs; + inputs.push_back(bottom_blob_bordered); + inputs.push_back(conv_data); + + std::vector outputs; + outputs.push_back(Mat()); + + Option opt_conv = opt; + opt_conv.use_packing_layout = false; + + conv_transpose->create_pipeline(opt_conv); + conv_transpose->forward(inputs,outputs,opt_conv); + conv_transpose->destroy_pipeline(opt_conv); + + Mat conv_top_blob = outputs[0]; // (2 * n_freq, frames) + float* conv_top_data = conv_top_blob; + + if (power == 0) // as complex + { + // copy + for (int i = 0; i(i)[0] = conv_top_data[j * frames + i]; + top_blob.channel(j).row(i)[1] = conv_top_data[(j + n_freq) * frames + i]; + } + } + } else + { + if (power == 1) // magnitude sqrt(re * re + im * im); + { + // copy + for (int i = 0; i < frames; i++) + { + for (int j = 0; j < n_freq; j++) + { + top_blob.row(j)[i] = sqrtf(conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]); + } + } + } else if (power == 2) // power re * re + im * im; + { + // copy + for (int i = 0; i < frames; i++) + { + for (int j = 0; j< n_freq; j++) + { + top_blob.row(j)[i] = conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/spectrogram_x86.h b/src/layer/x86/spectrogram_x86.h new file mode 100644 index 000000000..4678406c8 --- /dev/null +++ b/src/layer/x86/spectrogram_x86.h @@ -0,0 +1,53 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_SPECTROGRAM_X86_H +#define LAYER_SPECTROGRAM_X86_H + +#include "spectrogram.h" + +namespace ncnn { + +class Spectrogram_x86 : public Spectrogram +{ +public: + Spectrogram_x86(); + ~Spectrogram_x86(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + int n_fft; + int power; + int hoplen; + int winlen; + int window_type; // 0=ones 1=hann 2=hamming + int center; + int pad_type; // 0=CONSTANT 1=REPLICATE 2=REFLECT + int normalized; // 0=disabled 1=sqrt(n_fft) 2=window-l2-energy + int onesided; + + int n_freq; + + Mat window_data; + Mat conv_data; + + Layer* conv_transpose; +}; + +} // namespace ncnn + +#endif // LAYER_SPECTROGRAM_X86_H From 0546aa4aa23899076a06fc7c6c398e95da5db0ae Mon Sep 17 00:00:00 2001 From: futz12 <56149058+futz12@users.noreply.github.com> Date: Thu, 29 May 2025 12:58:50 +0000 Subject: [PATCH 3/3] apply code-format changes --- src/layer/x86/spectrogram_x86.cpp | 57 ++++++++++++++++--------------- tests/test_spectrogram.cpp | 14 ++++---- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/layer/x86/spectrogram_x86.cpp b/src/layer/x86/spectrogram_x86.cpp index bf16d9e56..7acd68f3e 100644 --- a/src/layer/x86/spectrogram_x86.cpp +++ b/src/layer/x86/spectrogram_x86.cpp @@ -99,27 +99,28 @@ int Spectrogram_x86::load_param(const ParamDict& pd) if (onesided) { n_freq = n_fft / 2 + 1; - } else + } + else { n_freq = n_fft; } - theta.create(n_fft,n_freq,size_t(8)); + theta.create(n_fft, n_freq, size_t(8)); - for (int i = 0; i(i)[j] = 2 * 3.14159265358979323846 * i * j / n_fft; } } Mat real_basis, imag_basis; - real_basis.create(n_fft,n_freq,size_t(8)); - imag_basis.create(n_fft,n_freq,size_t(8)); + real_basis.create(n_fft, n_freq, size_t(8)); + imag_basis.create(n_fft, n_freq, size_t(8)); - for (int i = 0; i(i)[j] = cos(theta.row(i)[j]); imag_basis.row(i)[j] = -sin(theta.row(i)[j]); @@ -127,9 +128,9 @@ int Spectrogram_x86::load_param(const ParamDict& pd) } // multiply window - for (int i = 0; i(i)[j] *= window_data[j]; imag_basis.row(i)[j] *= window_data[j]; @@ -139,9 +140,9 @@ int Spectrogram_x86::load_param(const ParamDict& pd) if (normalized == 1) { double scale = 1.f / sqrt(n_fft); - for (int i = 0; i(i)[j] *= scale; imag_basis.row(i)[j] *= scale; @@ -149,24 +150,24 @@ int Spectrogram_x86::load_param(const ParamDict& pd) } } - conv_data.create(n_fft,1,n_freq * 2); + conv_data.create(n_fft, 1, n_freq * 2); - for (int i = 0; i(0)[j]= (float)real_basis.row(i)[j]; - conv_data.channel(i+n_freq).row(0)[j] = (float)imag_basis.row(i)[j]; + conv_data.channel(i).row(0)[j] = (float)real_basis.row(i)[j]; + conv_data.channel(i + n_freq).row(0)[j] = (float)imag_basis.row(i)[j]; } } conv_transpose = ncnn::create_layer("Convolution1D"); ncnn::ParamDict conv_transpose_pd; - conv_transpose_pd.set(0,2 * n_freq); // num_output - conv_transpose_pd.set(1,n_fft); // kernel_w - conv_transpose_pd.set(3,hoplen); // stride_w - conv_transpose_pd.set(19,1); // dynamic_weight + conv_transpose_pd.set(0, 2 * n_freq); // num_output + conv_transpose_pd.set(1, n_fft); // kernel_w + conv_transpose_pd.set(3, hoplen); // stride_w + conv_transpose_pd.set(19, 1); // dynamic_weight conv_transpose->load_param(conv_transpose_pd); @@ -226,7 +227,7 @@ int Spectrogram_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option opt_conv.use_packing_layout = false; conv_transpose->create_pipeline(opt_conv); - conv_transpose->forward(inputs,outputs,opt_conv); + conv_transpose->forward(inputs, outputs, opt_conv); conv_transpose->destroy_pipeline(opt_conv); Mat conv_top_blob = outputs[0]; // (2 * n_freq, frames) @@ -235,15 +236,16 @@ int Spectrogram_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option if (power == 0) // as complex { // copy - for (int i = 0; i(i)[0] = conv_top_data[j * frames + i]; top_blob.channel(j).row(i)[1] = conv_top_data[(j + n_freq) * frames + i]; } } - } else + } + else { if (power == 1) // magnitude sqrt(re * re + im * im); { @@ -255,12 +257,13 @@ int Spectrogram_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option top_blob.row(j)[i] = sqrtf(conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]); } } - } else if (power == 2) // power re * re + im * im; + } + else if (power == 2) // power re * re + im * im; { // copy for (int i = 0; i < frames; i++) { - for (int j = 0; j< n_freq; j++) + for (int j = 0; j < n_freq; j++) { top_blob.row(j)[i] = conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]; } diff --git a/tests/test_spectrogram.cpp b/tests/test_spectrogram.cpp index a1a7b3925..a36c2051b 100644 --- a/tests/test_spectrogram.cpp +++ b/tests/test_spectrogram.cpp @@ -50,9 +50,9 @@ static int test_spectrogram_0() || test_spectrogram(124, 55, 2, 12, 55, 1, 1, 2, 2, 0); } -static int test_spectrogram_eval(int size, int n_fft, int power, int hoplen, int winlen, int window_type, int center, int pad_type, int normalized, int onesided,float * in,float * std) +static int test_spectrogram_eval(int size, int n_fft, int power, int hoplen, int winlen, int window_type, int center, int pad_type, int normalized, int onesided, float* in, float* std) { - ncnn::Layer * layer = ncnn::create_layer("Spectrogram"); + ncnn::Layer* layer = ncnn::create_layer("Spectrogram"); ncnn::ParamDict pd; pd.set(0, n_fft); @@ -82,7 +82,7 @@ static int test_spectrogram_eval(int size, int n_fft, int power, int hoplen, int for (int i = 0; i < output.c; i++) { - float * output_data = output.channel(i); + float* output_data = output.channel(i); for (int j = 0; j < output.h; j++) { for (int k = 0; k < output.w; k++) @@ -113,11 +113,9 @@ static int test_spectrogram_1() 0.28284273f, 0.49497476f, 0.70710677f, 0.24271232f, 0.42322639f, 0.60407096f, 0.14577380f, 0.25000000f, 0.35531676f, 0.04838108f, 0.07667736f, 0.10652842f, 0.00000000f, 0.00000002f, 0.00000000f, 0.04838108f, 0.07667736f, 0.10652842f, 0.14577380f, 0.25000000f, 0.35531676f, 0.24271232f, 0.42322639f, 0.60407096f }; - return - test_spectrogram_eval(16, 4, 1, 2, 4, 1, 1, 0, 0, 1, input_0, std_0) - || test_spectrogram_eval(16, 8, 1, 2, 4, 1, 0, 0, 0, 1, input_0, std_1) - || test_spectrogram_eval(16, 8, 1, 3, 4, 1, 0, 0, 1, 0, input_0, std_2); - + return test_spectrogram_eval(16, 4, 1, 2, 4, 1, 1, 0, 0, 1, input_0, std_0) + || test_spectrogram_eval(16, 8, 1, 2, 4, 1, 0, 0, 0, 1, input_0, std_1) + || test_spectrogram_eval(16, 8, 1, 3, 4, 1, 0, 0, 1, 0, input_0, std_2); } int main()