// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "spectrogram_x86.h" namespace ncnn { Spectrogram_x86::Spectrogram_x86() : conv_transpose(0) { one_blob_only = true; support_inplace = false; } Spectrogram_x86::~Spectrogram_x86() { delete conv_transpose; } int Spectrogram_x86::load_param(const ParamDict& pd) { n_fft = pd.get(0, 0); power = pd.get(1, 0); hoplen = pd.get(2, n_fft / 4); winlen = pd.get(3, n_fft); window_type = pd.get(4, 0); center = pd.get(5, 1); pad_type = pd.get(6, 2); normalized = pd.get(7, 0); onesided = pd.get(8, 1); // assert winlen <= n_fft // generate window window_data.create(n_fft); { float* p = window_data; for (int i = 0; i < (n_fft - winlen) / 2; i++) { *p++ = 0.f; } if (window_type == 0) { // all ones for (int i = 0; i < winlen; i++) { *p++ = 1.f; } } if (window_type == 1) { // hann window for (int i = 0; i < winlen; i++) { *p++ = 0.5f * (1 - cosf(2 * 3.14159265358979323846 * i / winlen)); } } if (window_type == 2) { // hamming window for (int i = 0; i < winlen; i++) { *p++ = 0.54f - 0.46f * cosf(2 * 3.14159265358979323846 * i / winlen); } } for (int i = 0; i < n_fft - winlen - (n_fft - winlen) / 2; i++) { *p++ = 0.f; } // pre-calculated window norm factor if (normalized == 2) { float sqsum = 0.f; for (int i = 0; i < n_fft; i++) { sqsum += window_data[i] * window_data[i]; } float scale = 1.f / sqrt(sqsum); for (int i = 0; i < n_fft; i++) { window_data[i] *= scale; } } } Mat theta; if (onesided) { n_freq = n_fft / 2 + 1; } else { n_freq = n_fft; } theta.create(n_fft, n_freq, size_t(8)); for (int i = 0; i < n_freq; i++) { for (int j = 0; j < n_fft; j++) { theta.row(i)[j] = 2 * 3.14159265358979323846 * i * j / n_fft; } } Mat real_basis, imag_basis; real_basis.create(n_fft, n_freq, size_t(8)); imag_basis.create(n_fft, n_freq, size_t(8)); for (int i = 0; i < n_freq; i++) { for (int j = 0; j < n_fft; j++) { real_basis.row(i)[j] = cos(theta.row(i)[j]); imag_basis.row(i)[j] = -sin(theta.row(i)[j]); } } // multiply window for (int i = 0; i < n_freq; i++) { for (int j = 0; j < n_fft; j++) { real_basis.row(i)[j] *= window_data[j]; imag_basis.row(i)[j] *= window_data[j]; } } if (normalized == 1) { double scale = 1.f / sqrt(n_fft); for (int i = 0; i < n_freq; i++) { for (int j = 0; j < n_fft; j++) { real_basis.row(i)[j] *= scale; imag_basis.row(i)[j] *= scale; } } } conv_data.create(n_fft, 1, n_freq * 2); for (int i = 0; i < n_freq; i++) { for (int j = 0; j < n_fft; j++) { conv_data.channel(i).row(0)[j] = (float)real_basis.row(i)[j]; conv_data.channel(i + n_freq).row(0)[j] = (float)imag_basis.row(i)[j]; } } conv_transpose = ncnn::create_layer("Convolution1D"); ncnn::ParamDict conv_transpose_pd; conv_transpose_pd.set(0, 2 * n_freq); // num_output conv_transpose_pd.set(1, n_fft); // kernel_w conv_transpose_pd.set(3, hoplen); // stride_w conv_transpose_pd.set(19, 1); // dynamic_weight conv_transpose->load_param(conv_transpose_pd); return 0; } int Spectrogram_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // https://pytorch.org/audio/stable/generated/torchaudio.functional.spectrogram.html // TODO custom window Mat bottom_blob_bordered = bottom_blob; if (center == 1) { Option opt_b = opt; opt_b.blob_allocator = opt.workspace_allocator; if (pad_type == 0) copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_CONSTANT, 0.f, opt_b); if (pad_type == 1) copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_REPLICATE, 0.f, opt_b); if (pad_type == 2) copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_REFLECT, 0.f, opt_b); } const int size = bottom_blob_bordered.w; // const int frames = size / hoplen + 1; const int frames = (size - n_fft) / hoplen + 1; const size_t elemsize = bottom_blob_bordered.elemsize; if (elemsize != sizeof(float)) { return -100; } if (power == 0) { top_blob.create(2, frames, n_freq, elemsize, opt.blob_allocator); } else { top_blob.create(frames, n_freq, elemsize, opt.blob_allocator); } if (top_blob.empty()) return -100; std::vector inputs; inputs.push_back(bottom_blob_bordered); inputs.push_back(conv_data); std::vector outputs; outputs.push_back(Mat()); Option opt_conv = opt; opt_conv.use_packing_layout = false; conv_transpose->create_pipeline(opt_conv); conv_transpose->forward(inputs, outputs, opt_conv); conv_transpose->destroy_pipeline(opt_conv); Mat conv_top_blob = outputs[0]; // (2 * n_freq, frames) float* conv_top_data = conv_top_blob; if (power == 0) // as complex { // copy for (int i = 0; i < frames; i++) { for (int j = 0; j < n_freq; j++) { top_blob.channel(j).row(i)[0] = conv_top_data[j * frames + i]; top_blob.channel(j).row(i)[1] = conv_top_data[(j + n_freq) * frames + i]; } } } else { if (power == 1) // magnitude sqrt(re * re + im * im); { // copy for (int i = 0; i < frames; i++) { for (int j = 0; j < n_freq; j++) { top_blob.row(j)[i] = sqrtf(conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]); } } } else if (power == 2) // power re * re + im * im; { // copy for (int i = 0; i < frames; i++) { for (int j = 0; j < n_freq; j++) { top_blob.row(j)[i] = conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]; } } } } return 0; } } // namespace ncnn