|
|
|
@@ -0,0 +1,277 @@ |
|
|
|
// Tencent is pleased to support the open source community by making ncnn available. |
|
|
|
// |
|
|
|
// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. |
|
|
|
// |
|
|
|
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except |
|
|
|
// in compliance with the License. You may obtain a copy of the License at |
|
|
|
// |
|
|
|
// https://opensource.org/licenses/BSD-3-Clause |
|
|
|
// |
|
|
|
// Unless required by applicable law or agreed to in writing, software distributed |
|
|
|
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR |
|
|
|
// CONDITIONS OF ANY KIND, either express or implied. See the License for the |
|
|
|
// specific language governing permissions and limitations under the License. |
|
|
|
|
|
|
|
#include "spectrogram_x86.h" |
|
|
|
|
|
|
|
namespace ncnn { |
|
|
|
|
|
|
|
// Construct the layer with no helper convolution layer attached yet;
// load_param() is where conv_transpose gets created.
Spectrogram_x86::Spectrogram_x86()
    : conv_transpose(0)
{
    // the output blob is allocated separately from the input (see forward())
    support_inplace = false;

    // single input blob, single output blob
    one_blob_only = true;
}
|
|
|
|
|
|
|
// Release the helper layer created in load_param().
Spectrogram_x86::~Spectrogram_x86()
{
    // conv_transpose starts out as 0 in the constructor; deleting a null
    // pointer is a no-op, so no explicit guard is required here
    delete conv_transpose;
}
|
|
|
|
|
|
|
int Spectrogram_x86::load_param(const ParamDict& pd) |
|
|
|
{ |
|
|
|
n_fft = pd.get(0, 0); |
|
|
|
power = pd.get(1, 0); |
|
|
|
hoplen = pd.get(2, n_fft / 4); |
|
|
|
winlen = pd.get(3, n_fft); |
|
|
|
window_type = pd.get(4, 0); |
|
|
|
center = pd.get(5, 1); |
|
|
|
pad_type = pd.get(6, 2); |
|
|
|
normalized = pd.get(7, 0); |
|
|
|
onesided = pd.get(8, 1); |
|
|
|
|
|
|
|
// assert winlen <= n_fft |
|
|
|
// generate window |
|
|
|
window_data.create(n_fft); |
|
|
|
{ |
|
|
|
float* p = window_data; |
|
|
|
for (int i = 0; i < (n_fft - winlen) / 2; i++) |
|
|
|
{ |
|
|
|
*p++ = 0.f; |
|
|
|
} |
|
|
|
if (window_type == 0) |
|
|
|
{ |
|
|
|
// all ones |
|
|
|
for (int i = 0; i < winlen; i++) |
|
|
|
{ |
|
|
|
*p++ = 1.f; |
|
|
|
} |
|
|
|
} |
|
|
|
if (window_type == 1) |
|
|
|
{ |
|
|
|
// hann window |
|
|
|
for (int i = 0; i < winlen; i++) |
|
|
|
{ |
|
|
|
*p++ = 0.5f * (1 - cosf(2 * 3.14159265358979323846 * i / winlen)); |
|
|
|
} |
|
|
|
} |
|
|
|
if (window_type == 2) |
|
|
|
{ |
|
|
|
// hamming window |
|
|
|
for (int i = 0; i < winlen; i++) |
|
|
|
{ |
|
|
|
*p++ = 0.54f - 0.46f * cosf(2 * 3.14159265358979323846 * i / winlen); |
|
|
|
} |
|
|
|
} |
|
|
|
for (int i = 0; i < n_fft - winlen - (n_fft - winlen) / 2; i++) |
|
|
|
{ |
|
|
|
*p++ = 0.f; |
|
|
|
} |
|
|
|
|
|
|
|
// pre-calculated window norm factor |
|
|
|
if (normalized == 2) |
|
|
|
{ |
|
|
|
float sqsum = 0.f; |
|
|
|
for (int i = 0; i < n_fft; i++) |
|
|
|
{ |
|
|
|
sqsum += window_data[i] * window_data[i]; |
|
|
|
} |
|
|
|
float scale = 1.f / sqrt(sqsum); |
|
|
|
|
|
|
|
for (int i = 0; i < n_fft; i++) |
|
|
|
{ |
|
|
|
window_data[i] *= scale; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
Mat theta; |
|
|
|
if (onesided) |
|
|
|
{ |
|
|
|
n_freq = n_fft / 2 + 1; |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
n_freq = n_fft; |
|
|
|
} |
|
|
|
theta.create(n_fft, n_freq, size_t(8)); |
|
|
|
|
|
|
|
for (int i = 0; i < n_freq; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < n_fft; j++) |
|
|
|
{ |
|
|
|
theta.row<double>(i)[j] = 2 * 3.14159265358979323846 * i * j / n_fft; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
Mat real_basis, imag_basis; |
|
|
|
real_basis.create(n_fft, n_freq, size_t(8)); |
|
|
|
imag_basis.create(n_fft, n_freq, size_t(8)); |
|
|
|
|
|
|
|
for (int i = 0; i < n_freq; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < n_fft; j++) |
|
|
|
{ |
|
|
|
real_basis.row<double>(i)[j] = cos(theta.row<double>(i)[j]); |
|
|
|
imag_basis.row<double>(i)[j] = -sin(theta.row<double>(i)[j]); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// multiply window |
|
|
|
for (int i = 0; i < n_freq; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < n_fft; j++) |
|
|
|
{ |
|
|
|
real_basis.row<double>(i)[j] *= window_data[j]; |
|
|
|
imag_basis.row<double>(i)[j] *= window_data[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if (normalized == 1) |
|
|
|
{ |
|
|
|
double scale = 1.f / sqrt(n_fft); |
|
|
|
for (int i = 0; i < n_freq; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < n_fft; j++) |
|
|
|
{ |
|
|
|
real_basis.row<double>(i)[j] *= scale; |
|
|
|
imag_basis.row<double>(i)[j] *= scale; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
conv_data.create(n_fft, 1, n_freq * 2); |
|
|
|
|
|
|
|
for (int i = 0; i < n_freq; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < n_fft; j++) |
|
|
|
{ |
|
|
|
conv_data.channel(i).row<float>(0)[j] = (float)real_basis.row<double>(i)[j]; |
|
|
|
conv_data.channel(i + n_freq).row<float>(0)[j] = (float)imag_basis.row<double>(i)[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
conv_transpose = ncnn::create_layer("Convolution1D"); |
|
|
|
ncnn::ParamDict conv_transpose_pd; |
|
|
|
|
|
|
|
conv_transpose_pd.set(0, 2 * n_freq); // num_output |
|
|
|
conv_transpose_pd.set(1, n_fft); // kernel_w |
|
|
|
conv_transpose_pd.set(3, hoplen); // stride_w |
|
|
|
conv_transpose_pd.set(19, 1); // dynamic_weight |
|
|
|
|
|
|
|
conv_transpose->load_param(conv_transpose_pd); |
|
|
|
|
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
int Spectrogram_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const |
|
|
|
{ |
|
|
|
// https://pytorch.org/audio/stable/generated/torchaudio.functional.spectrogram.html |
|
|
|
|
|
|
|
// TODO custom window |
|
|
|
|
|
|
|
Mat bottom_blob_bordered = bottom_blob; |
|
|
|
if (center == 1) |
|
|
|
{ |
|
|
|
Option opt_b = opt; |
|
|
|
opt_b.blob_allocator = opt.workspace_allocator; |
|
|
|
if (pad_type == 0) |
|
|
|
copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_CONSTANT, 0.f, opt_b); |
|
|
|
if (pad_type == 1) |
|
|
|
copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_REPLICATE, 0.f, opt_b); |
|
|
|
if (pad_type == 2) |
|
|
|
copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, n_fft / 2, n_fft / 2, BORDER_REFLECT, 0.f, opt_b); |
|
|
|
} |
|
|
|
|
|
|
|
const int size = bottom_blob_bordered.w; |
|
|
|
|
|
|
|
// const int frames = size / hoplen + 1; |
|
|
|
const int frames = (size - n_fft) / hoplen + 1; |
|
|
|
|
|
|
|
const size_t elemsize = bottom_blob_bordered.elemsize; |
|
|
|
|
|
|
|
if (elemsize != sizeof(float)) |
|
|
|
{ |
|
|
|
return -100; |
|
|
|
} |
|
|
|
|
|
|
|
if (power == 0) |
|
|
|
{ |
|
|
|
top_blob.create(2, frames, n_freq, elemsize, opt.blob_allocator); |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
top_blob.create(frames, n_freq, elemsize, opt.blob_allocator); |
|
|
|
} |
|
|
|
if (top_blob.empty()) |
|
|
|
return -100; |
|
|
|
|
|
|
|
std::vector<Mat> inputs; |
|
|
|
inputs.push_back(bottom_blob_bordered); |
|
|
|
inputs.push_back(conv_data); |
|
|
|
|
|
|
|
std::vector<Mat> outputs; |
|
|
|
outputs.push_back(Mat()); |
|
|
|
|
|
|
|
Option opt_conv = opt; |
|
|
|
opt_conv.use_packing_layout = false; |
|
|
|
|
|
|
|
conv_transpose->create_pipeline(opt_conv); |
|
|
|
conv_transpose->forward(inputs, outputs, opt_conv); |
|
|
|
conv_transpose->destroy_pipeline(opt_conv); |
|
|
|
|
|
|
|
Mat conv_top_blob = outputs[0]; // (2 * n_freq, frames) |
|
|
|
float* conv_top_data = conv_top_blob; |
|
|
|
|
|
|
|
if (power == 0) // as complex |
|
|
|
{ |
|
|
|
// copy |
|
|
|
for (int i = 0; i < frames; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < n_freq; j++) |
|
|
|
{ |
|
|
|
top_blob.channel(j).row<float>(i)[0] = conv_top_data[j * frames + i]; |
|
|
|
top_blob.channel(j).row<float>(i)[1] = conv_top_data[(j + n_freq) * frames + i]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
if (power == 1) // magnitude sqrt(re * re + im * im); |
|
|
|
{ |
|
|
|
// copy |
|
|
|
for (int i = 0; i < frames; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < n_freq; j++) |
|
|
|
{ |
|
|
|
top_blob.row<float>(j)[i] = sqrtf(conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else if (power == 2) // power re * re + im * im; |
|
|
|
{ |
|
|
|
// copy |
|
|
|
for (int i = 0; i < frames; i++) |
|
|
|
{ |
|
|
|
for (int j = 0; j < n_freq; j++) |
|
|
|
{ |
|
|
|
top_blob.row<float>(j)[i] = conv_top_data[j * frames + i] * conv_top_data[j * frames + i] + conv_top_data[(j + n_freq) * frames + i] * conv_top_data[(j + n_freq) * frames + i]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
} // namespace ncnn |