From de1c0861baebb0ee0d243a49ca2c93a3afc1859c Mon Sep 17 00:00:00 2001 From: eptq002345 <1598440105@qq.com> Date: Tue, 5 Sep 2023 17:14:33 +0800 Subject: [PATCH] Update transforms.py --- mindspore/dataset/audio/transforms.py | 948 +++++++++++++++++++++++++- 1 file changed, 946 insertions(+), 2 deletions(-) diff --git a/mindspore/dataset/audio/transforms.py b/mindspore/dataset/audio/transforms.py index d5d03e0f3d..feaf97a39a 100644 --- a/mindspore/dataset/audio/transforms.py +++ b/mindspore/dataset/audio/transforms.py @@ -1,4 +1,4 @@ -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2021-2022 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,950 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================== """ -The module audio.transforms is inherited from _c_dataengine. +The module audio.transforms is inherited from _c_dataengine and is +implemented based on C++. It's a high performance module to process +audio. Users can apply suitable augmentations on audio data to improve +their training models. 
""" + +import numpy as np + +import mindspore._c_dataengine as cde +from ..transforms.c_transforms import TensorOperation +from .utils import BorderType, DensityFunction, FadeShape, GainType, Interpolation, MelType, Modulation, NormType, \ + ScaleType, WindowType +from .validators import check_allpass_biquad, check_amplitude_to_db, check_band_biquad, check_bandpass_biquad, \ + check_bandreject_biquad, check_bass_biquad, check_biquad, check_complex_norm, check_compute_deltas, \ + check_contrast, check_db_to_amplitude, check_dc_shift, check_deemph_biquad, check_detect_pitch_frequency, \ + check_dither, check_equalizer_biquad, check_fade, check_flanger, check_gain, check_griffin_lim, \ + check_highpass_biquad, check_lfilter, check_lowpass_biquad, check_magphase, check_mask_along_axis, \ + check_mask_along_axis_iid, check_masking, check_mel_scale, check_mu_law_coding, check_overdrive, \ + check_phase_vocoder, check_phaser, check_riaa_biquad, check_sliding_window_cmn, check_spectral_centroid, \ + check_spectrogram, check_time_stretch, check_treble_biquad, check_vol + + +class AudioTensorOperation(TensorOperation): + """ + Base class of Audio Tensor Ops. 
+ """ + + def __call__(self, *input_tensor_list): + for tensor in input_tensor_list: + if not isinstance(tensor, (np.ndarray,)): + raise TypeError("Input should be NumPy audio, got {}.".format(type(tensor))) + return super().__call__(*input_tensor_list) + + def parse(self): + raise NotImplementedError("AudioTensorOperation has to implement parse() method.") + +# 给音频波形施加双极点全通滤波器,其中心频率和带宽由入参指定。 +class AllpassBiquad(AudioTensorOperation): + ''' + 构建一个全透过比特平滑的复合提取器 + + Args: + sample_rate: 采样率 + central_freq: 中心频率 + Q: 浓度 + + ''' + @check_allpass_biquad + def __init__(self, sample_rate, central_freq, Q=0.707): + + self.sample_rate = sample_rate + self.central_freq = central_freq + self.Q = Q + + def parse(self): + return cde.AllpassBiquadOperation(self.sample_rate, self.central_freq, self.Q) + + +DE_C_SCALE_TYPE = {ScaleType.POWER: cde.ScaleType.DE_SCALE_TYPE_POWER, + ScaleType.MAGNITUDE: cde.ScaleType.DE_SCALE_TYPE_MAGNITUDE} + +# 将输入音频从振幅/功率标度转换为分贝标度。 +class AmplitudeToDB(AudioTensorOperation): + + ''' + Args: + stype: 输入音频的原始标度 + ref_value: 用于计算分贝系数 + amin: 波形取值下界,低于该值的波形将会被裁切 + top_db: 最小截止分贝值 + + ''' + @check_amplitude_to_db + # 定义AmplitudeToDB类 + def __init__(self, stype=ScaleType.POWER, ref_value=1.0, amin=1e-10, top_db=80.0): + self.stype = stype + self.ref_value = ref_value + self.amin = amin + self.top_db = top_db + + def parse(self): + # 返回AmplitudeToDBOperation类的实例 + return cde.AmplitudeToDBOperation(DE_C_SCALE_TYPE[self.stype], self.ref_value, self.amin, self.top_db) + +# 计算复数序列的角度。 +class Angle(AudioTensorOperation): + ''' + 角度转换,输入为一个音频张量,输出为一个角度张量 + ''' + def parse(self): + return cde.AngleOperation() + +class BandBiquad(AudioTensorOperation): + @check_band_biquad + def __init__(self, sample_rate, central_freq, Q=0.707, noise=False): + self.sample_rate = sample_rate + self.central_freq = central_freq + self.Q = Q + self.noise = noise + + def parse(self): + return cde.BandBiquadOperation(self.sample_rate, self.central_freq, self.Q, self.noise) + +# 
class BandpassBiquad(AudioTensorOperation):
    """
    Apply a two-pole Butterworth band-pass filter to the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        central_freq: Central frequency of the filter.
        Q: Q factor of the filter. Default: 0.707.
        const_skirt_gain: Flag forwarded unchanged to the backend operation.
            Default: False.
    """

    @check_bandpass_biquad
    def __init__(self, sample_rate, central_freq, Q=0.707, const_skirt_gain=False):
        self.sample_rate = sample_rate
        self.central_freq = central_freq
        self.Q = Q
        self.const_skirt_gain = const_skirt_gain

    def parse(self):
        """Return the ``cde.BandpassBiquadOperation`` configured from this instance."""
        return cde.BandpassBiquadOperation(self.sample_rate, self.central_freq, self.Q, self.const_skirt_gain)


class BandrejectBiquad(AudioTensorOperation):
    """
    Apply a band-reject biquad filter to the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        central_freq: Central frequency of the filter.
        Q: Q factor of the filter. Default: 0.707.
    """

    @check_bandreject_biquad
    def __init__(self, sample_rate, central_freq, Q=0.707):
        self.sample_rate = sample_rate
        self.central_freq = central_freq
        self.Q = Q

    def parse(self):
        """Return the ``cde.BandrejectBiquadOperation`` configured from this instance."""
        return cde.BandrejectBiquadOperation(self.sample_rate, self.central_freq, self.Q)


class BassBiquad(AudioTensorOperation):
    """
    Apply a bass tone-control effect (a two-pole low-shelf filter) to the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        gain: Gain applied by the filter.
        central_freq: Central frequency of the filter. Default: 100.0.
        Q: Q factor of the filter. Default: 0.707.
    """

    @check_bass_biquad
    def __init__(self, sample_rate, gain, central_freq=100.0, Q=0.707):
        self.sample_rate = sample_rate
        self.gain = gain
        self.central_freq = central_freq
        self.Q = Q

    def parse(self):
        """Return the ``cde.BassBiquadOperation`` configured from this instance."""
        return cde.BassBiquadOperation(self.sample_rate, self.gain, self.central_freq, self.Q)


# Derives from AudioTensorOperation, like every other audio op in this module,
# so that eager calls get the same NumPy input check.
class Biquad(AudioTensorOperation):
    """
    Apply a biquad filter to the audio waveform.

    Args:
        b0: Filter coefficient b0.
        b1: Filter coefficient b1.
        b2: Filter coefficient b2.
        a0: Filter coefficient a0.
        a1: Filter coefficient a1.
        a2: Filter coefficient a2.
    """

    @check_biquad
    def __init__(self, b0, b1, b2, a0, a1, a2):
        self.b0 = b0
        self.b1 = b1
        self.b2 = b2
        self.a0 = a0
        self.a1 = a1
        self.a2 = a2

    def parse(self):
        """Return the ``cde.BiquadOperation`` configured from this instance."""
        return cde.BiquadOperation(self.b0, self.b1, self.b2, self.a0, self.a1, self.a2)


class ComplexNorm(AudioTensorOperation):
    """
    Compute the norm of a complex sequence.

    Args:
        power: Exponent forwarded to the backend operation. Default: 1.0.
    """

    @check_complex_norm
    def __init__(self, power=1.0):
        self.power = power

    def parse(self):
        """Return the ``cde.ComplexNormOperation`` configured from this instance."""
        return cde.ComplexNormOperation(self.power)


# Maps the Python-side BorderType enum onto the backend enum.
DE_C_BORDER_TYPE = {
    BorderType.CONSTANT: cde.BorderType.DE_BORDER_CONSTANT,
    BorderType.EDGE: cde.BorderType.DE_BORDER_EDGE,
    BorderType.REFLECT: cde.BorderType.DE_BORDER_REFLECT,
    BorderType.SYMMETRIC: cde.BorderType.DE_BORDER_SYMMETRIC,
}


class ComputeDeltas(AudioTensorOperation):
    """
    Compute the delta (differential) coefficients of a spectrogram.

    Args:
        win_length: Window length; stored on the instance as ``win_len``.
            Default: 5.
        pad_mode: Padding mode, a BorderType member. Default: BorderType.EDGE.
    """

    @check_compute_deltas
    def __init__(self, win_length=5, pad_mode=BorderType.EDGE):
        self.win_len = win_length
        self.pad_mode = pad_mode

    def parse(self):
        """Return the ``cde.ComputeDeltasOperation`` configured from this instance."""
        return cde.ComputeDeltasOperation(self.win_len, DE_C_BORDER_TYPE[self.pad_mode])


class Contrast(AudioTensorOperation):
    """
    Apply a contrast enhancement effect to the audio waveform.

    Args:
        enhancement_amount: Amount of enhancement. Default: 75.0.
    """

    @check_contrast
    def __init__(self, enhancement_amount=75.0):
        self.enhancement_amount = enhancement_amount

    def parse(self):
        """Return the ``cde.ContrastOperation`` configured from this instance."""
        return cde.ContrastOperation(self.enhancement_amount)


class DBToAmplitude(AudioTensorOperation):
    """
    Convert the audio waveform from decibels to power or amplitude.

    Args:
        ref: Reference value.
        power: Exponent forwarded to the backend operation.
    """

    @check_db_to_amplitude
    def __init__(self, ref, power):
        self.ref = ref
        self.power = power

    def parse(self):
        """Return the ``cde.DBToAmplitudeOperation`` configured from this instance."""
        return cde.DBToAmplitudeOperation(self.ref, self.power)


class DCShift(AudioTensorOperation):
    """
    Apply a DC shift to the input audio waveform.

    Args:
        shift: Amount of DC shift.
        limiter_gain: Limiter gain. Any falsy value (``None`` and also ``0``)
            is replaced by ``shift``. Default: None.
    """

    @check_dc_shift
    def __init__(self, shift, limiter_gain=None):
        self.shift = shift
        # NOTE(review): truthiness test, so an explicit 0 is treated like
        # "not given" - confirm this is intended before tightening it.
        self.limiter_gain = limiter_gain if limiter_gain else shift

    def parse(self):
        """Return the ``cde.DCShiftOperation`` configured from this instance."""
        return cde.DCShiftOperation(self.shift, self.limiter_gain)


class DeemphBiquad(AudioTensorOperation):
    """
    Apply the CD (IEC 60908) de-emphasis effect, a treble-attenuating shelving filter.

    Args:
        sample_rate: Sampling rate of the waveform.
    """

    @check_deemph_biquad
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def parse(self):
        """Return the ``cde.DeemphBiquadOperation`` configured from this instance."""
        return cde.DeemphBiquadOperation(self.sample_rate)


class DetectPitchFrequency(AudioTensorOperation):
    """
    Detect the pitch frequency of the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        frame_time: Duration of a frame. Default: 0.01.
        win_length: Window length. Default: 30.
        freq_low: Lowest frequency considered. Default: 85.
        freq_high: Highest frequency considered. Default: 3400.
    """

    @check_detect_pitch_frequency
    def __init__(self, sample_rate, frame_time=0.01, win_length=30, freq_low=85, freq_high=3400):
        self.sample_rate = sample_rate
        self.frame_time = frame_time
        self.win_length = win_length
        self.freq_low = freq_low
        self.freq_high = freq_high

    def parse(self):
        """Return the ``cde.DetectPitchFrequencyOperation`` configured from this instance."""
        return cde.DetectPitchFrequencyOperation(self.sample_rate, self.frame_time,
                                                 self.win_length, self.freq_low, self.freq_high)


# Maps the Python-side DensityFunction enum onto the backend enum.
DE_C_DENSITY_FUNCTION = {DensityFunction.TPDF: cde.DensityFunction.DE_DENSITY_FUNCTION_TPDF,
                         DensityFunction.RPDF: cde.DensityFunction.DE_DENSITY_FUNCTION_RPDF,
                         DensityFunction.GPDF: cde.DensityFunction.DE_DENSITY_FUNCTION_GPDF}


class Dither(AudioTensorOperation):
    """
    Dither the audio to increase the perceived dynamic range of audio stored
    at a particular bit depth, by eliminating nonlinear truncation distortion.

    Args:
        density_function: DensityFunction member selecting the dither noise
            density. Default: DensityFunction.TPDF.
        noise_shaping: Whether to apply noise shaping. Default: False.
    """

    @check_dither
    def __init__(self, density_function=DensityFunction.TPDF, noise_shaping=False):
        self.density_function = density_function
        self.noise_shaping = noise_shaping

    def parse(self):
        """Return the ``cde.DitherOperation`` configured from this instance."""
        return cde.DitherOperation(DE_C_DENSITY_FUNCTION[self.density_function], self.noise_shaping)


class EqualizerBiquad(AudioTensorOperation):
    """
    Apply a biquad equalizer filter to the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        center_freq: Central frequency of the filter.
        gain: Gain applied by the filter.
        Q: Q factor of the filter. Default: 0.707.
    """

    @check_equalizer_biquad
    def __init__(self, sample_rate, center_freq, gain, Q=0.707):
        self.sample_rate = sample_rate
        self.center_freq = center_freq
        self.gain = gain
        self.Q = Q

    def parse(self):
        """Return the ``cde.EqualizerBiquadOperation`` configured from this instance."""
        return cde.EqualizerBiquadOperation(self.sample_rate, self.center_freq, self.gain, self.Q)


# Maps the Python-side FadeShape enum onto the backend enum.
DE_C_FADE_SHAPE = {FadeShape.QUARTER_SINE: cde.FadeShape.DE_FADE_SHAPE_QUARTER_SINE,
                   FadeShape.HALF_SINE: cde.FadeShape.DE_FADE_SHAPE_HALF_SINE,
                   FadeShape.LINEAR: cde.FadeShape.DE_FADE_SHAPE_LINEAR,
                   FadeShape.LOGARITHMIC: cde.FadeShape.DE_FADE_SHAPE_LOGARITHMIC,
                   FadeShape.EXPONENTIAL: cde.FadeShape.DE_FADE_SHAPE_EXPONENTIAL}


class Fade(AudioTensorOperation):
    """
    Add a fade-in and/or fade-out to the waveform.

    Args:
        fade_in_len: Length of the fade-in. Default: 0.
        fade_out_len: Length of the fade-out. Default: 0.
        fade_shape: FadeShape member selecting the fade curve.
            Default: FadeShape.LINEAR.
    """

    @check_fade
    def __init__(self, fade_in_len=0, fade_out_len=0, fade_shape=FadeShape.LINEAR):
        self.fade_in_len = fade_in_len
        self.fade_out_len = fade_out_len
        self.fade_shape = fade_shape

    def parse(self):
        """Return the ``cde.FadeOperation`` configured from this instance."""
        return cde.FadeOperation(self.fade_in_len, self.fade_out_len, DE_C_FADE_SHAPE[self.fade_shape])


# Map the Python-side Modulation / Interpolation enums onto the backend enums.
DE_C_MODULATION = {Modulation.SINUSOIDAL: cde.Modulation.DE_MODULATION_SINUSOIDAL,
                   Modulation.TRIANGULAR: cde.Modulation.DE_MODULATION_TRIANGULAR}

DE_C_INTERPOLATION = {Interpolation.LINEAR: cde.Interpolation.DE_INTERPOLATION_LINEAR,
                      Interpolation.QUADRATIC: cde.Interpolation.DE_INTERPOLATION_QUADRATIC}


class Flanger(AudioTensorOperation):
    """
    Apply a flanger effect to the audio.

    Args:
        sample_rate: Sampling rate of the waveform.
        delay: Flanger delay parameter. Default: 0.0.
        depth: Flanger depth parameter. Default: 2.0.
        regen: Flanger regeneration parameter. Default: 0.0.
        width: Flanger width parameter. Default: 71.0.
        speed: Flanger speed parameter. Default: 0.5.
        phase: Flanger phase parameter. Default: 25.0.
        modulation: Modulation member. Default: Modulation.SINUSOIDAL.
        interpolation: Interpolation member. Default: Interpolation.LINEAR.
    """

    @check_flanger
    def __init__(self, sample_rate, delay=0.0, depth=2.0, regen=0.0, width=71.0, speed=0.5,
                 phase=25.0, modulation=Modulation.SINUSOIDAL, interpolation=Interpolation.LINEAR):
        self.sample_rate = sample_rate
        self.delay = delay
        self.depth = depth
        self.regen = regen
        self.width = width
        self.speed = speed
        self.phase = phase
        self.modulation = modulation
        self.interpolation = interpolation

    def parse(self):
        """Return the ``cde.FlangerOperation`` configured from this instance."""
        return cde.FlangerOperation(self.sample_rate, self.delay, self.depth, self.regen, self.width, self.speed,
                                    self.phase, DE_C_MODULATION[self.modulation],
                                    DE_C_INTERPOLATION[self.interpolation])


class FrequencyMasking(AudioTensorOperation):
    """
    Apply masking to the audio in the frequency domain.

    Args:
        iid_masks: Whether to use IID masks. Default: False.
        freq_mask_param: Frequency mask parameter; stored on the instance as
            ``frequency_mask_param``. Default: 0.
        mask_start: Start position of the mask. Default: 0.
        mask_value: Value written into the masked region. Default: 0.0.
    """

    @check_masking
    def __init__(self, iid_masks=False, freq_mask_param=0, mask_start=0, mask_value=0.0):
        self.iid_masks = iid_masks
        self.frequency_mask_param = freq_mask_param
        self.mask_start = mask_start
        self.mask_value = mask_value

    def parse(self):
        """Return the ``cde.FrequencyMaskingOperation`` configured from this instance."""
        return cde.FrequencyMaskingOperation(self.iid_masks, self.frequency_mask_param, self.mask_start,
                                             self.mask_value)


class Gain(AudioTensorOperation):
    """
    Amplify or attenuate the whole audio waveform.

    Args:
        gain_db: Gain value. Default: 1.0.
    """

    @check_gain
    def __init__(self, gain_db=1.0):
        self.gain_db = gain_db

    def parse(self):
        """Return the ``cde.GainOperation`` configured from this instance."""
        return cde.GainOperation(self.gain_db)


class GriffinLim(AudioTensorOperation):
    """
    Compute a waveform from a linear-magnitude spectrogram using the Griffin-Lim algorithm.

    Args:
        n_fft: FFT size. Default: 400.
        n_iter: Number of iterations. Default: 32.
        win_length: Window length. A falsy value is replaced by ``n_fft``.
            Default: None.
        hop_length: Hop length. A falsy value is replaced by
            ``win_length // 2``. Default: None.
        window_type: WindowType member. Default: WindowType.HANN.
        power: Exponent forwarded to the backend operation. Default: 2.
        momentum: Momentum parameter. Default: 0.99.
        length: Expected output length. A falsy value is replaced by 0.
            Default: None.
        rand_init: Whether to use random initialization. Default: True.
    """

    @check_griffin_lim
    def __init__(self, n_fft=400, n_iter=32, win_length=None, hop_length=None, window_type=WindowType.HANN,
                 power=2, momentum=0.99, length=None, rand_init=True):
        self.n_fft = n_fft
        self.n_iter = n_iter
        self.win_length = win_length if win_length else self.n_fft
        self.hop_length = hop_length if hop_length else self.win_length // 2
        self.window_type = window_type
        self.power = power
        self.momentum = momentum
        self.length = length if length else 0
        self.rand_init = rand_init

    def parse(self):
        """Return the ``cde.GriffinLimOperation`` configured from this instance."""
        # Index the table like every other op in this module does, so an
        # unmapped window type fails here instead of reaching the backend as None.
        # DE_C_WINDOW_TYPE is a module-level table defined further down the file;
        # it is only looked up when parse() runs.
        return cde.GriffinLimOperation(self.n_fft, self.n_iter, self.win_length, self.hop_length,
                                       DE_C_WINDOW_TYPE[self.window_type], self.power, self.momentum, self.length,
                                       self.rand_init)


class HighpassBiquad(AudioTensorOperation):
    """
    Apply a biquad high-pass filter to the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        cutoff_freq: Cut-off frequency of the filter.
        Q: Q factor of the filter. Default: 0.707.
    """

    @check_highpass_biquad
    def __init__(self, sample_rate, cutoff_freq, Q=0.707):
        self.sample_rate = sample_rate
        self.cutoff_freq = cutoff_freq
        self.Q = Q

    def parse(self):
        """Return the ``cde.HighpassBiquadOperation`` configured from this instance."""
        return cde.HighpassBiquadOperation(self.sample_rate, self.cutoff_freq, self.Q)


class LFilter(AudioTensorOperation):
    """
    Apply an IIR filter described by the given difference equation.

    Args:
        a_coeffs: The ``a`` coefficients of the difference equation.
        b_coeffs: The ``b`` coefficients of the difference equation.
        clamp: Clamp flag forwarded unchanged to the backend operation.
            Default: True.
    """

    @check_lfilter
    def __init__(self, a_coeffs, b_coeffs, clamp=True):
        self.a_coeffs = a_coeffs
        self.b_coeffs = b_coeffs
        self.clamp = clamp

    def parse(self):
        """Return the ``cde.LFilterOperation`` configured from this instance."""
        return cde.LFilterOperation(self.a_coeffs, self.b_coeffs, self.clamp)


class LowpassBiquad(AudioTensorOperation):
    """
    Apply a two-pole low-pass filter to the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        cutoff_freq: Cut-off frequency of the filter.
        Q: Q factor of the filter. Default: 0.707.
    """

    @check_lowpass_biquad
    def __init__(self, sample_rate, cutoff_freq, Q=0.707):
        self.sample_rate = sample_rate
        self.cutoff_freq = cutoff_freq
        self.Q = Q

    def parse(self):
        """Return the ``cde.LowpassBiquadOperation`` configured from this instance."""
        return cde.LowpassBiquadOperation(self.sample_rate, self.cutoff_freq, self.Q)


class Magphase(AudioTensorOperation):
    """
    Separate a complex-valued spectrogram of shape (..., 2) into magnitude and phase.

    Args:
        power: Exponent forwarded to the backend operation. Default: 1.0.
    """

    @check_magphase
    def __init__(self, power=1.0):
        self.power = power

    def parse(self):
        """Return the ``cde.MagphaseOperation`` configured from this instance."""
        return cde.MagphaseOperation(self.power)


class MaskAlongAxis(AudioTensorOperation):
    """
    Apply a mask to the audio along the given axis.

    Args:
        mask_start: Start position of the mask.
        mask_width: Width of the mask.
        mask_value: Value written into the masked region.
        axis: Axis along which the mask is applied.
    """

    @check_mask_along_axis
    def __init__(self, mask_start, mask_width, mask_value, axis):
        self.mask_start = mask_start
        self.mask_width = mask_width
        self.mask_value = mask_value
        self.axis = axis

    def parse(self):
        """Return the ``cde.MaskAlongAxisOperation`` configured from this instance."""
        return cde.MaskAlongAxisOperation(self.mask_start, self.mask_width, self.mask_value, self.axis)


class MaskAlongAxisIID(AudioTensorOperation):
    """
    Apply a mask to the audio along the ``axis`` axis.

    Args:
        mask_param: Mask parameter forwarded to the backend operation.
        mask_value: Value written into the masked region.
        axis: Axis along which the mask is applied.
    """

    @check_mask_along_axis_iid
    def __init__(self, mask_param, mask_value, axis):
        self.mask_param = mask_param
        self.mask_value = mask_value
        self.axis = axis

    def parse(self):
        """Return the ``cde.MaskAlongAxisIIDOperation`` configured from this instance."""
        return cde.MaskAlongAxisIIDOperation(self.mask_param, self.mask_value, self.axis)


# Map the Python-side MelType / NormType enums onto the backend enums.
DE_C_MEL_TYPE = {MelType.SLANEY: cde.MelType.DE_MEL_TYPE_SLANEY,
                 MelType.HTK: cde.MelType.DE_MEL_TYPE_HTK}

DE_C_NORM_TYPE = {NormType.NONE: cde.NormType.DE_NORM_TYPE_NONE,
                  NormType.SLANEY: cde.NormType.DE_NORM_TYPE_SLANEY}


class MelScale(AudioTensorOperation):
    """
    Convert a normal STFT into a mel-scale STFT.

    Args:
        n_mels: Number of mel bins. Default: 128.
        sample_rate: Sampling rate of the waveform. Default: 16000.
        f_min: Minimum frequency. Default: 0.
        f_max: Maximum frequency. ``None`` is replaced by
            ``sample_rate // 2``. Default: None.
        n_stft: Number of STFT bins. Default: 201.
        norm: NormType member. Default: NormType.NONE.
        mel_type: MelType member. Default: MelType.HTK.
    """

    @check_mel_scale
    def __init__(self, n_mels=128, sample_rate=16000, f_min=0, f_max=None, n_stft=201, norm=NormType.NONE,
                 mel_type=MelType.HTK):
        self.n_mels = n_mels
        self.sample_rate = sample_rate
        self.f_min = f_min
        self.f_max = f_max if f_max is not None else sample_rate // 2
        self.n_stft = n_stft
        self.norm = norm
        self.mel_type = mel_type

    def parse(self):
        """Return the ``cde.MelScaleOperation`` configured from this instance."""
        return cde.MelScaleOperation(self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_stft,
                                     DE_C_NORM_TYPE[self.norm], DE_C_MEL_TYPE[self.mel_type])

#
class MuLawDecoding(AudioTensorOperation):
    """
    Decode a mu-law encoded signal (see the mu-law algorithm).

    Args:
        quantization_channels: Number of quantization channels. Default: 256.
    """

    @check_mu_law_coding
    def __init__(self, quantization_channels=256):
        self.quantization_channels = quantization_channels

    def parse(self):
        """Return the ``cde.MuLawDecodingOperation`` configured from this instance."""
        return cde.MuLawDecodingOperation(self.quantization_channels)


class MuLawEncoding(AudioTensorOperation):
    """
    Encode a signal using mu-law companding.

    Args:
        quantization_channels: Number of quantization channels. Default: 256.
    """

    @check_mu_law_coding
    def __init__(self, quantization_channels=256):
        self.quantization_channels = quantization_channels

    def parse(self):
        """Return the ``cde.MuLawEncodingOperation`` configured from this instance."""
        return cde.MuLawEncodingOperation(self.quantization_channels)


class Overdrive(AudioTensorOperation):
    """
    Apply an overdrive effect to the audio waveform.

    Args:
        gain: Overdrive gain parameter. Default: 20.0.
        color: Overdrive color parameter. Default: 20.0.
    """

    @check_overdrive
    def __init__(self, gain=20.0, color=20.0):
        self.gain = gain
        self.color = color

    def parse(self):
        """Return the ``cde.OverdriveOperation`` configured from this instance."""
        return cde.OverdriveOperation(self.gain, self.color)


class Phaser(AudioTensorOperation):
    """
    Apply a phasing effect to the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        gain_in: Input gain. Default: 0.4.
        gain_out: Output gain. Default: 0.74.
        delay_ms: Delay in milliseconds. Default: 3.0.
        decay: Decay coefficient. Default: 0.4.
        mod_speed: Modulation speed. Default: 0.5.
        sinusoidal: Flag forwarded unchanged to the backend operation.
            Default: True.
    """

    @check_phaser
    def __init__(self, sample_rate, gain_in=0.4, gain_out=0.74,
                 delay_ms=3.0, decay=0.4, mod_speed=0.5, sinusoidal=True):
        self.decay = decay
        self.delay_ms = delay_ms
        self.gain_in = gain_in
        self.gain_out = gain_out
        self.mod_speed = mod_speed
        self.sample_rate = sample_rate
        self.sinusoidal = sinusoidal

    def parse(self):
        """Return the ``cde.PhaserOperation`` configured from this instance."""
        return cde.PhaserOperation(self.sample_rate, self.gain_in, self.gain_out,
                                   self.delay_ms, self.decay, self.mod_speed, self.sinusoidal)


class PhaseVocoder(AudioTensorOperation):
    """
    Speed up the given STFT spectrum by a rate without changing the pitch.

    Args:
        rate: Speed-up rate.
        phase_advance: Phase-advance data; it is wrapped in ``cde.Tensor``
            when the operation is constructed.
    """

    @check_phase_vocoder
    def __init__(self, rate, phase_advance):
        self.rate = rate
        self.phase_advance = cde.Tensor(phase_advance)

    def parse(self):
        """Return the ``cde.PhaseVocoderOperation`` configured from this instance."""
        return cde.PhaseVocoderOperation(self.rate, self.phase_advance)


class RiaaBiquad(AudioTensorOperation):
    """
    Apply RIAA equalization to the input audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
    """

    @check_riaa_biquad
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def parse(self):
        """Return the ``cde.RiaaBiquadOperation`` configured from this instance."""
        return cde.RiaaBiquadOperation(self.sample_rate)


class SlidingWindowCmn(AudioTensorOperation):
    """
    Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.

    Args:
        cmn_window: Length of the sliding window. Default: 600.
        min_cmn_window: Minimum window length. Default: 100.
        center: Flag forwarded unchanged to the backend operation.
            Default: False.
        norm_vars: Whether to normalize the variance as well. Default: False.
    """

    @check_sliding_window_cmn
    def __init__(self, cmn_window=600, min_cmn_window=100, center=False, norm_vars=False):
        self.cmn_window = cmn_window
        self.min_cmn_window = min_cmn_window
        self.center = center
        self.norm_vars = norm_vars

    def parse(self):
        """Return the ``cde.SlidingWindowCmnOperation`` configured from this instance."""
        return cde.SlidingWindowCmnOperation(self.cmn_window, self.min_cmn_window, self.center, self.norm_vars)


# Maps the Python-side WindowType enum onto the backend enum.
DE_C_WINDOW_TYPE = {WindowType.BARTLETT: cde.WindowType.DE_WINDOW_TYPE_BARTLETT,
                    WindowType.BLACKMAN: cde.WindowType.DE_WINDOW_TYPE_BLACKMAN,
                    WindowType.HAMMING: cde.WindowType.DE_WINDOW_TYPE_HAMMING,
                    WindowType.HANN: cde.WindowType.DE_WINDOW_TYPE_HANN,
                    WindowType.KAISER: cde.WindowType.DE_WINDOW_TYPE_KAISER}


# Derives from AudioTensorOperation, like the other audio ops in this module,
# so that eager calls get the same NumPy input check.
class SpectralCentroid(AudioTensorOperation):
    """
    Compute the spectral centroid of each channel along the time axis.

    Args:
        sample_rate: Sampling rate of the waveform.
        n_fft: FFT size. Default: 400.
        win_length: Window length. A falsy value is replaced by ``n_fft``.
            Default: None.
        hop_length: Hop length. A falsy value is replaced by
            ``win_length // 2``. Default: None.
        pad: Amount of padding. Default: 0.
        window: WindowType member. Default: WindowType.HANN.
    """

    @check_spectral_centroid
    def __init__(self, sample_rate, n_fft=400, win_length=None, hop_length=None, pad=0, window=WindowType.HANN):
        self.sample_rate = sample_rate
        self.pad = pad
        self.window = window
        self.n_fft = n_fft
        self.win_length = win_length if win_length else n_fft
        self.hop_length = hop_length if hop_length else self.win_length // 2

    def parse(self):
        """Return the ``cde.SpectralCentroidOperation`` configured from this instance."""
        return cde.SpectralCentroidOperation(self.sample_rate, self.n_fft, self.win_length, self.hop_length,
                                             self.pad, DE_C_WINDOW_TYPE[self.window])


# Derives from AudioTensorOperation for the same reason as SpectralCentroid.
class Spectrogram(AudioTensorOperation):
    """
    Create a spectrogram from an audio signal.

    Args:
        n_fft: FFT size. Default: 400.
        win_length: Window length. A falsy value is replaced by ``n_fft``.
            Default: None.
        hop_length: Hop length. A falsy value is replaced by
            ``win_length // 2``. Default: None.
        pad: Amount of padding. Default: 0.
        window: WindowType member. Default: WindowType.HANN.
        power: Exponent forwarded to the backend operation. Default: 2.0.
        normalized: Normalization flag forwarded to the backend.
            Default: False.
        center: Centering flag forwarded to the backend. Default: True.
        pad_mode: BorderType member. Default: BorderType.REFLECT.
        onesided: One-sided flag forwarded to the backend. Default: True.
    """

    @check_spectrogram
    def __init__(self, n_fft=400, win_length=None, hop_length=None, pad=0, window=WindowType.HANN, power=2.0,
                 normalized=False, center=True, pad_mode=BorderType.REFLECT, onesided=True):
        self.n_fft = n_fft
        self.win_length = win_length if win_length else n_fft
        self.hop_length = hop_length if hop_length else self.win_length // 2
        self.pad = pad
        self.window = window
        self.power = power
        self.normalized = normalized
        self.center = center
        self.pad_mode = pad_mode
        self.onesided = onesided

    def parse(self):
        """Return the ``cde.SpectrogramOperation`` configured from this instance."""
        # DE_C_BORDER_TYPE is a module-level table defined earlier in the file.
        return cde.SpectrogramOperation(self.n_fft, self.win_length, self.hop_length, self.pad,
                                        DE_C_WINDOW_TYPE[self.window], self.power, self.normalized,
                                        self.center, DE_C_BORDER_TYPE[self.pad_mode], self.onesided)


class TimeMasking(AudioTensorOperation):
    """
    Apply masking to the audio in the time domain.

    Args:
        iid_masks: Whether to use IID masks. Default: False.
        time_mask_param: Time mask parameter. Default: 0.
        mask_start: Start position of the mask. Default: 0.
        mask_value: Value written into the masked region. Default: 0.0.
    """

    @check_masking
    def __init__(self, iid_masks=False, time_mask_param=0, mask_start=0, mask_value=0.0):
        self.iid_masks = iid_masks
        self.time_mask_param = time_mask_param
        self.mask_start = mask_start
        self.mask_value = mask_value

    def parse(self):
        """Return the ``cde.TimeMaskingOperation`` configured from this instance."""
        return cde.TimeMaskingOperation(self.iid_masks, self.time_mask_param, self.mask_start, self.mask_value)


class TimeStretch(AudioTensorOperation):
    """
    Stretch the Short Time Fourier Transform (STFT) spectrum of the audio in
    time by a given rate without changing the pitch.

    Args:
        hop_length: Hop length. ``None`` is replaced by ``n_fft // 2`` where
            ``n_fft = (n_freq - 1) * 2``. Default: None.
        n_freq: Number of frequency bins. Default: 201.
        fixed_rate: Stretch rate. ``None`` is replaced by 1. Default: None.
    """

    @check_time_stretch
    def __init__(self, hop_length=None, n_freq=201, fixed_rate=None):
        self.n_freq = n_freq

        # The FFT size is derived from the number of frequency bins and is
        # only needed to pick the default hop length.
        n_fft = (n_freq - 1) * 2
        self.hop_length = hop_length if hop_length is not None else n_fft // 2
        self.fixed_rate = fixed_rate if fixed_rate is not None else 1

    def parse(self):
        """Return the ``cde.TimeStretchOperation`` configured from this instance."""
        return cde.TimeStretchOperation(self.hop_length, self.n_freq, self.fixed_rate)


class TrebleBiquad(AudioTensorOperation):
    """
    Apply a treble tone-control effect to the audio waveform.

    Args:
        sample_rate: Sampling rate of the waveform.
        gain: Gain applied by the filter.
        central_freq: Central frequency of the filter. Default: 3000.
        Q: Q factor of the filter. Default: 0.707.
    """

    @check_treble_biquad
    def __init__(self, sample_rate, gain, central_freq=3000, Q=0.707):
        self.sample_rate = sample_rate
        self.gain = gain
        self.central_freq = central_freq
        self.Q = Q

    def parse(self):
        """Return the ``cde.TrebleBiquadOperation`` configured from this instance."""
        return cde.TrebleBiquadOperation(self.sample_rate, self.gain, self.central_freq, self.Q)


# Maps the Python-side GainType enum onto the backend enum.
DE_C_GAIN_TYPE = {GainType.AMPLITUDE: cde.GainType.DE_GAIN_TYPE_AMPLITUDE,
                  GainType.POWER: cde.GainType.DE_GAIN_TYPE_POWER,
                  GainType.DB: cde.GainType.DE_GAIN_TYPE_DB}


class Vol(AudioTensorOperation):
    """
    Adjust the volume of the waveform.

    Args:
        gain: Gain value.
        gain_type: GainType member describing how ``gain`` is expressed.
            Default: GainType.AMPLITUDE.
    """

    @check_vol
    def __init__(self, gain, gain_type=GainType.AMPLITUDE):
        self.gain = gain
        self.gain_type = gain_type

    def parse(self):
        """Return the ``cde.VolOperation`` configured from this instance."""
        return cde.VolOperation(self.gain, DE_C_GAIN_TYPE[self.gain_type])