# Copyright 2021-2022 Huawei Technologies Co., Ltd
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
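# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,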
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
"""
The module audio.transforms is inherited from _c_dataengine and is
implemented based on C++. It is a high-performance module for processing
audio. Users can apply suitable augmentations to audio data to improve
their training models.
"""
| import numpy as np | |||
| import mindspore._c_dataengine as cde | |||
| from ..transforms.c_transforms import TensorOperation | |||
| from .utils import BorderType, DensityFunction, FadeShape, GainType, Interpolation, MelType, Modulation, NormType, \ | |||
| ScaleType, WindowType | |||
| from .validators import check_allpass_biquad, check_amplitude_to_db, check_band_biquad, check_bandpass_biquad, \ | |||
| check_bandreject_biquad, check_bass_biquad, check_biquad, check_complex_norm, check_compute_deltas, \ | |||
| check_contrast, check_db_to_amplitude, check_dc_shift, check_deemph_biquad, check_detect_pitch_frequency, \ | |||
| check_dither, check_equalizer_biquad, check_fade, check_flanger, check_gain, check_griffin_lim, \ | |||
| check_highpass_biquad, check_lfilter, check_lowpass_biquad, check_magphase, check_mask_along_axis, \ | |||
| check_mask_along_axis_iid, check_masking, check_mel_scale, check_mu_law_coding, check_overdrive, \ | |||
| check_phase_vocoder, check_phaser, check_riaa_biquad, check_sliding_window_cmn, check_spectral_centroid, \ | |||
| check_spectrogram, check_time_stretch, check_treble_biquad, check_vol | |||
class AudioTensorOperation(TensorOperation):
    """
    Base class of audio tensor operations.

    Eager calls accept only NumPy arrays; concrete operations must override
    ``parse()``.
    """

    def __call__(self, *input_tensor_list):
        """
        Run the operation eagerly on the given inputs.

        Args:
            *input_tensor_list: Inputs to process; each must be a ``numpy.ndarray``.

        Returns:
            Whatever ``TensorOperation.__call__`` returns for these inputs.

        Raises:
            TypeError: If any input is not a ``numpy.ndarray``.
        """
        for candidate in input_tensor_list:
            if isinstance(candidate, np.ndarray):
                continue
            raise TypeError("Input should be NumPy audio, got {}.".format(type(candidate)))
        return super().__call__(*input_tensor_list)

    def parse(self):
        """Build the backing C++ operation; must be overridden by subclasses."""
        raise NotImplementedError("AudioTensorOperation has to implement parse() method.")
class AllpassBiquad(AudioTensorOperation):
    """
    Apply a two-pole all-pass filter to an audio waveform, with the central
    frequency and bandwidth determined by the arguments.

    The constructor is wrapped by the ``check_allpass_biquad`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        central_freq: Central frequency.
        Q: Forwarded unchanged to the C++ operation; presumably the filter
            quality factor (TODO confirm). Default: 0.707.
    """

    @check_allpass_biquad
    def __init__(self, sample_rate, central_freq, Q=0.707):
        self.sample_rate, self.central_freq, self.Q = sample_rate, central_freq, Q

    def parse(self):
        """Return the ``cde.AllpassBiquadOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.central_freq, self.Q)
        return cde.AllpassBiquadOperation(*settings)
# Python-side ScaleType members mapped to their C++ (cde) counterparts.
DE_C_SCALE_TYPE = {
    ScaleType.POWER: cde.ScaleType.DE_SCALE_TYPE_POWER,
    ScaleType.MAGNITUDE: cde.ScaleType.DE_SCALE_TYPE_MAGNITUDE,
}
class AmplitudeToDB(AudioTensorOperation):
    """
    Convert the input audio from the amplitude/power scale to the decibel scale.

    The constructor is wrapped by the ``check_amplitude_to_db`` validator.

    Args:
        stype: Original scale of the input audio (a ``ScaleType`` member).
            Default: ``ScaleType.POWER``.
        ref_value: Value used to compute the decibel coefficient. Default: 1.0.
        amin: Lower bound of the waveform values; smaller values are clipped.
            Default: 1e-10.
        top_db: Minimum cut-off decibel value. Default: 80.0.
    """

    @check_amplitude_to_db
    def __init__(self, stype=ScaleType.POWER, ref_value=1.0, amin=1e-10, top_db=80.0):
        self.stype, self.ref_value = stype, ref_value
        self.amin, self.top_db = amin, top_db

    def parse(self):
        """Return the ``cde.AmplitudeToDBOperation`` built from the stored settings."""
        # Translate the Python enum member into the C++ enum before crossing the boundary.
        c_scale = DE_C_SCALE_TYPE[self.stype]
        return cde.AmplitudeToDBOperation(c_scale, self.ref_value, self.amin, self.top_db)
class Angle(AudioTensorOperation):
    """
    Compute the angle of a complex sequence.

    Takes no constructor arguments.
    """

    def parse(self):
        """Return a new ``cde.AngleOperation``."""
        operation = cde.AngleOperation()
        return operation
class BandBiquad(AudioTensorOperation):
    """
    Audio operation backed by ``cde.BandBiquadOperation``.

    NOTE(review): judging by the name this is a band-pass style biquad filter;
    confirm against the C++ implementation.

    The constructor is wrapped by the ``check_band_biquad`` validator.

    Args:
        sample_rate: Forwarded to the C++ operation; presumably the sampling
            rate of the waveform (TODO confirm).
        central_freq: Forwarded to the C++ operation; presumably the central
            frequency (TODO confirm).
        Q: Forwarded to the C++ operation; presumably the filter quality
            factor (TODO confirm). Default: 0.707.
        noise: Flag forwarded to the C++ operation. Default: False.
    """

    @check_band_biquad
    def __init__(self, sample_rate, central_freq, Q=0.707, noise=False):
        self.sample_rate, self.central_freq = sample_rate, central_freq
        self.Q, self.noise = Q, noise

    def parse(self):
        """Return the ``cde.BandBiquadOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.central_freq, self.Q, self.noise)
        return cde.BandBiquadOperation(*settings)
class BandpassBiquad(AudioTensorOperation):
    """
    Apply a two-pole Butterworth band-pass filter to an audio waveform.

    The constructor is wrapped by the ``check_bandpass_biquad`` validator.

    Args:
        sample_rate: Forwarded to the C++ operation; presumably the sampling
            rate of the waveform (TODO confirm).
        central_freq: Forwarded to the C++ operation; presumably the central
            frequency (TODO confirm).
        Q: Forwarded to the C++ operation; presumably the filter quality
            factor (TODO confirm). Default: 0.707.
        const_skirt_gain: Flag forwarded to the C++ operation. Default: False.
    """

    @check_bandpass_biquad
    def __init__(self, sample_rate, central_freq, Q=0.707, const_skirt_gain=False):
        self.sample_rate, self.central_freq = sample_rate, central_freq
        self.Q, self.const_skirt_gain = Q, const_skirt_gain

    def parse(self):
        """Return the ``cde.BandpassBiquadOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.central_freq, self.Q, self.const_skirt_gain)
        return cde.BandpassBiquadOperation(*settings)
class BandrejectBiquad(AudioTensorOperation):
    """
    Audio operation backed by ``cde.BandrejectBiquadOperation``.

    NOTE(review): the previous comments described this as a Butterworth
    band-pass filter, which looks copied from ``BandpassBiquad``; the name
    indicates a band-reject filter. Confirm against the C++ implementation.

    The constructor is wrapped by the ``check_bandreject_biquad`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        central_freq: Central frequency.
        Q: Forwarded unchanged to the C++ operation; presumably the filter
            quality factor (TODO confirm). Default: 0.707.
    """

    @check_bandreject_biquad
    def __init__(self, sample_rate, central_freq, Q=0.707):
        self.sample_rate, self.central_freq, self.Q = sample_rate, central_freq, Q

    def parse(self):
        """Return the ``cde.BandrejectBiquadOperation`` built from the stored settings."""
        operation = cde.BandrejectBiquadOperation(self.sample_rate, self.central_freq, self.Q)
        return operation
class BassBiquad(AudioTensorOperation):
    """
    Apply a bass tone-control effect, i.e. a two-pole low-shelf filter, to an
    audio waveform.

    The constructor is wrapped by the ``check_bass_biquad`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        gain: Gain applied by the filter.
        central_freq: Central frequency. Default: 100.0.
        Q: Forwarded unchanged to the C++ operation; presumably the filter
            quality factor (TODO confirm). Default: 0.707.
    """

    @check_bass_biquad
    def __init__(self, sample_rate, gain, central_freq=100.0, Q=0.707):
        self.sample_rate, self.gain = sample_rate, gain
        self.central_freq, self.Q = central_freq, Q

    def parse(self):
        """Return the ``cde.BassBiquadOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.gain, self.central_freq, self.Q)
        return cde.BassBiquadOperation(*settings)
class Biquad(TensorOperation):
    """
    Apply a biquad filter to an audio waveform.

    NOTE(review): unlike most operations in this module this class derives
    from ``TensorOperation`` rather than ``AudioTensorOperation``, so the
    NumPy-only input check of ``AudioTensorOperation.__call__`` is not
    applied. Confirm whether that is intentional.

    The constructor is wrapped by the ``check_biquad`` validator.

    Args:
        b0: Filter coefficient b0.
        b1: Filter coefficient b1.
        b2: Filter coefficient b2.
        a0: Filter coefficient a0.
        a1: Filter coefficient a1.
        a2: Filter coefficient a2.
    """

    @check_biquad
    def __init__(self, b0, b1, b2, a0, a1, a2):
        self.b0, self.b1, self.b2 = b0, b1, b2
        self.a0, self.a1, self.a2 = a0, a1, a2

    def parse(self):
        """Return the ``cde.BiquadOperation`` built from the six coefficients."""
        coefficients = (self.b0, self.b1, self.b2, self.a0, self.a1, self.a2)
        return cde.BiquadOperation(*coefficients)
class ComplexNorm(AudioTensorOperation):
    """
    Compute the norm of a complex sequence.

    The constructor is wrapped by the ``check_complex_norm`` validator.

    Args:
        power: Exponent of the norm. Default: 1.0.
    """

    @check_complex_norm
    def __init__(self, power=1.0):
        self.power = power

    def parse(self):
        """Return the ``cde.ComplexNormOperation`` for the stored exponent."""
        operation = cde.ComplexNormOperation(self.power)
        return operation
# Python-side BorderType members mapped to their C++ (cde) counterparts.
DE_C_BORDER_TYPE = {
    BorderType.CONSTANT: cde.BorderType.DE_BORDER_CONSTANT,
    BorderType.EDGE: cde.BorderType.DE_BORDER_EDGE,
    BorderType.REFLECT: cde.BorderType.DE_BORDER_REFLECT,
    BorderType.SYMMETRIC: cde.BorderType.DE_BORDER_SYMMETRIC,
}
class ComputeDeltas(AudioTensorOperation):
    """
    Compute the delta coefficients (also called differential coefficients) of
    a spectrum.

    The constructor is wrapped by the ``check_compute_deltas`` validator.

    Args:
        win_length: Window length; stored as ``self.win_len``. Default: 5.
        pad_mode: Padding mode (a ``BorderType`` member).
            Default: ``BorderType.EDGE``.
    """

    @check_compute_deltas
    def __init__(self, win_length=5, pad_mode=BorderType.EDGE):
        self.win_len, self.pad_mode = win_length, pad_mode

    def parse(self):
        """Return the ``cde.ComputeDeltasOperation`` built from the stored settings."""
        # Translate the Python enum member into the C++ enum before crossing the boundary.
        c_pad_mode = DE_C_BORDER_TYPE[self.pad_mode]
        return cde.ComputeDeltasOperation(self.win_len, c_pad_mode)
class Contrast(AudioTensorOperation):
    """
    Apply a contrast-enhancement effect to an audio waveform.

    The constructor is wrapped by the ``check_contrast`` validator.

    Args:
        enhancement_amount: Amount of enhancement. Default: 75.0.
    """

    @check_contrast
    def __init__(self, enhancement_amount=75.0):
        self.enhancement_amount = enhancement_amount

    def parse(self):
        """Return the ``cde.ContrastOperation`` for the stored enhancement amount."""
        operation = cde.ContrastOperation(self.enhancement_amount)
        return operation
class DBToAmplitude(AudioTensorOperation):
    """
    Convert an audio waveform from the decibel scale to power or amplitude.

    The constructor is wrapped by the ``check_db_to_amplitude`` validator.

    Args:
        ref: Reference value.
        power: Forwarded unchanged to the C++ operation; presumably the
            exponent used in the conversion (TODO confirm).
    """

    @check_db_to_amplitude
    def __init__(self, ref, power):
        self.ref, self.power = ref, power

    def parse(self):
        """Return the ``cde.DBToAmplitudeOperation`` built from the stored settings."""
        operation = cde.DBToAmplitudeOperation(self.ref, self.power)
        return operation
class DCShift(AudioTensorOperation):
    """
    Apply a DC shift to the input audio waveform.

    The constructor is wrapped by the ``check_dc_shift`` validator.

    Args:
        shift: Amount of DC shift to apply.
        limiter_gain: Gain of the limiter. Default: None, in which case the
            value of ``shift`` is forwarded in its place.
    """

    @check_dc_shift
    def __init__(self, shift, limiter_gain=None):
        self.shift = shift
        # Fall back to `shift` only when limiter_gain was omitted. A truthiness
        # test would also discard an explicitly requested gain of 0.0.
        self.limiter_gain = shift if limiter_gain is None else limiter_gain

    def parse(self):
        """Return the ``cde.DCShiftOperation`` built from the stored settings."""
        return cde.DCShiftOperation(self.shift, self.limiter_gain)
class DeemphBiquad(AudioTensorOperation):
    """
    Apply the CD (IEC 60908) de-emphasis effect, a treble-attenuating shelving
    filter, to an audio waveform.

    The constructor is wrapped by the ``check_deemph_biquad`` validator.

    Args:
        sample_rate: Sampling frequency of the waveform.
    """

    @check_deemph_biquad
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def parse(self):
        """Return the ``cde.DeemphBiquadOperation`` for the stored sampling rate."""
        operation = cde.DeemphBiquadOperation(self.sample_rate)
        return operation
class DetectPitchFrequency(AudioTensorOperation):
    """
    Detect the pitch frequency of an audio waveform.

    The constructor is wrapped by the ``check_detect_pitch_frequency`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        frame_time: Duration of a frame. Default: 0.01.
        win_length: Window length. Default: 30.
        freq_low: Lowest frequency that can be detected. Default: 85.
        freq_high: Highest frequency that can be detected. Default: 3400.
    """

    @check_detect_pitch_frequency
    def __init__(self, sample_rate, frame_time=0.01, win_length=30, freq_low=85, freq_high=3400):
        self.sample_rate, self.frame_time, self.win_length = sample_rate, frame_time, win_length
        self.freq_low, self.freq_high = freq_low, freq_high

    def parse(self):
        """Return the ``cde.DetectPitchFrequencyOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.frame_time, self.win_length, self.freq_low, self.freq_high)
        return cde.DetectPitchFrequencyOperation(*settings)
# Python-side DensityFunction members mapped to their C++ (cde) counterparts.
DE_C_DENSITY_FUNCTION = {
    DensityFunction.TPDF: cde.DensityFunction.DE_DENSITY_FUNCTION_TPDF,
    DensityFunction.RPDF: cde.DensityFunction.DE_DENSITY_FUNCTION_RPDF,
    DensityFunction.GPDF: cde.DensityFunction.DE_DENSITY_FUNCTION_GPDF,
}
class Dither(AudioTensorOperation):
    """
    Dither an audio waveform: increase the perceived dynamic range of audio
    stored at a particular bit depth by eliminating nonlinear truncation
    distortion.

    The constructor is wrapped by the ``check_dither`` validator.

    Args:
        density_function: Density function used for the dither noise (a
            ``DensityFunction`` member). Default: ``DensityFunction.TPDF``.
        noise_shaping: Whether to apply noise shaping. Default: False.
    """

    @check_dither
    def __init__(self, density_function=DensityFunction.TPDF, noise_shaping=False):
        self.density_function, self.noise_shaping = density_function, noise_shaping

    def parse(self):
        """Return the ``cde.DitherOperation`` built from the stored settings."""
        # Translate the Python enum member into the C++ enum before crossing the boundary.
        c_density = DE_C_DENSITY_FUNCTION[self.density_function]
        return cde.DitherOperation(c_density, self.noise_shaping)
class EqualizerBiquad(AudioTensorOperation):
    """
    Apply a biquad equalizer filter to an audio waveform.

    The constructor is wrapped by the ``check_equalizer_biquad`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        center_freq: Central frequency.
        gain: Gain applied by the filter.
        Q: Forwarded unchanged to the C++ operation; presumably the filter
            quality factor (TODO confirm). Default: 0.707.
    """

    @check_equalizer_biquad
    def __init__(self, sample_rate, center_freq, gain, Q=0.707):
        self.sample_rate, self.center_freq = sample_rate, center_freq
        self.gain, self.Q = gain, Q

    def parse(self):
        """Return the ``cde.EqualizerBiquadOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.center_freq, self.gain, self.Q)
        return cde.EqualizerBiquadOperation(*settings)
# Python-side FadeShape members mapped to their C++ (cde) counterparts.
DE_C_FADE_SHAPE = {
    FadeShape.QUARTER_SINE: cde.FadeShape.DE_FADE_SHAPE_QUARTER_SINE,
    FadeShape.HALF_SINE: cde.FadeShape.DE_FADE_SHAPE_HALF_SINE,
    FadeShape.LINEAR: cde.FadeShape.DE_FADE_SHAPE_LINEAR,
    FadeShape.LOGARITHMIC: cde.FadeShape.DE_FADE_SHAPE_LOGARITHMIC,
    FadeShape.EXPONENTIAL: cde.FadeShape.DE_FADE_SHAPE_EXPONENTIAL,
}
class Fade(AudioTensorOperation):
    """
    Add a fade-in and/or a fade-out to an audio waveform.

    The constructor is wrapped by the ``check_fade`` validator.

    Args:
        fade_in_len: Length of the fade-in. Default: 0.
        fade_out_len: Length of the fade-out. Default: 0.
        fade_shape: Shape of the fade (a ``FadeShape`` member).
            Default: ``FadeShape.LINEAR``.
    """

    @check_fade
    def __init__(self, fade_in_len=0, fade_out_len=0, fade_shape=FadeShape.LINEAR):
        self.fade_in_len, self.fade_out_len, self.fade_shape = fade_in_len, fade_out_len, fade_shape

    def parse(self):
        """Return the ``cde.FadeOperation`` built from the stored settings."""
        # Translate the Python enum member into the C++ enum before crossing the boundary.
        c_shape = DE_C_FADE_SHAPE[self.fade_shape]
        return cde.FadeOperation(self.fade_in_len, self.fade_out_len, c_shape)
# Python-side Modulation members mapped to their C++ (cde) counterparts.
DE_C_MODULATION = {
    Modulation.SINUSOIDAL: cde.Modulation.DE_MODULATION_SINUSOIDAL,
    Modulation.TRIANGULAR: cde.Modulation.DE_MODULATION_TRIANGULAR,
}

# Python-side Interpolation members mapped to their C++ (cde) counterparts.
DE_C_INTERPOLATION = {
    Interpolation.LINEAR: cde.Interpolation.DE_INTERPOLATION_LINEAR,
    Interpolation.QUADRATIC: cde.Interpolation.DE_INTERPOLATION_QUADRATIC,
}
class Flanger(AudioTensorOperation):
    """
    Apply a flanger effect to an audio waveform.

    The constructor is wrapped by the ``check_flanger`` validator. All numeric
    arguments are forwarded unchanged to ``cde.FlangerOperation``.

    Args:
        sample_rate: Sampling frequency of the waveform.
        delay: Delay setting. Default: 0.0.
        depth: Depth setting. Default: 2.0.
        regen: Regeneration setting. Default: 0.0.
        width: Width setting. Default: 71.0.
        speed: Speed setting. Default: 0.5.
        phase: Phase setting. Default: 25.0.
        modulation: Modulation kind (a ``Modulation`` member).
            Default: ``Modulation.SINUSOIDAL``.
        interpolation: Interpolation kind (an ``Interpolation`` member).
            Default: ``Interpolation.LINEAR``.
    """

    @check_flanger
    def __init__(self, sample_rate, delay=0.0, depth=2.0, regen=0.0, width=71.0, speed=0.5,
                 phase=25.0, modulation=Modulation.SINUSOIDAL, interpolation=Interpolation.LINEAR):
        self.sample_rate, self.delay, self.depth = sample_rate, delay, depth
        self.regen, self.width, self.speed = regen, width, speed
        self.phase, self.modulation, self.interpolation = phase, modulation, interpolation

    def parse(self):
        """Return the ``cde.FlangerOperation`` built from the stored settings."""
        # Translate both Python enum members into their C++ enums before crossing the boundary.
        c_modulation = DE_C_MODULATION[self.modulation]
        c_interpolation = DE_C_INTERPOLATION[self.interpolation]
        return cde.FlangerOperation(self.sample_rate, self.delay, self.depth, self.regen, self.width,
                                    self.speed, self.phase, c_modulation, c_interpolation)
class FrequencyMasking(AudioTensorOperation):
    """
    Apply a frequency-domain mask to an audio waveform.

    The constructor is wrapped by the ``check_masking`` validator.

    Args:
        iid_masks: Whether to use IID masks. Default: False.
        freq_mask_param: Frequency mask parameter; stored as
            ``self.frequency_mask_param``. Default: 0.
        mask_start: Start position of the mask. Default: 0.
        mask_value: Value written into the masked region. Default: 0.0.
    """

    @check_masking
    def __init__(self, iid_masks=False, freq_mask_param=0, mask_start=0, mask_value=0.0):
        self.iid_masks, self.frequency_mask_param = iid_masks, freq_mask_param
        self.mask_start, self.mask_value = mask_start, mask_value

    def parse(self):
        """Return the ``cde.FrequencyMaskingOperation`` built from the stored settings."""
        settings = (self.iid_masks, self.frequency_mask_param, self.mask_start, self.mask_value)
        return cde.FrequencyMaskingOperation(*settings)
class Gain(AudioTensorOperation):
    """
    Amplify or attenuate a whole audio waveform.

    The constructor is wrapped by the ``check_gain`` validator.

    Args:
        gain_db: Gain value. Default: 1.0.
    """

    @check_gain
    def __init__(self, gain_db=1.0):
        self.gain_db = gain_db

    def parse(self):
        """Return the ``cde.GainOperation`` for the stored gain."""
        operation = cde.GainOperation(self.gain_db)
        return operation
class GriffinLim(AudioTensorOperation):
    """
    Compute a signal waveform from a linear-magnitude spectrogram using the
    Griffin-Lim algorithm.

    The constructor is wrapped by the ``check_griffin_lim`` validator.

    Args:
        n_fft: FFT size. Default: 400.
        n_iter: Number of iterations. Default: 32.
        win_length: Window length. Default: None; a falsy value (None or 0)
            is replaced by ``n_fft``.
        hop_length: Hop length between windows. Default: None; a falsy value
            (None or 0) is replaced by ``win_length // 2``.
        window_type: Window kind (a ``WindowType`` member).
            Default: ``WindowType.HANN``.
        power: Forwarded unchanged to the C++ operation. Default: 2.
        momentum: Forwarded unchanged to the C++ operation. Default: 0.99.
        length: Expected output length. Default: None; a falsy value is
            forwarded as 0.
        rand_init: Whether to use random initialisation. Default: True.
    """

    @check_griffin_lim
    def __init__(self, n_fft=400, n_iter=32, win_length=None, hop_length=None, window_type=WindowType.HANN,
                 power=2, momentum=0.99, length=None, rand_init=True):
        self.n_fft, self.n_iter = n_fft, n_iter
        # Falsy (None or 0) sizes fall back to values derived from n_fft.
        self.win_length = win_length or self.n_fft
        self.hop_length = hop_length or self.win_length // 2
        self.window_type, self.power, self.momentum = window_type, power, momentum
        self.length = length or 0
        self.rand_init = rand_init

    def parse(self):
        """Return the ``cde.GriffinLimOperation`` built from the stored settings."""
        # DE_C_WINDOW_TYPE is defined further down the module; it exists by the time parse() runs.
        c_window = DE_C_WINDOW_TYPE.get(self.window_type)
        return cde.GriffinLimOperation(self.n_fft, self.n_iter, self.win_length, self.hop_length,
                                       c_window, self.power, self.momentum, self.length,
                                       self.rand_init)
class HighpassBiquad(AudioTensorOperation):
    """
    Apply a biquad high-pass filter to an audio waveform.

    The constructor is wrapped by the ``check_highpass_biquad`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        cutoff_freq: Cut-off frequency of the filter.
        Q: Forwarded unchanged to the C++ operation; presumably the filter
            quality factor (TODO confirm). Default: 0.707.
    """

    @check_highpass_biquad
    def __init__(self, sample_rate, cutoff_freq, Q=0.707):
        self.sample_rate, self.cutoff_freq, self.Q = sample_rate, cutoff_freq, Q

    def parse(self):
        """Return the ``cde.HighpassBiquadOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.cutoff_freq, self.Q)
        return cde.HighpassBiquadOperation(*settings)
class LFilter(AudioTensorOperation):
    """
    Apply an IIR filter, defined by the given difference-equation
    coefficients, to an audio waveform.

    The constructor is wrapped by the ``check_lfilter`` validator.

    Args:
        a_coeffs: Sequence of "a" coefficients, forwarded unchanged to the
            C++ operation.
        b_coeffs: Sequence of "b" coefficients, forwarded unchanged to the
            C++ operation.
        clamp: Flag forwarded to the C++ operation; presumably enables
            clamping of the output (TODO confirm). Default: True.
    """

    @check_lfilter
    def __init__(self, a_coeffs, b_coeffs, clamp=True):
        self.a_coeffs, self.b_coeffs, self.clamp = a_coeffs, b_coeffs, clamp

    def parse(self):
        """Return the ``cde.LFilterOperation`` built from the stored settings."""
        operation = cde.LFilterOperation(self.a_coeffs, self.b_coeffs, self.clamp)
        return operation
class LowpassBiquad(AudioTensorOperation):
    """
    Apply a two-pole low-pass filter to an audio waveform.

    The constructor is wrapped by the ``check_lowpass_biquad`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        cutoff_freq: Cut-off frequency of the filter.
        Q: Forwarded unchanged to the C++ operation; presumably the filter
            quality factor (TODO confirm). Default: 0.707.
    """

    @check_lowpass_biquad
    def __init__(self, sample_rate, cutoff_freq, Q=0.707):
        self.sample_rate, self.cutoff_freq, self.Q = sample_rate, cutoff_freq, Q

    def parse(self):
        """Return the ``cde.LowpassBiquadOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.cutoff_freq, self.Q)
        return cde.LowpassBiquadOperation(*settings)
class Magphase(AudioTensorOperation):
    """
    Separate a complex-valued spectrogram into its magnitude and phase.

    The input is described as having shape (..., 2) by the original comment;
    the shape is not checked in this class.

    The constructor is wrapped by the ``check_magphase`` validator.

    Args:
        power: Forwarded unchanged to the C++ operation; presumably the
            exponent of the norm (TODO confirm). Default: 1.0.
    """

    @check_magphase
    def __init__(self, power=1.0):
        self.power = power

    def parse(self):
        """Return the ``cde.MagphaseOperation`` for the stored power."""
        operation = cde.MagphaseOperation(self.power)
        return operation
class MaskAlongAxis(AudioTensorOperation):
    """
    Apply a mask to an audio waveform along one axis: starting at
    ``mask_start``, ``mask_width`` entries are set to ``mask_value``.

    The constructor is wrapped by the ``check_mask_along_axis`` validator.

    Args:
        mask_start: Start position of the mask.
        mask_width: Width of the mask.
        mask_value: Value written into the masked region.
        axis: Axis along which the mask is applied.
    """

    @check_mask_along_axis
    def __init__(self, mask_start, mask_width, mask_value, axis):
        self.mask_start, self.mask_width = mask_start, mask_width
        self.mask_value, self.axis = mask_value, axis

    def parse(self):
        """Return the ``cde.MaskAlongAxisOperation`` built from the stored settings."""
        settings = (self.mask_start, self.mask_width, self.mask_value, self.axis)
        return cde.MaskAlongAxisOperation(*settings)
class MaskAlongAxisIID(AudioTensorOperation):
    """
    Apply a mask to an audio waveform along the given axis.

    The constructor is wrapped by the ``check_mask_along_axis_iid`` validator.

    Args:
        mask_param: Mask parameter, forwarded unchanged to the C++ operation.
        mask_value: Value written into the masked region.
        axis: Axis along which the mask is applied.
    """

    @check_mask_along_axis_iid
    def __init__(self, mask_param, mask_value, axis):
        self.mask_param, self.mask_value, self.axis = mask_param, mask_value, axis

    def parse(self):
        """Return the ``cde.MaskAlongAxisIIDOperation`` built from the stored settings."""
        operation = cde.MaskAlongAxisIIDOperation(self.mask_param, self.mask_value, self.axis)
        return operation
# Python-side MelType members mapped to their C++ (cde) counterparts.
DE_C_MEL_TYPE = {
    MelType.SLANEY: cde.MelType.DE_MEL_TYPE_SLANEY,
    MelType.HTK: cde.MelType.DE_MEL_TYPE_HTK,
}

# Python-side NormType members mapped to their C++ (cde) counterparts.
DE_C_NORM_TYPE = {
    NormType.NONE: cde.NormType.DE_NORM_TYPE_NONE,
    NormType.SLANEY: cde.NormType.DE_NORM_TYPE_SLANEY,
}
class MelScale(AudioTensorOperation):
    """
    Convert a normal STFT into a mel-scale STFT.

    The constructor is wrapped by the ``check_mel_scale`` validator.

    Args:
        n_mels: Number of mel bins. Default: 128.
        sample_rate: Sampling rate of the audio. Default: 16000.
        f_min: Minimum frequency. Default: 0.
        f_max: Maximum frequency. Default: None, in which case
            ``sample_rate // 2`` is used.
        n_stft: Number of STFT bins. Default: 201.
        norm: Normalisation kind (a ``NormType`` member).
            Default: ``NormType.NONE``.
        mel_type: Mel scale kind (a ``MelType`` member).
            Default: ``MelType.HTK``.
    """

    @check_mel_scale
    def __init__(self, n_mels=128, sample_rate=16000, f_min=0, f_max=None, n_stft=201, norm=NormType.NONE,
                 mel_type=MelType.HTK):
        self.n_mels, self.sample_rate, self.f_min = n_mels, sample_rate, f_min
        if f_max is None:
            # No upper bound given: use half the sampling rate.
            self.f_max = sample_rate // 2
        else:
            self.f_max = f_max
        self.n_stft, self.norm, self.mel_type = n_stft, norm, mel_type

    def parse(self):
        """Return the ``cde.MelScaleOperation`` built from the stored settings."""
        # Translate both Python enum members into their C++ enums before crossing the boundary.
        c_norm = DE_C_NORM_TYPE[self.norm]
        c_mel_type = DE_C_MEL_TYPE[self.mel_type]
        return cde.MelScaleOperation(self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_stft,
                                     c_norm, c_mel_type)
class MuLawDecoding(AudioTensorOperation):
    """
    Decode a mu-law encoded signal (see the mu-law algorithm).

    The constructor is wrapped by the ``check_mu_law_coding`` validator.

    Args:
        quantization_channels: Number of quantization channels. Default: 256.
    """

    @check_mu_law_coding
    def __init__(self, quantization_channels=256):
        self.quantization_channels = quantization_channels

    def parse(self):
        """Return the ``cde.MuLawDecodingOperation`` for the stored channel count."""
        operation = cde.MuLawDecodingOperation(self.quantization_channels)
        return operation
class MuLawEncoding(AudioTensorOperation):
    """
    Encode a signal based on mu-law companding.

    The constructor is wrapped by the ``check_mu_law_coding`` validator.

    Args:
        quantization_channels: Number of quantization channels. Default: 256.
    """

    @check_mu_law_coding
    def __init__(self, quantization_channels=256):
        self.quantization_channels = quantization_channels

    def parse(self):
        """Return the ``cde.MuLawEncodingOperation`` for the stored channel count."""
        operation = cde.MuLawEncodingOperation(self.quantization_channels)
        return operation
class Overdrive(AudioTensorOperation):
    """
    Apply an overdrive effect to an audio waveform.

    The constructor is wrapped by the ``check_overdrive`` validator.

    Args:
        gain: Gain setting, forwarded unchanged to the C++ operation.
            Default: 20.0.
        color: Colour setting, forwarded unchanged to the C++ operation.
            Default: 20.0.
    """

    @check_overdrive
    def __init__(self, gain=20.0, color=20.0):
        self.gain, self.color = gain, color

    def parse(self):
        """Return the ``cde.OverdriveOperation`` built from the stored settings."""
        operation = cde.OverdriveOperation(self.gain, self.color)
        return operation
class Phaser(AudioTensorOperation):
    """
    Apply a phasing effect to an audio waveform.

    The constructor is wrapped by the ``check_phaser`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        gain_in: Input gain. Default: 0.4.
        gain_out: Output gain. Default: 0.74.
        delay_ms: Delay in milliseconds. Default: 3.0.
        decay: Decay coefficient. Default: 0.4.
        mod_speed: Modulation speed. Default: 0.5.
        sinusoidal: Flag forwarded to the C++ operation; presumably selects
            sinusoidal modulation (TODO confirm). Default: True.
    """

    @check_phaser
    def __init__(self, sample_rate, gain_in=0.4, gain_out=0.74,
                 delay_ms=3.0, decay=0.4, mod_speed=0.5, sinusoidal=True):
        self.decay, self.delay_ms = decay, delay_ms
        self.gain_in, self.gain_out = gain_in, gain_out
        self.mod_speed, self.sample_rate, self.sinusoidal = mod_speed, sample_rate, sinusoidal

    def parse(self):
        """Return the ``cde.PhaserOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.gain_in, self.gain_out,
                    self.delay_ms, self.decay, self.mod_speed, self.sinusoidal)
        return cde.PhaserOperation(*settings)
class PhaseVocoder(AudioTensorOperation):
    """
    Speed up a given STFT spectrogram by a rate without changing the pitch.

    The constructor is wrapped by the ``check_phase_vocoder`` validator.

    Args:
        rate: Speed-up rate.
        phase_advance: Phase advance data; converted to a ``cde.Tensor`` at
            construction time.
    """

    @check_phase_vocoder
    def __init__(self, rate, phase_advance):
        self.rate = rate
        # Wrap once here so parse() can hand the C++ tensor over directly.
        advance_tensor = cde.Tensor(phase_advance)
        self.phase_advance = advance_tensor

    def parse(self):
        """Return the ``cde.PhaseVocoderOperation`` built from the stored settings."""
        operation = cde.PhaseVocoderOperation(self.rate, self.phase_advance)
        return operation
class RiaaBiquad(AudioTensorOperation):
    """
    Apply RIAA equalization to the input audio waveform.

    The constructor is wrapped by the ``check_riaa_biquad`` validator.

    Args:
        sample_rate: Sampling frequency of the waveform.
    """

    @check_riaa_biquad
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def parse(self):
        """Return the ``cde.RiaaBiquadOperation`` for the stored sampling rate."""
        operation = cde.RiaaBiquadOperation(self.sample_rate)
        return operation
class SlidingWindowCmn(AudioTensorOperation):
    """
    Apply sliding-window cepstral mean (and optionally variance)
    normalization per utterance.

    The constructor is wrapped by the ``check_sliding_window_cmn`` validator.

    Args:
        cmn_window: Length of the sliding window. Default: 600.
        min_cmn_window: Minimum window length. Default: 100.
        center: Flag forwarded to the C++ operation; presumably centres the
            window on the current frame (TODO confirm). Default: False.
        norm_vars: Flag forwarded to the C++ operation; presumably enables
            variance normalization (TODO confirm). Default: False.
    """

    @check_sliding_window_cmn
    def __init__(self, cmn_window=600, min_cmn_window=100, center=False, norm_vars=False):
        self.cmn_window, self.min_cmn_window = cmn_window, min_cmn_window
        self.center, self.norm_vars = center, norm_vars

    def parse(self):
        """Return the ``cde.SlidingWindowCmnOperation`` built from the stored settings."""
        settings = (self.cmn_window, self.min_cmn_window, self.center, self.norm_vars)
        return cde.SlidingWindowCmnOperation(*settings)
# Python-side WindowType members mapped to their C++ (cde) counterparts.
# Looked up by GriffinLim, SpectralCentroid and Spectrogram in their parse() methods.
DE_C_WINDOW_TYPE = {
    WindowType.BARTLETT: cde.WindowType.DE_WINDOW_TYPE_BARTLETT,
    WindowType.BLACKMAN: cde.WindowType.DE_WINDOW_TYPE_BLACKMAN,
    WindowType.HAMMING: cde.WindowType.DE_WINDOW_TYPE_HAMMING,
    WindowType.HANN: cde.WindowType.DE_WINDOW_TYPE_HANN,
    WindowType.KAISER: cde.WindowType.DE_WINDOW_TYPE_KAISER,
}
class SpectralCentroid(TensorOperation):
    """
    Compute the spectral centroid of each channel along the time axis.

    NOTE(review): this class derives from ``TensorOperation`` rather than
    ``AudioTensorOperation``, so the NumPy-only input check is not applied.
    Confirm whether that is intentional.

    The constructor is wrapped by the ``check_spectral_centroid`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        n_fft: FFT size. Default: 400.
        win_length: Window length. Default: None; a falsy value (None or 0)
            is replaced by ``n_fft``.
        hop_length: Hop length between windows. Default: None; a falsy value
            (None or 0) is replaced by ``win_length // 2``.
        pad: Padding amount. Default: 0.
        window: Window kind (a ``WindowType`` member).
            Default: ``WindowType.HANN``.
    """

    @check_spectral_centroid
    def __init__(self, sample_rate, n_fft=400, win_length=None, hop_length=None, pad=0, window=WindowType.HANN):
        self.sample_rate, self.pad, self.window, self.n_fft = sample_rate, pad, window, n_fft
        # Falsy (None or 0) sizes fall back to values derived from n_fft.
        self.win_length = win_length or n_fft
        self.hop_length = hop_length or self.win_length // 2

    def parse(self):
        """Return the ``cde.SpectralCentroidOperation`` built from the stored settings."""
        # Translate the Python enum member into the C++ enum before crossing the boundary.
        c_window = DE_C_WINDOW_TYPE[self.window]
        return cde.SpectralCentroidOperation(self.sample_rate, self.n_fft, self.win_length, self.hop_length,
                                             self.pad, c_window)
class Spectrogram(TensorOperation):
    """
    Create a spectrogram from an audio signal.

    NOTE(review): this class derives from ``TensorOperation`` rather than
    ``AudioTensorOperation``, so the NumPy-only input check is not applied.
    Confirm whether that is intentional.

    The constructor is wrapped by the ``check_spectrogram`` validator.

    Args:
        n_fft: FFT size. Default: 400.
        win_length: Window length. Default: None; a falsy value (None or 0)
            is replaced by ``n_fft``.
        hop_length: Hop length between windows. Default: None; a falsy value
            (None or 0) is replaced by ``win_length // 2``.
        pad: Padding amount. Default: 0.
        window: Window kind (a ``WindowType`` member).
            Default: ``WindowType.HANN``.
        power: Forwarded unchanged to the C++ operation; presumably the
            exponent of the magnitude spectrogram (TODO confirm). Default: 2.0.
        normalized: Flag forwarded to the C++ operation. Default: False.
        center: Flag forwarded to the C++ operation. Default: True.
        pad_mode: Padding mode (a ``BorderType`` member).
            Default: ``BorderType.REFLECT``.
        onesided: Flag forwarded to the C++ operation. Default: True.
    """

    @check_spectrogram
    def __init__(self, n_fft=400, win_length=None, hop_length=None, pad=0, window=WindowType.HANN, power=2.0,
                 normalized=False, center=True, pad_mode=BorderType.REFLECT, onesided=True):
        self.n_fft = n_fft
        # Falsy (None or 0) sizes fall back to values derived from n_fft.
        self.win_length = win_length or n_fft
        self.hop_length = hop_length or self.win_length // 2
        self.pad, self.window, self.power = pad, window, power
        self.normalized, self.center = normalized, center
        self.pad_mode, self.onesided = pad_mode, onesided

    def parse(self):
        """Return the ``cde.SpectrogramOperation`` built from the stored settings."""
        # Translate both Python enum members into their C++ enums before crossing the boundary.
        c_window = DE_C_WINDOW_TYPE[self.window]
        c_pad_mode = DE_C_BORDER_TYPE[self.pad_mode]
        return cde.SpectrogramOperation(self.n_fft, self.win_length, self.hop_length, self.pad,
                                        c_window, self.power, self.normalized,
                                        self.center, c_pad_mode, self.onesided)
class TimeMasking(AudioTensorOperation):
    """
    Apply a time-domain mask to an audio waveform.

    The constructor is wrapped by the ``check_masking`` validator.

    Args:
        iid_masks: Whether to use IID masks. Default: False.
        time_mask_param: Time mask parameter. Default: 0.
        mask_start: Start position of the mask. Default: 0.
        mask_value: Value written into the masked region. Default: 0.0.
    """

    @check_masking
    def __init__(self, iid_masks=False, time_mask_param=0, mask_start=0, mask_value=0.0):
        self.iid_masks, self.time_mask_param = iid_masks, time_mask_param
        self.mask_start, self.mask_value = mask_start, mask_value

    def parse(self):
        """Return the ``cde.TimeMaskingOperation`` built from the stored settings."""
        settings = (self.iid_masks, self.time_mask_param, self.mask_start, self.mask_value)
        return cde.TimeMaskingOperation(*settings)
class TimeStretch(AudioTensorOperation):
    """
    Stretch the time axis of a Short Time Fourier Transform (STFT)
    spectrogram by a given rate without changing the pitch of the audio.

    The constructor is wrapped by the ``check_time_stretch`` validator.

    Args:
        hop_length: Hop length between STFT windows. Default: None, in which
            case ``n_fft // 2`` is used, where ``n_fft = (n_freq - 1) * 2``.
        n_freq: Number of frequency bins of the STFT. Default: 201.
        fixed_rate: Stretch rate. Default: None, in which case 1 is used.
    """

    @check_time_stretch
    def __init__(self, hop_length=None, n_freq=201, fixed_rate=None):
        self.n_freq = n_freq
        # Resolve the default in a single assignment; the original stored the raw
        # value first and then immediately overwrote it.
        self.fixed_rate = fixed_rate if fixed_rate is not None else 1
        # FFT size implied by the number of frequency bins.
        n_fft = (n_freq - 1) * 2
        self.hop_length = hop_length if hop_length is not None else n_fft // 2

    def parse(self):
        """Return the ``cde.TimeStretchOperation`` built from the stored settings."""
        return cde.TimeStretchOperation(self.hop_length, self.n_freq, self.fixed_rate)
class TrebleBiquad(AudioTensorOperation):
    """
    Apply a treble tone-control effect to an audio waveform.

    The constructor is wrapped by the ``check_treble_biquad`` validator.

    Args:
        sample_rate: Sampling rate of the waveform.
        gain: Gain applied by the filter.
        central_freq: Central frequency. Default: 3000.
        Q: Forwarded unchanged to the C++ operation; presumably the filter
            quality factor (TODO confirm). Default: 0.707.
    """

    @check_treble_biquad
    def __init__(self, sample_rate, gain, central_freq=3000, Q=0.707):
        self.sample_rate, self.gain = sample_rate, gain
        self.central_freq, self.Q = central_freq, Q

    def parse(self):
        """Return the ``cde.TrebleBiquadOperation`` built from the stored settings."""
        settings = (self.sample_rate, self.gain, self.central_freq, self.Q)
        return cde.TrebleBiquadOperation(*settings)
# Python-side GainType members mapped to their C++ (cde) counterparts.
DE_C_GAIN_TYPE = {
    GainType.AMPLITUDE: cde.GainType.DE_GAIN_TYPE_AMPLITUDE,
    GainType.POWER: cde.GainType.DE_GAIN_TYPE_POWER,
    GainType.DB: cde.GainType.DE_GAIN_TYPE_DB,
}
class Vol(AudioTensorOperation):
    """
    Adjust the volume of an audio waveform.

    The constructor is wrapped by the ``check_vol`` validator.

    Args:
        gain: Gain value; how it is interpreted depends on ``gain_type``.
        gain_type: Kind of gain (a ``GainType`` member).
            Default: ``GainType.AMPLITUDE``.
    """

    @check_vol
    def __init__(self, gain, gain_type=GainType.AMPLITUDE):
        self.gain, self.gain_type = gain, gain_type

    def parse(self):
        """Return the ``cde.VolOperation`` built from the stored settings."""
        # Translate the Python enum member into the C++ enum before crossing the boundary.
        c_gain_type = DE_C_GAIN_TYPE[self.gain_type]
        return cde.VolOperation(self.gain, c_gain_type)