You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

transforms.py 46 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """
  16. The module audio.transforms is inherited from _c_dataengine and is
  17. implemented based on C++. It's a high performance module to process
  18. audio. Users can apply suitable augmentations on audio data to improve
  19. their training models.
  20. """
  21. import numpy as np
  22. import mindspore._c_dataengine as cde
  23. from ..transforms.c_transforms import TensorOperation
  24. from .utils import BorderType, FadeShape, GainType, Interpolation, Modulation, ScaleType
  25. from .validators import check_allpass_biquad, check_amplitude_to_db, check_band_biquad, check_bandpass_biquad, \
  26. check_bandreject_biquad, check_bass_biquad, check_biquad, check_complex_norm, check_compute_deltas, \
  27. check_contrast, check_db_to_amplitude, check_dc_shift, check_deemph_biquad, check_detect_pitch_frequency, \
  28. check_equalizer_biquad, check_fade, check_flanger, check_highpass_biquad, check_lfilter, check_lowpass_biquad, \
  29. check_magphase, check_masking, check_mu_law_coding, check_overdrive, check_phaser, check_riaa_biquad, \
  30. check_sliding_window_cmn, check_time_stretch, check_treble_biquad, check_vol
  31. class AudioTensorOperation(TensorOperation):
  32. """
  33. Base class of Audio Tensor Ops.
  34. """
  35. def __call__(self, *input_tensor_list):
  36. for tensor in input_tensor_list:
  37. if not isinstance(tensor, (np.ndarray,)):
  38. raise TypeError("Input should be NumPy audio, got {}.".format(type(tensor)))
  39. return super().__call__(*input_tensor_list)
  40. def parse(self):
  41. raise NotImplementedError("AudioTensorOperation has to implement parse() method.")
  42. class AllpassBiquad(AudioTensorOperation):
  43. """
  44. Design two-pole all-pass filter for audio waveform of dimension of (..., time).
  45. Args:
  46. sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  47. central_freq (float): central frequency (in Hz).
  48. Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
  49. Examples:
  50. >>> import numpy as np
  51. >>>
  52. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  53. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  54. >>> transforms = [audio.AllpassBiquad(44100, 200.0)]
  55. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  56. """
  57. @check_allpass_biquad
  58. def __init__(self, sample_rate, central_freq, Q=0.707):
  59. self.sample_rate = sample_rate
  60. self.central_freq = central_freq
  61. self.Q = Q
  62. def parse(self):
  63. return cde.AllpassBiquadOperation(self.sample_rate, self.central_freq, self.Q)
  64. DE_C_SCALETYPE_TYPE = {ScaleType.MAGNITUDE: cde.ScaleType.DE_SCALETYPE_MAGNITUDE,
  65. ScaleType.POWER: cde.ScaleType.DE_SCALETYPE_POWER}
  66. class AmplitudeToDB(AudioTensorOperation):
  67. """
  68. Converts the input tensor from amplitude/power scale to decibel scale.
  69. Args:
  70. stype (ScaleType, optional): Scale of the input tensor (default=ScaleType.POWER).
  71. It can be one of ScaleType.MAGNITUDE or ScaleType.POWER.
  72. ref_value (float, optional): Param for generate db_multiplier.
  73. amin (float, optional): Lower bound to clamp the input waveform. It must be greater than zero.
  74. top_db (float, optional): Minimum cut-off decibels. The range of values is non-negative.
  75. Commonly set at 80 (default=80.0).
  76. Examples:
  77. >>> import numpy as np
  78. >>>
  79. >>> waveform = np.random.random([1, 400//2+1, 30])
  80. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  81. >>> transforms = [audio.AmplitudeToDB(stype=ScaleType.POWER)]
  82. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  83. """
  84. @check_amplitude_to_db
  85. def __init__(self, stype=ScaleType.POWER, ref_value=1.0, amin=1e-10, top_db=80.0):
  86. self.stype = stype
  87. self.ref_value = ref_value
  88. self.amin = amin
  89. self.top_db = top_db
  90. def parse(self):
  91. return cde.AmplitudeToDBOperation(DE_C_SCALETYPE_TYPE[self.stype], self.ref_value, self.amin, self.top_db)
  92. class Angle(AudioTensorOperation):
  93. """
  94. Calculate the angle of the complex number sequence of shape (..., 2).
  95. The first dimension represents the real part while the second represents the imaginary.
  96. Examples:
  97. >>> import numpy as np
  98. >>>
  99. >>> waveform = np.array([[1.43, 5.434], [23.54, 89.38]])
  100. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  101. >>> transforms = [audio.Angle()]
  102. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  103. """
  104. def parse(self):
  105. return cde.AngleOperation()
  106. class BandBiquad(AudioTensorOperation):
  107. """
  108. Design two-pole band filter for audio waveform of dimension of (..., time).
  109. Args:
  110. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  111. central_freq (float): Central frequency (in Hz).
  112. Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
  113. noise (bool, optional) : If True, uses the alternate mode for un-pitched audio (e.g. percussion).
  114. If False, uses mode oriented to pitched audio, i.e. voice, singing, or instrumental music (default=False).
  115. Examples:
  116. >>> import numpy as np
  117. >>>
  118. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  119. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  120. >>> transforms = [audio.BandBiquad(44100, 200.0)]
  121. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  122. """
  123. @check_band_biquad
  124. def __init__(self, sample_rate, central_freq, Q=0.707, noise=False):
  125. self.sample_rate = sample_rate
  126. self.central_freq = central_freq
  127. self.Q = Q
  128. self.noise = noise
  129. def parse(self):
  130. return cde.BandBiquadOperation(self.sample_rate, self.central_freq, self.Q, self.noise)
  131. class BandpassBiquad(AudioTensorOperation):
  132. """
  133. Design two-pole band-pass filter. Similar to SoX implementation.
  134. Args:
  135. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  136. central_freq (float): Central frequency (in Hz).
  137. Q (float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0,1] (default=0.707).
  138. const_skirt_gain (bool, optional) : If True, uses a constant skirt gain (peak gain = Q).
  139. If False, uses a constant 0dB peak gain (default=False).
  140. Examples:
  141. >>> import numpy as np
  142. >>>
  143. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  144. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  145. >>> transforms = [audio.BandpassBiquad(44100, 200.0)]
  146. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  147. """
  148. @check_bandpass_biquad
  149. def __init__(self, sample_rate, central_freq, Q=0.707, const_skirt_gain=False):
  150. self.sample_rate = sample_rate
  151. self.central_freq = central_freq
  152. self.Q = Q
  153. self.const_skirt_gain = const_skirt_gain
  154. def parse(self):
  155. return cde.BandpassBiquadOperation(self.sample_rate, self.central_freq, self.Q, self.const_skirt_gain)
  156. class BandrejectBiquad(AudioTensorOperation):
  157. """
  158. Design two-pole band filter for audio waveform of dimension of (..., time).
  159. Args:
  160. sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  161. central_freq (float): central frequency (in Hz).
  162. Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
  163. Examples:
  164. >>> import numpy as np
  165. >>>
  166. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03],[9.246826171875e-03, 1.0894775390625e-02]])
  167. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  168. >>> transforms = [audio.BandrejectBiquad(44100, 200.0)]
  169. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  170. """
  171. @check_bandreject_biquad
  172. def __init__(self, sample_rate, central_freq, Q=0.707):
  173. self.sample_rate = sample_rate
  174. self.central_freq = central_freq
  175. self.Q = Q
  176. def parse(self):
  177. return cde.BandrejectBiquadOperation(self.sample_rate, self.central_freq, self.Q)
  178. class BassBiquad(AudioTensorOperation):
  179. """
  180. Design a bass tone-control effect for audio waveform of dimension of (..., time).
  181. Args:
  182. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  183. gain (float): Desired gain at the boost (or attenuation) in dB.
  184. central_freq (float): Central frequency (in Hz) (default=100.0).
  185. Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
  186. Examples:
  187. >>> import numpy as np
  188. >>>
  189. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  190. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  191. >>> transforms = [audio.BassBiquad(44100, 100.0)]
  192. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  193. """
  194. @check_bass_biquad
  195. def __init__(self, sample_rate, gain, central_freq=100.0, Q=0.707):
  196. self.sample_rate = sample_rate
  197. self.gain = gain
  198. self.central_freq = central_freq
  199. self.Q = Q
  200. def parse(self):
  201. return cde.BassBiquadOperation(self.sample_rate, self.gain, self.central_freq, self.Q)
  202. class Biquad(TensorOperation):
  203. """
  204. Perform a biquad filter of input tensor.
  205. Args:
  206. b0 (float): Numerator coefficient of current input, x[n].
  207. b1 (float): Numerator coefficient of input one time step ago x[n-1].
  208. b2 (float): Numerator coefficient of input two time steps ago x[n-2].
  209. a0 (float): Denominator coefficient of current output y[n], the value can't be zero, typically 1.
  210. a1 (float): Denominator coefficient of current output y[n-1].
  211. a2 (float): Denominator coefficient of current output y[n-2].
  212. Examples:
  213. >>> import numpy as np
  214. >>>
  215. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  216. >>> biquad_op = audio.Biquad(0.01, 0.02, 0.13, 1, 0.12, 0.3)
  217. >>> waveform_filtered = biquad_op(waveform)
  218. """
  219. @check_biquad
  220. def __init__(self, b0, b1, b2, a0, a1, a2):
  221. self.b0 = b0
  222. self.b1 = b1
  223. self.b2 = b2
  224. self.a0 = a0
  225. self.a1 = a1
  226. self.a2 = a2
  227. def parse(self):
  228. return cde.BiquadOperation(self.b0, self.b1, self.b2, self.a0, self.a1, self.a2)
  229. class ComplexNorm(AudioTensorOperation):
  230. """
  231. Compute the norm of complex tensor input.
  232. Args:
  233. power (float, optional): Power of the norm, which must be non-negative (default=1.0).
  234. Examples:
  235. >>> import numpy as np
  236. >>>
  237. >>> waveform = np.random.random([2, 4, 2])
  238. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  239. >>> transforms = [audio.ComplexNorm()]
  240. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  241. """
  242. @check_complex_norm
  243. def __init__(self, power=1.0):
  244. self.power = power
  245. def parse(self):
  246. return cde.ComplexNormOperation(self.power)
  247. DE_C_BORDER_TYPE = {
  248. BorderType.CONSTANT: cde.BorderType.DE_BORDER_CONSTANT,
  249. BorderType.EDGE: cde.BorderType.DE_BORDER_EDGE,
  250. BorderType.REFLECT: cde.BorderType.DE_BORDER_REFLECT,
  251. BorderType.SYMMETRIC: cde.BorderType.DE_BORDER_SYMMETRIC,
  252. }
  253. class ComputeDeltas(AudioTensorOperation):
  254. """
  255. Compute delta coefficients of a spectrogram.
  256. Args:
  257. win_length (int): The window length used for computing delta, must be no less than 3 (default=5).
  258. mode (BorderType): Mode parameter passed to padding (default=BorderType.EDGE).It can be any of
  259. [BorderType.CONSTANT, BorderType.EDGE, BorderType.REFLECT, BordBorderTypeer.SYMMETRIC].
  260. - BorderType.CONSTANT, means it fills the border with constant values.
  261. - BorderType.EDGE, means it pads with the last value on the edge.
  262. - BorderType.REFLECT, means it reflects the values on the edge omitting the last
  263. value of edge.
  264. - BorderType.SYMMETRIC, means it reflects the values on the edge repeating the last
  265. value of edge.
  266. Examples:
  267. >>> import numpy as np
  268. >>>
  269. >>> waveform = np.random.random([1, 400//2+1, 30])
  270. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  271. >>> transforms = [audio.ComputeDeltas(win_length=7, pad_mode = BorderType.EDGE)]
  272. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  273. """
  274. @check_compute_deltas
  275. def __init__(self, win_length=5, pad_mode=BorderType.EDGE):
  276. self.win_len = win_length
  277. self.pad_mode = pad_mode
  278. def parse(self):
  279. return cde.ComputeDeltasOperation(self.win_len, DE_C_BORDER_TYPE[self.pad_mode])
  280. class Contrast(AudioTensorOperation):
  281. """
  282. Apply contrast effect. Similar to SoX implementation.
  283. Comparable with compression, this effect modifies an audio signal to make it sound louder.
  284. Args:
  285. enhancement_amount (float): Controls the amount of the enhancement. Allowed range is [0, 100] (default=75.0).
  286. Note that enhancement_amount equal to 0 still gives a significant contrast enhancement.
  287. Examples:
  288. >>> import numpy as np
  289. >>>
  290. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  291. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  292. >>> transforms = [audio.Contrast()]
  293. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  294. """
  295. @check_contrast
  296. def __init__(self, enhancement_amount=75.0):
  297. self.enhancement_amount = enhancement_amount
  298. def parse(self):
  299. return cde.ContrastOperation(self.enhancement_amount)
  300. class DBToAmplitude(AudioTensorOperation):
  301. """
  302. Turn a waveform from the decibel scale to the power/amplitude scale.
  303. Args:
  304. ref (float): Reference which the output will be scaled by.
  305. power (float): If power equals 1, will compute DB to power. If 0.5, will compute DB to amplitude.
  306. Examples:
  307. >>> import numpy as np
  308. >>>
  309. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  310. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  311. >>> transforms = [audio.DBToAmplitude(0.5, 0.5)]
  312. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  313. """
  314. @check_db_to_amplitude
  315. def __init__(self, ref, power):
  316. self.ref = ref
  317. self.power = power
  318. def parse(self):
  319. return cde.DBToAmplitudeOperation(self.ref, self.power)
  320. class DCShift(AudioTensorOperation):
  321. """
  322. Apply a DC shift to the audio.
  323. Args:
  324. shift (float): The amount to shift the audio, the value must be in the range [-2.0, 2.0].
  325. limiter_gain (float, optional): Used only on peaks to prevent clipping,
  326. the value should be much less than 1, such as 0.05 or 0.02.
  327. Examples:
  328. >>> import numpy as np
  329. >>>
  330. >>> waveform = np.array([0.60, 0.97, -1.04, -1.26, 0.97, 0.91, 0.48, 0.93])
  331. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  332. >>> transforms = [audio.DCShift(0.5, 0.02)]
  333. >>> numpy_slices_dataset = numpy_slices_dataset.map(operation=transforms, input_columns=["audio"])
  334. """
  335. @check_dc_shift
  336. def __init__(self, shift, limiter_gain=None):
  337. self.shift = shift
  338. self.limiter_gain = limiter_gain if limiter_gain else shift
  339. def parse(self):
  340. return cde.DCShiftOperation(self.shift, self.limiter_gain)
  341. class DeemphBiquad(AudioTensorOperation):
  342. """
  343. Design two-pole deemph filter for audio waveform of dimension of (..., time).
  344. Args:
  345. sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz),
  346. the value must be 44100 or 48000.
  347. Examples:
  348. >>> import numpy as np
  349. >>>
  350. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  351. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  352. >>> transforms = [audio.DeemphBiquad(44100)]
  353. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  354. """
  355. @check_deemph_biquad
  356. def __init__(self, sample_rate):
  357. self.sample_rate = sample_rate
  358. def parse(self):
  359. return cde.DeemphBiquadOperation(self.sample_rate)
  360. class DetectPitchFrequency(AudioTensorOperation):
  361. """
  362. Detect pitch frequency.
  363. It is implemented using normalized cross-correlation function and median smoothing.
  364. Args:
  365. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  366. frame_time (float, optional): Duration of a frame, the value must be greater than zero (default=0.01).
  367. win_length (int, optional): The window length for median smoothing (in number of frames), the value must be
  368. greater than zero (default=30).
  369. freq_low (int, optional): Lowest frequency that can be detected (Hz), the value must be greater than zero
  370. (default=85).
  371. freq_high (int, optional): Highest frequency that can be detected (Hz), the value must be greater than zero
  372. (default=3400).
  373. Examples:
  374. >>> import numpy as np
  375. >>>
  376. >>> waveform = np.array([[0.716064e-03, 5.347656e-03, 6.246826e-03, 2.089477e-02, 7.138305e-02],
  377. ... [4.156616e-02, 1.394653e-02, 3.550292e-02, 0.614379e-02, 3.840209e-02]])
  378. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  379. >>> transforms = [audio.DetectPitchFrequency(30, 0.1, 3, 5, 25)]
  380. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  381. """
  382. @check_detect_pitch_frequency
  383. def __init__(self, sample_rate, frame_time=0.01, win_length=30, freq_low=85, freq_high=3400):
  384. self.sample_rate = sample_rate
  385. self.frame_time = frame_time
  386. self.win_length = win_length
  387. self.freq_low = freq_low
  388. self.freq_high = freq_high
  389. def parse(self):
  390. return cde.DetectPitchFrequencyOperation(self.sample_rate, self.frame_time,
  391. self.win_length, self.freq_low, self.freq_high)
  392. class EqualizerBiquad(AudioTensorOperation):
  393. """
  394. Design biquad equalizer filter and perform filtering. Similar to SoX implementation.
  395. Args:
  396. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  397. center_freq (float): Central frequency (in Hz).
  398. gain (float): Desired gain at the boost (or attenuation) in dB.
  399. Q (float, optional): https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
  400. Examples:
  401. >>> import numpy as np
  402. >>>
  403. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  404. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  405. >>> transforms = [audio.EqualizerBiquad(44100, 1500, 5.5, 0.7)]
  406. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  407. """
  408. @check_equalizer_biquad
  409. def __init__(self, sample_rate, center_freq, gain, Q=0.707):
  410. self.sample_rate = sample_rate
  411. self.center_freq = center_freq
  412. self.gain = gain
  413. self.Q = Q
  414. def parse(self):
  415. return cde.EqualizerBiquadOperation(self.sample_rate, self.center_freq, self.gain, self.Q)
  416. DE_C_FADESHAPE_TYPE = {FadeShape.LINEAR: cde.FadeShape.DE_FADESHAPE_LINEAR,
  417. FadeShape.EXPONENTIAL: cde.FadeShape.DE_FADESHAPE_EXPONENTIAL,
  418. FadeShape.LOGARITHMIC: cde.FadeShape.DE_FADESHAPE_LOGARITHMIC,
  419. FadeShape.QUARTERSINE: cde.FadeShape.DE_FADESHAPE_QUARTERSINE,
  420. FadeShape.HALFSINE: cde.FadeShape.DE_FADESHAPE_HALFSINE}
  421. class Fade(AudioTensorOperation):
  422. """
  423. Add a fade in and/or fade out to an waveform.
  424. Args:
  425. fade_in_len (int, optional): Length of fade-in (time frames), which must be non-negative (default=0).
  426. fade_out_len (int, optional): Length of fade-out (time frames), which must be non-negative (default=0).
  427. fade_shape (FadeShape, optional): Shape of fade (default=FadeShape.LINEAR). Can be one of
  428. [FadeShape.LINEAR, FadeShape.EXPONENTIAL, FadeShape.LOGARITHMIC, FadeShape.QUARTERSINC, FadeShape.HALFSINC].
  429. -FadeShape.LINEAR, means it linear to 0.
  430. -FadeShape.EXPONENTIAL, means it tend to 0 in an exponential function.
  431. -FadeShape.LOGARITHMIC, means it tend to 0 in an logrithmic function.
  432. -FadeShape.QUARTERSINE, means it tend to 0 in an quarter sin function.
  433. -FadeShape.HALFSINE, means it tend to 0 in an half sin function.
  434. Raises:
  435. RuntimeError: If fade_in_len exceeds waveform length.
  436. RuntimeError: If fade_out_len exceeds waveform length.
  437. Examples:
  438. >>> import numpy as np
  439. >>>
  440. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03, 9.246826171875e-03, 1.0894775390625e-02]])
  441. >>> dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  442. >>> transforms = [audio.Fade(fade_in_len=3, fade_out_len=2, fade_shape=FadeShape.LINEAR)]
  443. >>> dataset = dataset.map(operations=transforms, input_columns=["audio"])
  444. """
  445. @check_fade
  446. def __init__(self, fade_in_len=0, fade_out_len=0, fade_shape=FadeShape.LINEAR):
  447. self.fade_in_len = fade_in_len
  448. self.fade_out_len = fade_out_len
  449. self.fade_shape = fade_shape
  450. def parse(self):
  451. return cde.FadeOperation(self.fade_in_len, self.fade_out_len, DE_C_FADESHAPE_TYPE[self.fade_shape])
  452. DE_C_MODULATION_TYPE = {Modulation.SINUSOIDAL: cde.Modulation.DE_MODULATION_SINUSOIDAL,
  453. Modulation.TRIANGULAR: cde.Modulation.DE_MODULATION_TRIANGULAR}
  454. DE_C_INTERPOLATION_TYPE = {Interpolation.LINEAR: cde.Interpolation.DE_INTERPOLATION_LINEAR,
  455. Interpolation.QUADRATIC: cde.Interpolation.DE_INTERPOLATION_QUADRATIC}
  456. class Flanger(AudioTensorOperation):
  457. """
  458. Apply a flanger effect to the audio.
  459. Args:
  460. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz).
  461. delay (float, optional): Desired delay in milliseconds (ms), range: [0, 30] (default=0.0).
  462. depth (float, optional): Desired delay depth in milliseconds (ms), range: [0, 10] (default=2.0).
  463. regen (float, optional): Desired regen (feedback gain) in dB, range: [-95, 95] (default=0.0).
  464. width (float, optional): Desired width (delay gain) in dB, range: [0, 100] (default=71.0).
  465. speed (float, optional): Modulation speed in Hz, range: [0.1, 10] (default=0.5).
  466. phase (float, optional): Percentage phase-shift for multi-channel, range: [0, 100] (default=25.0).
  467. modulation (Modulation, optional): Modulation of the input tensor (default=Modulation.SINUSOIDAL).
  468. It can be one of Modulation.SINUSOIDAL or Modulation.TRIANGULAR.
  469. interpolation (Interpolation, optional): Interpolation of the input tensor (default=Interpolation.LINEAR).
  470. It can be one of Interpolation.LINEAR or Interpolation.QUADRATIC.
  471. Examples:
  472. >>> import numpy as np
  473. >>>
  474. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  475. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  476. >>> transforms = [audio.Flanger(44100)]
  477. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  478. """
  479. @check_flanger
  480. def __init__(self, sample_rate, delay=0.0, depth=2.0, regen=0.0, width=71.0, speed=0.5,
  481. phase=25.0, modulation=Modulation.SINUSOIDAL, interpolation=Interpolation.LINEAR):
  482. self.sample_rate = sample_rate
  483. self.delay = delay
  484. self.depth = depth
  485. self.regen = regen
  486. self.width = width
  487. self.speed = speed
  488. self.phase = phase
  489. self.modulation = modulation
  490. self.interpolation = interpolation
  491. def parse(self):
  492. return cde.FlangerOperation(self.sample_rate, self.delay, self.depth, self.regen, self.width, self.speed,
  493. self.phase, DE_C_MODULATION_TYPE[self.modulation],
  494. DE_C_INTERPOLATION_TYPE[self.interpolation])
  495. class FrequencyMasking(AudioTensorOperation):
  496. """
  497. Apply masking to a spectrogram in the frequency domain.
  498. Args:
  499. iid_masks (bool, optional): Whether to apply different masks to each example (default=false).
  500. frequency_mask_param (int): Maximum possible length of the mask, range: [0, freq_length] (default=0).
  501. Indices uniformly sampled from [0, frequency_mask_param].
  502. mask_start (int): Mask start takes effect when iid_masks=true,
  503. range: [0, freq_length-frequency_mask_param] (default=0).
  504. mask_value (double): Mask value (default=0.0).
  505. Examples:
  506. >>> import numpy as np
  507. >>>
  508. >>> waveform = np.random.random([1, 3, 2])
  509. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  510. >>> transforms = [audio.FrequencyMasking(frequency_mask_param=1)]
  511. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  512. """
  513. @check_masking
  514. def __init__(self, iid_masks=False, frequency_mask_param=0, mask_start=0, mask_value=0.0):
  515. self.iid_masks = iid_masks
  516. self.frequency_mask_param = frequency_mask_param
  517. self.mask_start = mask_start
  518. self.mask_value = mask_value
  519. def parse(self):
  520. return cde.FrequencyMaskingOperation(self.iid_masks, self.frequency_mask_param, self.mask_start,
  521. self.mask_value)
  522. class HighpassBiquad(AudioTensorOperation):
  523. """
  524. Design biquad highpass filter and perform filtering. Similar to SoX implementation.
  525. Args:
  526. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  527. cutoff_freq (float): Filter cutoff frequency (in Hz).
  528. Q (float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
  529. Examples:
  530. >>> import numpy as np
  531. >>>
  532. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  533. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  534. >>> transforms = [audio.HighpassBiquad(44100, 1500, 0.7)]
  535. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  536. """
  537. @check_highpass_biquad
  538. def __init__(self, sample_rate, cutoff_freq, Q=0.707):
  539. self.sample_rate = sample_rate
  540. self.cutoff_freq = cutoff_freq
  541. self.Q = Q
  542. def parse(self):
  543. return cde.HighpassBiquadOperation(self.sample_rate, self.cutoff_freq, self.Q)
  544. class LFilter(AudioTensorOperation):
  545. """
  546. Design two-pole filter for audio waveform of dimension of (..., time).
  547. Args:
  548. a_coeffs (sequence): denominator coefficients of difference equation of dimension of (n_order + 1).
  549. Lower delays coefficients are first, e.g. [a0, a1, a2, ...].
  550. Must be same size as b_coeffs (pad with 0's as necessary).
  551. b_coeffs (sequence): numerator coefficients of difference equation of dimension of (n_order + 1).
  552. Lower delays coefficients are first, e.g. [b0, b1, b2, ...].
  553. Must be same size as a_coeffs (pad with 0's as necessary).
  554. clamp (bool, optional): If True, clamp the output signal to be in the range [-1, 1] (default=True).
  555. Examples:
  556. >>> import numpy as np
  557. >>>
  558. >>> waveform = np.array([[2.716064453125e-03, 6.34765625e-03], [9.246826171875e-03, 1.0894775390625e-02]])
  559. >>> a_coeffs = [0.1, 0.2, 0.3]
  560. >>> b_coeffs = [0.1, 0.2, 0.3]
  561. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  562. >>> transforms = [audio.LFilter(a_coeffs, b_coeffs)]
  563. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  564. """
  565. @check_lfilter
  566. def __init__(self, a_coeffs, b_coeffs, clamp=True):
  567. self.a_coeffs = a_coeffs
  568. self.b_coeffs = b_coeffs
  569. self.clamp = clamp
  570. def parse(self):
  571. return cde.LFilterOperation(self.a_coeffs, self.b_coeffs, self.clamp)
  572. class LowpassBiquad(AudioTensorOperation):
  573. """
  574. Design biquad lowpass filter and perform filtering. Similar to SoX implementation.
  575. Args:
  576. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  577. cutoff_freq (float): Filter cutoff frequency.
  578. Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
  579. Examples:
  580. >>> import numpy as np
  581. >>>
  582. >>> waveform = np.array([[0.8236, 0.2049, 0.3335], [0.5933, 0.9911, 0.2482],
  583. ... [0.3007, 0.9054, 0.7598], [0.5394, 0.2842, 0.5634], [0.6363, 0.2226, 0.2288]])
  584. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  585. >>> transforms = [audio.LowpassBiquad(4000, 1500, 0.7)]
  586. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  587. """
  588. @check_lowpass_biquad
  589. def __init__(self, sample_rate, cutoff_freq, Q=0.707):
  590. self.sample_rate = sample_rate
  591. self.cutoff_freq = cutoff_freq
  592. self.Q = Q
  593. def parse(self):
  594. return cde.LowpassBiquadOperation(self.sample_rate, self.cutoff_freq, self.Q)
  595. class Magphase(AudioTensorOperation):
  596. """
  597. Separate a complex-valued spectrogram with shape (..., 2) into its magnitude and phase.
  598. Args:
  599. power (float): Power of the norm, which must be non-negative (default=1.0).
  600. Examples:
  601. >>> import numpy as np
  602. >>>
  603. >>> waveform = np.random.random([2, 4, 2])
  604. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  605. >>> transforms = [audio.Magphase()]
  606. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  607. """
  608. @check_magphase
  609. def __init__(self, power=1.0):
  610. self.power = power
  611. def parse(self):
  612. return cde.MagphaseOperation(self.power)
  613. class MuLawDecoding(AudioTensorOperation):
  614. """
  615. Decode mu-law encoded signal.
  616. Args:
  617. quantization_channels (int): Number of channels, which must be positive (Default: 256).
  618. Examples:
  619. >>> import numpy as np
  620. >>>
  621. >>> waveform = np.random.random([1, 3, 4])
  622. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  623. >>> transforms = [audio.MuLawDecoding()]
  624. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  625. """
  626. @check_mu_law_coding
  627. def __init__(self, quantization_channels=256):
  628. self.quantization_channels = quantization_channels
  629. def parse(self):
  630. return cde.MuLawDecodingOperation(self.quantization_channels)
  631. class MuLawEncoding(AudioTensorOperation):
  632. """
  633. Encode signal based on mu-law companding.
  634. Args:
  635. quantization_channels (int): Number of channels, which must be positive (Default: 256).
  636. Examples:
  637. >>> import numpy as np
  638. >>>
  639. >>> waveform = np.random.random([0.1, 0.3, 0.4])
  640. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  641. >>> transforms = [audio.MuLawEncoding()]
  642. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  643. """
  644. @check_mu_law_coding
  645. def __init__(self, quantization_channels=256):
  646. self.quantization_channels = quantization_channels
  647. def parse(self):
  648. return cde.MuLawEncodingOperation(self.quantization_channels)
  649. class Overdrive(AudioTensorOperation):
  650. """
  651. Apply overdrive on input audio.
  652. Args:
  653. gain (float): Desired gain at the boost (or attenuation) in dB, in range of [0, 100] (default=20.0).
  654. color (float): Controls the amount of even harmonic content in the over-driven output,
  655. in range of [0, 100] (default=20.0).
  656. Examples:
  657. >>> import numpy as np
  658. >>>
  659. >>> waveform = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
  660. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  661. >>> transforms = [audio.Overdrive()]
  662. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  663. """
  664. @check_overdrive
  665. def __init__(self, gain=20.0, color=20.0):
  666. self.gain = gain
  667. self.color = color
  668. def parse(self):
  669. return cde.OverdriveOperation(self.gain, self.color)
  670. class Phaser(AudioTensorOperation):
  671. """
  672. Apply a phasing effect to the audio.
  673. Args:
  674. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz).
  675. gain_in (float): Desired input gain at the boost (or attenuation) in dB.
  676. Allowed range of values is [0, 1] (default=0.4).
  677. gain_out (float): Desired output gain at the boost (or attenuation) in dB.
  678. Allowed range of values is [0, 1e9] (default=0.74).
  679. delay_ms (float): Desired delay in milli seconds. Allowed range of values is [0, 5] (default=3.0).
  680. decay (float): Desired decay relative to gain-in. Allowed range of values is [0, 0.99] (default=0.4).
  681. mod_speed (float): Modulation speed in Hz. Allowed range of values is [0.1, 2] (default=0.5).
  682. sinusoidal (bool): If True, use sinusoidal modulation (preferable for multiple instruments).
  683. If False, use triangular modulation (gives single instruments a sharper
  684. phasing effect) (default=True).
  685. Examples:
  686. >>> import numpy as np
  687. >>>
  688. >>> waveform = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
  689. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  690. >>> transforms = [audio.Phaser(44100)]
  691. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  692. """
  693. @check_phaser
  694. def __init__(self, sample_rate, gain_in=0.4, gain_out=0.74,
  695. delay_ms=3.0, decay=0.4, mod_speed=0.5, sinusoidal=True):
  696. self.decay = decay
  697. self.delay_ms = delay_ms
  698. self.gain_in = gain_in
  699. self.gain_out = gain_out
  700. self.mod_speed = mod_speed
  701. self.sample_rate = sample_rate
  702. self.sinusoidal = sinusoidal
  703. def parse(self):
  704. return cde.PhaserOperation(self.sample_rate, self.gain_in, self.gain_out,
  705. self.delay_ms, self.decay, self.mod_speed, self.sinusoidal)
  706. class RiaaBiquad(AudioTensorOperation):
  707. """
  708. Apply RIAA vinyl playback equalization. Similar to SoX implementation.
  709. Args:
  710. sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz),
  711. can only be one of 44100, 48000, 88200, 96000.
  712. Examples:
  713. >>> import numpy as np
  714. >>>
  715. >>> waveform = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
  716. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  717. >>> transforms = [audio.RiaaBiquad(44100)]
  718. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  719. """
  720. @check_riaa_biquad
  721. def __init__(self, sample_rate):
  722. self.sample_rate = sample_rate
  723. def parse(self):
  724. return cde.RiaaBiquadOperation(self.sample_rate)
  725. class SlidingWindowCmn(AudioTensorOperation):
  726. """
  727. Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
  728. Args:
  729. cmn_window (int, optional): Window in frames for running average CMN computation (default=600).
  730. min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start).
  731. Only applicable if center is False, ignored if center is True (default=100).
  732. center (bool, optional): If True, use a window centered on the current frame. If False, window is
  733. to the left. (default=False).
  734. norm_vars (bool, optional): If True, normalize variance to one. (default=False).
  735. Examples:
  736. >>> import numpy as np
  737. >>>
  738. >>> waveform = np.array([[[1, 2, 3], [4, 5, 6]]], dtype=np.float64)
  739. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  740. >>> transforms = [audio.SlidingWindowCmn()]
  741. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  742. """
  743. @check_sliding_window_cmn
  744. def __init__(self, cmn_window=600, min_cmn_window=100, center=False, norm_vars=False):
  745. self.cmn_window = cmn_window
  746. self.min_cmn_window = min_cmn_window
  747. self.center = center
  748. self.norm_vars = norm_vars
  749. def parse(self):
  750. return cde.SlidingWindowCmnOperation(self.cmn_window, self.min_cmn_window, self.center, self.norm_vars)
  751. class TimeMasking(AudioTensorOperation):
  752. """
  753. Apply masking to a spectrogram in the time domain.
  754. Args:
  755. iid_masks (bool, optional): Whether to apply different masks to each example (default=false).
  756. time_mask_param (int): Maximum possible length of the mask, range: [0, time_length] (default=0).
  757. Indices uniformly sampled from [0, time_mask_param].
  758. mask_start (int): Mask start takes effect when iid_masks=true,
  759. range: [0, time_length-time_mask_param] (default=0).
  760. mask_value (double): Mask value (default=0.0).
  761. Examples:
  762. >>> import numpy as np
  763. >>>
  764. >>> waveform = np.random.random([1, 3, 2])
  765. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  766. >>> transforms = [audio.TimeMasking(time_mask_param=1)]
  767. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  768. """
  769. @check_masking
  770. def __init__(self, iid_masks=False, time_mask_param=0, mask_start=0, mask_value=0.0):
  771. self.iid_masks = iid_masks
  772. self.time_mask_param = time_mask_param
  773. self.mask_start = mask_start
  774. self.mask_value = mask_value
  775. def parse(self):
  776. return cde.TimeMaskingOperation(self.iid_masks, self.time_mask_param, self.mask_start, self.mask_value)
  777. class TimeStretch(AudioTensorOperation):
  778. """
  779. Stretch STFT in time at a given rate, without changing the pitch.
  780. Args:
  781. hop_length (int, optional): Length of hop between STFT windows (default=None, will use ((n_freq - 1) * 2) // 2).
  782. n_freq (int, optional): Number of filter banks form STFT (default=201).
  783. fixed_rate (float, optional): Rate to speed up or slow down the input in time
  784. (default=None, will keep the original rate).
  785. Examples:
  786. >>> import numpy as np
  787. >>>
  788. >>> waveform = np.random.random([1, 30])
  789. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  790. >>> transforms = [audio.TimeStretch()]
  791. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  792. """
  793. @check_time_stretch
  794. def __init__(self, hop_length=None, n_freq=201, fixed_rate=None):
  795. self.n_freq = n_freq
  796. self.fixed_rate = fixed_rate
  797. n_fft = (n_freq - 1) * 2
  798. self.hop_length = hop_length if hop_length is not None else n_fft // 2
  799. self.fixed_rate = fixed_rate if fixed_rate is not None else 1
  800. def parse(self):
  801. return cde.TimeStretchOperation(self.hop_length, self.n_freq, self.fixed_rate)
  802. class TrebleBiquad(AudioTensorOperation):
  803. """
  804. Design a treble tone-control effect. Similar to SoX implementation.
  805. Args:
  806. sample_rate (int): Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
  807. gain (float): Desired gain at the boost (or attenuation) in dB.
  808. central_freq (float, optional): Central frequency (in Hz) (default=3000).
  809. Q(float, optional): Quality factor, https://en.wikipedia.org/wiki/Q_factor, range: (0, 1] (default=0.707).
  810. Examples:
  811. >>> import numpy as np
  812. >>>
  813. >>> waveform = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
  814. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  815. >>> transforms = [audio.TrebleBiquad(44100, 200.0)]
  816. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  817. """
  818. @check_treble_biquad
  819. def __init__(self, sample_rate, gain, central_freq=3000, Q=0.707):
  820. self.sample_rate = sample_rate
  821. self.gain = gain
  822. self.central_freq = central_freq
  823. self.Q = Q
  824. def parse(self):
  825. return cde.TrebleBiquadOperation(self.sample_rate, self.gain, self.central_freq, self.Q)
  826. DE_C_GAINTYPE_TYPE = {GainType.AMPLITUDE: cde.GainType.DE_GAINTYPE_AMPLITUDE,
  827. GainType.POWER: cde.GainType.DE_GAINTYPE_POWER,
  828. GainType.DB: cde.GainType.DE_GAINTYPE_DB}
  829. class Vol(AudioTensorOperation):
  830. """
  831. Apply amplification or attenuation to the whole waveform.
  832. Args:
  833. gain (float): Value of gain adjustment.
  834. If gain_type = amplitude, gain stands for nonnegative amplitude ratio.
  835. If gain_type = power, gain stands for power.
  836. If gain_type = db, gain stands for decibels.
  837. gain_type (GainType, optional): Type of gain, contains the following three enumeration values
  838. GainType.AMPLITUDE, GainType.POWER and GainType.DB (default=GainType.AMPLITUDE).
  839. Examples:
  840. >>> import numpy as np
  841. >>>
  842. >>> waveform = np.random.random([20, 30])
  843. >>> numpy_slices_dataset = ds.NumpySlicesDataset(data=waveform, column_names=["audio"])
  844. >>> transforms = [audio.Vol(gain=10, gain_type=GainType.DB)]
  845. >>> numpy_slices_dataset = numpy_slices_dataset.map(operations=transforms, input_columns=["audio"])
  846. """
  847. @check_vol
  848. def __init__(self, gain, gain_type=GainType.AMPLITUDE):
  849. self.gain = gain
  850. self.gain_type = gain_type
  851. def parse(self):
  852. return cde.VolOperation(self.gain, DE_C_GAINTYPE_TYPE[self.gain_type])