Merge pull request !17631 from QingfengLi/FreqMasktags/v1.5.0-rc1
| @@ -23,6 +23,7 @@ | |||
| #include "minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/bass_biquad_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/time_masking_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/time_stretch_ir.h" | |||
| @@ -135,6 +136,27 @@ std::shared_ptr<TensorOperation> BassBiquad::Parse() { | |||
| return std::make_shared<BassBiquadOperation>(data_->sample_rate_, data_->gain_, data_->central_freq_, data_->Q_); | |||
| } | |||
| // FrequencyMasking Transform Operation. | |||
| /// \brief Private state of FrequencyMasking: stores the constructor arguments until Parse() forwards them. | |||
| struct FrequencyMasking::Data { | |||
|   Data(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value) | |||
|       : iid_masks_(iid_masks), | |||
|         frequency_mask_param_(frequency_mask_param), | |||
|         mask_start_(mask_start), | |||
|         mask_value_(mask_value) {} | |||
|   // Declared in the same order as the initializer list above: members are always initialized in | |||
|   // declaration order, so a mismatch between the two triggers -Wreorder. | |||
|   bool iid_masks_; | |||
|   int32_t frequency_mask_param_; | |||
|   int32_t mask_start_; | |||
|   double mask_value_; | |||
| }; | |||
| /// \brief Constructor; captures the arguments in a shared Data object. | |||
| FrequencyMasking::FrequencyMasking(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value) | |||
|     : data_(std::make_shared<Data>(iid_masks, frequency_mask_param, mask_start, mask_value)) {} | |||
| /// \brief Create the FrequencyMaskingOperation IR node from the stored arguments. | |||
| std::shared_ptr<TensorOperation> FrequencyMasking::Parse() { | |||
|   return std::make_shared<FrequencyMaskingOperation>(data_->iid_masks_, data_->frequency_mask_param_, | |||
|                                                      data_->mask_start_, data_->mask_value_); | |||
| } | |||
| // TimeMasking Transform Operation. | |||
| struct TimeMasking::Data { | |||
| Data(bool iid_masks, int64_t time_mask_param, int64_t mask_start, double mask_value) | |||
| @@ -24,6 +24,7 @@ | |||
| #include "minddata/dataset/audio/ir/kernels/bandpass_biquad_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/bandreject_biquad_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/bass_biquad_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/time_masking_ir.h" | |||
| #include "minddata/dataset/audio/ir/kernels/time_stretch_ir.h" | |||
| #include "minddata/dataset/include/dataset/transforms.h" | |||
| @@ -115,6 +116,19 @@ PYBIND_REGISTER( | |||
| })); | |||
| })); | |||
| // Python binding for the FrequencyMasking IR node; the arguments are validated at construction time. | |||
| PYBIND_REGISTER( | |||
|   FrequencyMaskingOperation, 1, ([](const py::module *m) { | |||
|     using OperationPtr = std::shared_ptr<audio::FrequencyMaskingOperation>; | |||
|     (void)py::class_<audio::FrequencyMaskingOperation, TensorOperation, OperationPtr>(*m, | |||
|                                                                                      "FrequencyMaskingOperation") | |||
|       .def(py::init([](bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value) { | |||
|         OperationPtr operation = | |||
|           std::make_shared<audio::FrequencyMaskingOperation>(iid_masks, frequency_mask_param, mask_start, mask_value); | |||
|         // Raise on the Python side if the arguments do not pass validation. | |||
|         THROW_IF_ERROR(operation->ValidateParams()); | |||
|         return operation; | |||
|       })); | |||
|   })); | |||
| PYBIND_REGISTER( | |||
| TimeMaskingOperation, 1, ([](const py::module *m) { | |||
| (void)py::class_<audio::TimeMaskingOperation, TensorOperation, std::shared_ptr<audio::TimeMaskingOperation>>( | |||
| @@ -9,6 +9,7 @@ add_library(audio-ir-kernels OBJECT | |||
| bandpass_biquad_ir.cc | |||
| bandreject_biquad_ir.cc | |||
| bass_biquad_ir.cc | |||
| frequency_masking_ir.cc | |||
| time_masking_ir.cc | |||
| time_stretch_ir.cc | |||
| ) | |||
| @@ -0,0 +1,61 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/audio/ir/kernels/frequency_masking_ir.h" | |||
| #include "minddata/dataset/audio/kernels/frequency_masking_op.h" | |||
| #include "minddata/dataset/audio/ir/validators.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| namespace audio { | |||
| // Stores the masking arguments exactly as given; ValidateParams() performs the checks. | |||
| FrequencyMaskingOperation::FrequencyMaskingOperation(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, | |||
|                                                      double mask_value) | |||
|     : iid_masks_(iid_masks), | |||
|       frequency_mask_param_(frequency_mask_param), | |||
|       mask_start_(mask_start), | |||
|       mask_value_(mask_value) {} | |||
| FrequencyMaskingOperation::~FrequencyMaskingOperation() = default; | |||
| // Runs the non-negative integer check on frequency_mask_param, then on mask_start; the first failure is returned. | |||
| Status FrequencyMaskingOperation::ValidateParams() { | |||
|   RETURN_IF_NOT_OK(CheckIntScalarNonNegative("FrequencyMasking", "frequency_mask_param", frequency_mask_param_)); | |||
|   RETURN_IF_NOT_OK(CheckIntScalarNonNegative("FrequencyMasking", "mask_start", mask_start_)); | |||
|   return Status::OK(); | |||
| } | |||
| // Creates the runtime FrequencyMaskingOp configured with this node's arguments. | |||
| std::shared_ptr<TensorOp> FrequencyMaskingOperation::Build() { | |||
|   return std::make_shared<FrequencyMaskingOp>(iid_masks_, frequency_mask_param_, mask_start_, mask_value_); | |||
| } | |||
| std::string FrequencyMaskingOperation::Name() const { return kFrequencyMaskingOperation; } | |||
| // Writes the four arguments into *out_json as a flat JSON object. | |||
| Status FrequencyMaskingOperation::to_json(nlohmann::json *out_json) { | |||
|   nlohmann::json serialized; | |||
|   serialized["iid_masks"] = iid_masks_; | |||
|   serialized["frequency_mask_param"] = frequency_mask_param_; | |||
|   serialized["mask_start"] = mask_start_; | |||
|   serialized["mask_value"] = mask_value_; | |||
|   *out_json = serialized; | |||
|   return Status::OK(); | |||
| } | |||
| }  // namespace audio | |||
| }  // namespace dataset | |||
| }  // namespace mindspore | |||
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_FREQUENCY_MASKING_IR_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_FREQUENCY_MASKING_IR_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "include/api/status.h" | |||
| #include "minddata/dataset/kernels/ir/tensor_operation.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| namespace audio { | |||
| constexpr char kFrequencyMaskingOperation[] = "FrequencyMasking"; | |||
| /// \brief IR node for the FrequencyMasking transform: holds the arguments, validates them and builds the op. | |||
| class FrequencyMaskingOperation : public TensorOperation { | |||
|  public: | |||
|   /// \brief Constructor. | |||
|   /// \param[in] iid_masks Whether to apply different masks to each example. | |||
|   /// \param[in] frequency_mask_param Mask length parameter along the frequency axis. | |||
|   /// \param[in] mask_start Start position of the mask. | |||
|   /// \param[in] mask_value Mask value. | |||
|   FrequencyMaskingOperation(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, double mask_value); | |||
|   ~FrequencyMaskingOperation(); | |||
|   std::shared_ptr<TensorOp> Build() override; | |||
|   Status ValidateParams() override; | |||
|   std::string Name() const override; | |||
|   Status to_json(nlohmann::json *out_json) override; | |||
|  private: | |||
|   // Declared in the order used by the constructor's initializer list (iid_masks, frequency_mask_param, | |||
|   // mask_start, mask_value); members are initialized in declaration order, so a mismatch triggers -Wreorder. | |||
|   bool iid_masks_; | |||
|   int32_t frequency_mask_param_; | |||
|   int32_t mask_start_; | |||
|   double mask_value_; | |||
| };  // class FrequencyMaskingOperation | |||
| }  // namespace audio | |||
| }  // namespace dataset | |||
| }  // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_IR_KERNELS_FREQUENCY_MASKING_IR_H_ | |||
| @@ -10,6 +10,7 @@ add_library(audio-kernels OBJECT | |||
| bandpass_biquad_op.cc | |||
| bandreject_biquad_op.cc | |||
| bass_biquad_op.cc | |||
| frequency_masking_op.cc | |||
| time_masking_op.cc | |||
| time_stretch_op.cc | |||
| ) | |||
| @@ -399,8 +399,7 @@ Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr | |||
| Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int64_t mask_width, | |||
| int64_t mask_start, double mask_value, int axis) { | |||
| if (axis != 2 && axis != 1) { | |||
| RETURN_STATUS_UNEXPECTED( | |||
| "MaskAlongAxis: only support Time and Frequency masking, the axis should be equal to 1 or 2."); | |||
| RETURN_STATUS_UNEXPECTED("MaskAlongAxis: only support Time and Frequency masking, axis should be 1 or 2."); | |||
| } | |||
| TensorShape input_shape = input->shape(); | |||
| // squeeze input | |||
| @@ -409,9 +408,9 @@ Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tenso | |||
| int check_dim_ind = (axis == 1) ? -2 : -1; | |||
| CHECK_FAIL_RETURN_UNEXPECTED(0 <= mask_start && mask_start <= input_shape[check_dim_ind], | |||
| "MaskAlongAxis: mask_start should be smaller than the length of chosen dim."); | |||
| "MaskAlongAxis: mask_start should be less than the length of chosen dimension."); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(mask_start + mask_width <= input_shape[check_dim_ind], | |||
| "MaskAlongAxis: mask_width with mask_start is out of bounds."); | |||
| "MaskAlongAxis: the sum of mask_start and mask_width is out of bounds."); | |||
| int64_t cell_size = input->type().SizeInBytes(); | |||
| @@ -208,7 +208,6 @@ Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr | |||
| /// \return Status code | |||
| Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int64_t mask_width, | |||
| int64_t mask_start, double mask_value, int axis); | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_ | |||
| @@ -0,0 +1,66 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "minddata/dataset/audio/kernels/frequency_masking_op.h" | |||
| #include "minddata/dataset/audio/kernels/audio_utils.h" | |||
| #include "minddata/dataset/kernels/data/data_utils.h" | |||
| #include "minddata/dataset/util/random.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| // Constructor. The initializer list follows the member declaration order in frequency_masking_op.h | |||
| // (iid_masks_, frequency_mask_param_, mask_start_, mask_value_), which avoids -Wreorder. | |||
| FrequencyMaskingOp::FrequencyMaskingOp(bool iid_masks, int32_t frequency_mask_param, int32_t mask_start, | |||
|                                        double mask_value) | |||
|     : iid_masks_(iid_masks), | |||
|       frequency_mask_param_(frequency_mask_param), | |||
|       mask_start_(mask_start), | |||
|       mask_value_(mask_value) { | |||
|   // Seed the generator that is handed to RandomMaskAlongAxis. | |||
|   rnd_.seed(GetSeed()); | |||
| } | |||
| // Main function: masks `input` along the frequency axis (axis 1 in the MaskAlongAxis helpers). | |||
| Status FrequencyMaskingOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) { | |||
|   IO_CHECK(input, output); | |||
|   // input <..., freq, time> | |||
|   CHECK_FAIL_RETURN_UNEXPECTED(input->Rank() >= 2, | |||
|                                "FrequencyMasking: input tensor is not in shape of <..., freq, time>."); | |||
|   TensorShape input_shape = input->shape(); | |||
|   // The frequency axis is the second-to-last dimension; the mask parameter may equal its length but not exceed it. | |||
|   CHECK_FAIL_RETURN_UNEXPECTED( | |||
|     input_shape[-2] >= frequency_mask_param_, | |||
|     "FrequencyMasking: frequency_mask_param should be less than the length of frequency dimension."); | |||
|   // typecast: everything except float64 is converted to float32 (strings are rejected first). | |||
|   CHECK_FAIL_RETURN_UNEXPECTED(input->type() != DataType::DE_STRING, | |||
|                                "FrequencyMasking: input tensor type should be float, but got string."); | |||
|   const bool needs_cast = input->type() != DataType::DE_FLOAT64; | |||
|   std::shared_ptr<Tensor> input_tensor; | |||
|   if (needs_cast) { | |||
|     RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32))); | |||
|   } else { | |||
|     input_tensor = input; | |||
|   } | |||
|   // For float32 data the mask value is first rounded to float precision; it is passed on as a double either way. | |||
|   const double mask_val = needs_cast ? static_cast<double>(static_cast<float>(mask_value_)) : mask_value_; | |||
|   // iid_masks - whether to apply different masks to each example/channel. | |||
|   if (!iid_masks_) { | |||
|     // Fixed mask: frequency_mask_param_ is the mask width and mask_start_ its first index. | |||
|     return MaskAlongAxis(input_tensor, output, frequency_mask_param_, mask_start_, mask_val, 1); | |||
|   } | |||
|   return RandomMaskAlongAxis(input_tensor, output, frequency_mask_param_, mask_val, 1, rnd_); | |||
| } | |||
| }  // namespace dataset | |||
| }  // namespace mindspore | |||
| @@ -0,0 +1,52 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_FREQUENCY_MASKING_OP_H_ | |||
| #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_FREQUENCY_MASKING_OP_H_ | |||
| #include <memory> | |||
| #include <random> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "minddata/dataset/core/tensor.h" | |||
| #include "minddata/dataset/kernels/tensor_op.h" | |||
| #include "minddata/dataset/util/status.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| /// \brief Tensor op that masks a <..., freq, time> input along the frequency axis. | |||
| class FrequencyMaskingOp : public TensorOp { | |||
|  public: | |||
|   /// \brief Constructor. | |||
|   /// \param[in] iid_masks Whether to apply different masks to each example/channel. | |||
|   /// \param[in] frequency_mask_param Mask length parameter; used as the mask width when iid_masks is false. | |||
|   /// \param[in] mask_start First masked index; used when iid_masks is false. | |||
|   /// \param[in] mask_value Mask value. | |||
|   explicit FrequencyMaskingOp(bool iid_masks = false, int32_t frequency_mask_param = 0, int32_t mask_start = 0, | |||
|                               double mask_value = 0.0); | |||
|   ~FrequencyMaskingOp() override = default; | |||
|   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override; | |||
|   std::string Name() const override { return kFrequencyMaskingOp; } | |||
|  private: | |||
|   bool iid_masks_; | |||
|   int32_t frequency_mask_param_; | |||
|   int32_t mask_start_; | |||
|   double mask_value_; | |||
|   std::mt19937 rnd_;  // random engine passed to RandomMaskAlongAxis when iid_masks_ is true | |||
| }; | |||
| }  // namespace dataset | |||
| }  // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_FREQUENCY_MASKING_OP_H_ | |||
| @@ -36,7 +36,7 @@ Status TimeMaskingOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ | |||
| CHECK_FAIL_RETURN_UNEXPECTED(input->Rank() >= 2, "TimeMasking: input dimension must be greater than 2."); | |||
| TensorShape input_shape = input->shape(); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(input_shape[-1] >= time_mask_param_, | |||
| "TimeMasking: input time_mask_param should be smaller than the length of time dim."); | |||
| "TimeMasking: time_mask_param should be less than the length of time dimension."); | |||
| std::shared_ptr<Tensor> input_tensor; | |||
| // typecast | |||
| @@ -187,19 +187,22 @@ class BassBiquad final : public TensorTransform { | |||
| std::shared_ptr<Data> data_; | |||
| }; | |||
| /// \brief TimeStretch TensorTransform | |||
| /// \notes Stretch STFT in time at a given rate, without changing the pitch. | |||
| class TimeStretch final : public TensorTransform { | |||
| /// \brief FrequencyMasking TensorTransform. | |||
| /// \notes Apply masking to a spectrogram in the frequency domain. | |||
| class FrequencyMasking final : public TensorTransform { | |||
| public: | |||
| /// \brief Constructor. | |||
| /// \param[in] hop_length Length of hop between STFT windows. Default: None. | |||
| /// \param[in] n_freq Number of filter banks form STFT. Default: 201. | |||
| /// \param[in] fixed_rate Rate to speed up or slow down the input in time. Default: None. | |||
| explicit TimeStretch(float hop_length = std::numeric_limits<float>::quiet_NaN(), int n_freq = 201, | |||
| float fixed_rate = std::numeric_limits<float>::quiet_NaN()); | |||
| /// \param[in] iid_masks Whether to apply different masks to each example. | |||
| /// \param[in] frequency_mask_param Maximum possible length of the mask. | |||
| /// Indices uniformly sampled from [0, frequency_mask_param]. | |||
| /// Mask width when iid_masks=false. | |||
| /// \param[in] mask_start Mask start when iid_masks=false. | |||
| /// \param[in] mask_value Mask value. | |||
| explicit FrequencyMasking(bool iid_masks = false, int32_t frequency_mask_param = 0, int32_t mask_start = 0, | |||
| double mask_value = 0.0); | |||
| /// \brief Destructor. | |||
| ~TimeStretch() = default; | |||
| ~FrequencyMasking() = default; | |||
| protected: | |||
| /// \brief Function to convert TensorTransform object into a TensorOperation object. | |||
| @@ -237,6 +240,30 @@ class TimeMasking final : public TensorTransform { | |||
| struct Data; | |||
| std::shared_ptr<Data> data_; | |||
| }; | |||
| /// \brief TimeStretch TensorTransform. | |||
| /// \notes Stretch STFT in time at a given rate, without changing the pitch. | |||
| class TimeStretch final : public TensorTransform { | |||
| public: | |||
| /// \brief Constructor. | |||
| /// \param[in] hop_length Length of hop between STFT windows. Default: NaN. | |||
| /// \param[in] n_freq Number of filter banks from STFT. Default: 201. | |||
| /// \param[in] fixed_rate Rate to speed up or slow down the input in time. Default: NaN. | |||
| explicit TimeStretch(float hop_length = std::numeric_limits<float>::quiet_NaN(), int n_freq = 201, | |||
| float fixed_rate = std::numeric_limits<float>::quiet_NaN()); | |||
| /// \brief Destructor. | |||
| ~TimeStretch() = default; | |||
| protected: | |||
| /// \brief Convert this TensorTransform object into a TensorOperation object. | |||
| /// \return Shared pointer to the TensorOperation object. | |||
| std::shared_ptr<TensorOperation> Parse() override; | |||
| private: | |||
| struct Data; | |||
| std::shared_ptr<Data> data_; | |||
| }; | |||
| } // namespace audio | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -145,6 +145,7 @@ constexpr char kBandBiquadOp[] = "BandBiquadOp"; | |||
| constexpr char kBandpassBiquadOp[] = "BandpassBiquadOp"; | |||
| constexpr char kBandrejectBiquadOp[] = "BandrejectBiquadOp"; | |||
| constexpr char kBassBiquadOp[] = "BassBiquadOp"; | |||
| constexpr char kFrequencyMaskingOp[] = "FrequencyMaskingOp"; | |||
| constexpr char kTimeMaskingOp[] = "TimeMaskingOp"; | |||
| constexpr char kTimeStretchOp[] = "TimeStretchOp"; | |||
| @@ -251,37 +251,37 @@ class BassBiquad(AudioTensorOperation): | |||
| return cde.BassBiquadOperation(self.sample_rate, self.gain, self.central_freq, self.Q) | |||
| class TimeStretch(AudioTensorOperation): | |||
| class FrequencyMasking(AudioTensorOperation): | |||
| """ | |||
| Stretch STFT in time at a given rate, without changing the pitch. | |||
| Apply masking to a spectrogram in the frequency domain. | |||
| Args: | |||
| hop_length (int, optional): Length of hop between STFT windows (default=None). | |||
| n_freq (int, optional): Number of filter banks form STFT (default=201). | |||
| fixed_rate (float, optional): Rate to speed up or slow down the input in time (default=None). | |||
| iid_masks (bool, optional): Whether to apply different masks to each example (default=False). | |||
| frequency_mask_param (int, optional): Maximum possible length of the mask (default=0). | |||
| Indices uniformly sampled from [0, frequency_mask_param]. | |||
| mask_start (int, optional): Mask start when iid_masks=False (default=0). | |||
| mask_value (float, optional): Mask value (default=0.0). | |||
| Examples: | |||
| >>> freq = 44100 | |||
| >>> num_frame = 30 | |||
| >>> def gen(): | |||
| ... np.random.seed(0) | |||
| ... data = np.random.random([freq, num_frame]) | |||
| ... yield (np.array(data, dtype=np.float32), ) | |||
| >>> data1 = ds.GeneratorDataset(source=gen, column_names=["multi_dimensional_data"]) | |||
| >>> transforms = [py_audio.TimeStretch()] | |||
| >>> data1 = data1.map(operations=transforms, input_columns=["multi_dimensional_data"]) | |||
| ... random.seed(0) | |||
| ... data = numpy.random.random([1, 3, 2]) | |||
| ... yield (numpy.array(data, dtype=numpy.float32),) | |||
| >>> dataset = ds.GeneratorDataset(source=gen, | |||
| ... column_names=["multi_dim_data"]) | |||
| >>> dataset = dataset.map(operations=FrequencyMasking(frequency_mask_param=1), | |||
| ... input_columns=["multi_dim_data"]) | |||
| """ | |||
| @check_time_stretch | |||
| def __init__(self, hop_length=None, n_freq=201, fixed_rate=None): | |||
| self.n_freq = n_freq | |||
| self.fixed_rate = fixed_rate | |||
| n_fft = (n_freq - 1) * 2 | |||
| self.hop_length = hop_length if hop_length is not None else n_fft // 2 | |||
| self.fixed_rate = fixed_rate if fixed_rate is not None else np.nan | |||
| @check_masking | |||
| def __init__(self, iid_masks=False, frequency_mask_param=0, mask_start=0, mask_value=0.0): | |||
| self.iid_masks = iid_masks | |||
| self.frequency_mask_param = frequency_mask_param | |||
| self.mask_start = mask_start | |||
| self.mask_value = mask_value | |||
| def parse(self): | |||
| return cde.TimeStretchOperation(self.hop_length, self.n_freq, self.fixed_rate) | |||
| return cde.FrequencyMaskingOperation(self.iid_masks, self.frequency_mask_param, self.mask_start, | |||
| self.mask_value) | |||
| class TimeMasking(AudioTensorOperation): | |||
| @@ -314,3 +314,36 @@ class TimeMasking(AudioTensorOperation): | |||
| def parse(self): | |||
| return cde.TimeMaskingOperation(self.iid_masks, self.time_mask_param, self.mask_start, self.mask_value) | |||
| class TimeStretch(AudioTensorOperation): | |||
| """ | |||
| Stretch STFT in time at a given rate, without changing the pitch. | |||
| Args: | |||
| hop_length (int, optional): Length of hop between STFT windows (default=None). | |||
| n_freq (int, optional): Number of filter banks form STFT (default=201). | |||
| fixed_rate (float, optional): Rate to speed up or slow down the input in time (default=None). | |||
| Examples: | |||
| >>> freq = 44100 | |||
| >>> num_frame = 30 | |||
| >>> def gen(): | |||
| ... np.random.seed(0) | |||
| ... data = np.random.random([freq, num_frame]) | |||
| ... yield (np.array(data, dtype=np.float32), ) | |||
| >>> data1 = ds.GeneratorDataset(source=gen, column_names=["multi_dimensional_data"]) | |||
| >>> transforms = [py_audio.TimeStretch()] | |||
| >>> data1 = data1.map(operations=transforms, input_columns=["multi_dimensional_data"]) | |||
| """ | |||
| @check_time_stretch | |||
| def __init__(self, hop_length=None, n_freq=201, fixed_rate=None): | |||
| self.n_freq = n_freq | |||
| self.fixed_rate = fixed_rate | |||
| n_fft = (n_freq - 1) * 2 | |||
| self.hop_length = hop_length if hop_length is not None else n_fft // 2 | |||
| self.fixed_rate = fixed_rate if fixed_rate is not None else np.nan | |||
| def parse(self): | |||
| return cde.TimeStretchOperation(self.hop_length, self.n_freq, self.fixed_rate) | |||
| @@ -167,6 +167,26 @@ def check_bass_biquad(method): | |||
| return new_method | |||
| def check_masking(method): | |||
| """Wrapper method to check the parameters of time_masking and frequency_masking""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| [iid_masks, mask_param, mask_start, mask_value], _ = parse_user_args( | |||
| method, *args, **kwargs) | |||
| type_check(iid_masks, (bool,), "iid_masks") | |||
| type_check(mask_param, (int,), "mask_param") | |||
| check_value(mask_param, (0, FLOAT_MAX_INTEGER), "mask_param") | |||
| type_check(mask_start, (int,), "mask_start") | |||
| check_value(mask_start, (0, FLOAT_MAX_INTEGER), "mask_start") | |||
| type_check(mask_value, (int, float), "mask_value") | |||
| check_value(mask_value, (0, DOUBLE_MAX_INTEGER), "mask_value") | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
| def check_time_stretch(method): | |||
| """Wrapper method to check the parameters of time_stretch.""" | |||
| @wraps(method) | |||
| @@ -186,22 +206,3 @@ def check_time_stretch(method): | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
| def check_masking(method): | |||
| """Decorator that checks the iid_masks, mask_param, mask_start and mask_value arguments of time_masking and frequency_masking.""" | |||
| @wraps(method) | |||
| def new_method(self, *args, **kwargs): | |||
| [iid_masks, mask_param, mask_start, mask_value], _ = parse_user_args( | |||
| method, *args, **kwargs) | |||
| type_check(iid_masks, (bool,), "iid_masks") | |||
| type_check(mask_param, (int,), "mask_param") | |||
| check_value(mask_param, (0, FLOAT_MAX_INTEGER), "mask_param") | |||
| type_check(mask_start, (int,), "mask_start") | |||
| check_value(mask_start, (0, FLOAT_MAX_INTEGER), "mask_start") | |||
| type_check(mask_value, (int, float), "mask_value") | |||
| check_value(mask_value, (0, DOUBLE_MAX_INTEGER), "mask_value") | |||
| return method(self, *args, **kwargs) | |||
| return new_method | |||
| @@ -19,6 +19,8 @@ | |||
| #include "minddata/dataset/include/dataset/audio.h" | |||
| #include "minddata/dataset/include/dataset/datasets.h" | |||
| #include "minddata/dataset/include/dataset/execute.h" | |||
| #include "minddata/dataset/include/dataset/transforms.h" | |||
| using namespace mindspore::dataset; | |||
| using mindspore::LogStream; | |||
| @@ -487,3 +489,64 @@ TEST_F(MindDataTestPipeline, TestAnglePipelineError) { | |||
| std::unordered_map<std::string, mindspore::MSTensor> row; | |||
| EXPECT_ERROR(iter->GetNextRow(&row)); | |||
| } | |||
| // Runs FrequencyMasking (iid_masks=true, frequency_mask_param=6) over 50 random 200x200 float32 rows and | |||
| // checks that every output row keeps its shape and data type. | |||
| TEST_F(MindDataTestPipeline, TestFrequencyMaskingPipeline) { | |||
|   MS_LOG(INFO) << "Doing TestFrequencyMasking Pipeline."; | |||
|   // Random input data | |||
|   std::shared_ptr<SchemaObj> schema = Schema(); | |||
|   ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {200, 200})); | |||
|   std::shared_ptr<Dataset> ds = RandomData(50, schema); | |||
|   EXPECT_NE(ds, nullptr); | |||
|   ds = ds->SetNumWorkers(4); | |||
|   EXPECT_NE(ds, nullptr); | |||
|   auto frequencymasking = audio::FrequencyMasking(true, 6); | |||
|   ds = ds->Map({frequencymasking}); | |||
|   EXPECT_NE(ds, nullptr); | |||
|   // Iterate over the data masked by FrequencyMasking | |||
|   std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
|   // Check the iterator that was just created, not the dataset again. | |||
|   EXPECT_NE(iter, nullptr); | |||
|   std::unordered_map<std::string, mindspore::MSTensor> row; | |||
|   ASSERT_OK(iter->GetNextRow(&row)); | |||
|   std::vector<int64_t> expected = {200, 200}; | |||
|   int i = 0; | |||
|   while (row.size() != 0) { | |||
|     auto col = row["inputData"]; | |||
|     ASSERT_EQ(col.Shape(), expected); | |||
|     ASSERT_EQ(col.Shape().size(), 2); | |||
|     ASSERT_EQ(col.DataType(), mindspore::DataType::kNumberTypeFloat32); | |||
|     ASSERT_OK(iter->GetNextRow(&row)); | |||
|     i++; | |||
|   } | |||
|   EXPECT_EQ(i, 50); | |||
|   iter->Stop(); | |||
| } | |||
| TEST_F(MindDataTestPipeline, TestFrequencyMaskingWrongArgs) { | |||
| MS_LOG(INFO) << "Doing TestFrequencyMasking with wrong args."; | |||
| // Random 20x20 float32 input; this test only checks that pipeline creation fails. | |||
| std::shared_ptr<SchemaObj> schema = Schema(); | |||
| ASSERT_OK(schema->add_column("inputData", mindspore::DataType::kNumberTypeFloat32, {20, 20})); | |||
| std::shared_ptr<Dataset> ds = RandomData(50, schema); | |||
| EXPECT_NE(ds, nullptr); | |||
| ds = ds->SetNumWorkers(4); | |||
| EXPECT_NE(ds, nullptr); | |||
| auto frequencymasking = audio::FrequencyMasking(true, -100); | |||
| ds = ds->Map({frequencymasking}); | |||
| EXPECT_NE(ds, nullptr); | |||
| // frequency_mask_param is negative (-100), so creating the iterator is expected to fail. | |||
| std::shared_ptr<Iterator> iter = ds->CreateIterator(); | |||
| // Expect failure: no iterator is returned. | |||
| EXPECT_EQ(iter, nullptr); | |||
| } | |||
| @@ -197,6 +197,19 @@ TEST_F(MindDataTestExecute, TestCrop) { | |||
| EXPECT_EQ(image.Shape()[1], 15); | |||
| } | |||
| // Eager-mode smoke test: FrequencyMasking(true, 2) applied to a 6x2 float tensor must return an OK status. | |||
| TEST_F(MindDataTestExecute, TestFrequencyMasking) { | |||
|   MS_LOG(INFO) << "Doing TestFrequencyMasking."; | |||
|   const std::vector<float> values({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}); | |||
|   TensorShape shape = TensorShape({6, 2}); | |||
|   std::shared_ptr<Tensor> de_tensor; | |||
|   ASSERT_OK(Tensor::CreateFromVector(values, shape, &de_tensor)); | |||
|   // Wrap the dataset tensor so it can be passed through the Execute API. | |||
|   auto ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor)); | |||
|   std::shared_ptr<TensorTransform> masking = std::make_shared<audio::FrequencyMasking>(true, 2); | |||
|   mindspore::dataset::Execute executor({masking}); | |||
|   // The same tensor serves as input and output. | |||
|   Status rc = executor(ms_tensor, &ms_tensor); | |||
|   EXPECT_TRUE(rc.IsOk()); | |||
| } | |||
| TEST_F(MindDataTestExecute, TestTimeMasking) { | |||
| MS_LOG(INFO) << "Doing TestTimeMasking."; | |||
| std::shared_ptr<Tensor> input_tensor_; | |||
| @@ -0,0 +1,137 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Testing FrequencyMasking op in DE. | |||
| """ | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.audio.transforms as atf | |||
| from mindspore import log as logger | |||
| CHANNEL = 2 | |||
| FREQ = 30 | |||
| TIME = 30 | |||
| def gen(shape): | |||
| np.random.seed(0) | |||
| data = np.random.random(shape) | |||
| yield(np.array(data, dtype=np.float32),) | |||
| def _count_unequal_element(data_expected, data_me, rtol, atol): | |||
| """ Precision calculation func """ | |||
| assert data_expected.shape == data_me.shape | |||
| total_count = len(data_expected.flatten()) | |||
| error = np.abs(data_expected - data_me) | |||
| greater = np.greater(error, atol + np.abs(data_expected) * rtol) | |||
| loss_count = np.count_nonzero(greater) | |||
| assert (loss_count / total_count) < rtol, \ | |||
| "\ndata_expected_std:{0}\ndata_me_error:{1}\nloss:{2}". \ | |||
| format(data_expected[greater], data_me[greater], error[greater]) | |||
| def allclose_nparray(data_expected, data_me, rtol, atol, equal_nan=True): | |||
| """ Precision calculation formula """ | |||
| if np.any(np.isnan(data_expected)): | |||
| assert np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan) | |||
| elif not np.allclose(data_me, data_expected, rtol, atol, equal_nan=equal_nan): | |||
| _count_unequal_element(data_expected, data_me, rtol, atol) | |||
| else: | |||
| assert True | |||
| def test_func_frequency_masking_eager_random_input(): | |||
| """Eager mode: FrequencyMasking(False, 3, 1, 10) on a random spectrogram keeps the input shape.""" | |||
| logger.info("test frequency_masking op") | |||
| spectrogram = next(gen((CHANNEL, FREQ, TIME)))[0] | |||
| out_put = atf.FrequencyMasking(False, 3, 1, 10)(spectrogram) | |||
| assert out_put.shape == (CHANNEL, FREQ, TIME) | |||
def test_func_frequency_masking_eager_precision():
    """
    Eager mode: FrequencyMasking output matches a hand-written benchmark.

    The op is built as FrequencyMasking(False, 2, 0, 0); the benchmark expects the
    first two rows of every channel to be replaced by 0 and the last row to be
    left untouched.
    """
    logger.info("test frequency_masking op")
    spectrogram = np.array([[[0.17274511, 0.85174704, 0.07162686, -0.45436913],
                             [-1.045921, -1.8204843, 0.62333095, -0.09532598],
                             [1.8175547, -0.25779432, -0.58152324, -0.00221091]],
                            [[-1.205032, 0.18922766, -0.5277673, -1.3090396],
                             [1.8914849, -0.97001046, -0.23726775, 0.00525892],
                             [-1.0271876, 0.33526883, 1.7413973, 0.12313101]]]).astype(np.float32)
    out_ms = atf.FrequencyMasking(False, 2, 0, 0)(spectrogram)
    out_benchmark = np.array([[[0.0, 0.0, 0.0, 0.0],
                               [0.0, 0.0, 0.0, 0.0],
                               [1.8175547, -0.25779432, -0.58152324, -0.00221091]],
                              [[0.0, 0.0, 0.0, 0.0],
                               [0.0, 0.0, 0.0, 0.0],
                               [-1.0271876, 0.33526883, 1.7413973, 0.12313101]]]).astype(np.float32)
    # allclose_nparray takes (data_expected, data_me, ...): the benchmark must come first
    # so the relative tolerance is scaled by the reference values and failure messages
    # label expected/actual correctly.
    allclose_nparray(out_benchmark, out_ms, 0.0001, 0.0001)
def test_func_frequency_masking_pipeline():
    """
    Pipeline mode: FrequencyMasking mapped over a GeneratorDataset keeps the input shape.

    Every produced row is checked, and the test fails explicitly if the pipeline
    yields no rows at all (otherwise the shape check would never run).
    """
    logger.info("test frequency_masking op, pipeline")
    generator = gen([CHANNEL, FREQ, TIME])
    data1 = ds.GeneratorDataset(source=generator, column_names=[
        "multi_dimensional_data"])
    transforms = [
        atf.FrequencyMasking(True, 8)
    ]
    data1 = data1.map(operations=transforms, input_columns=[
        "multi_dimensional_data"])
    num_rows = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        out_put = item["multi_dimensional_data"]
        assert out_put.shape == (CHANNEL, FREQ, TIME)
        num_rows += 1
    # Guard against a vacuous pass when the dataset produces nothing.
    assert num_rows > 0
def test_frequency_masking_invalid_input():
    """
    FrequencyMasking rejects invalid arguments.

    Two groups of cases are exercised in order: arguments rejected when the op is
    constructed, and arguments rejected only when the op is applied to a spectrogram.
    """

    def expect_construction_error(test_name, iid_masks, frequency_mask_param, mask_start, error, error_msg):
        """Building the op must raise `error` whose text contains `error_msg`."""
        logger.info("Test FrequencyMasking with wrong params: {0}".format(test_name))
        with pytest.raises(error) as error_info:
            atf.FrequencyMasking(iid_masks, frequency_mask_param, mask_start)
        # Checked after the context manager exits; error_info is only populated then.
        assert error_msg in str(error_info.value)

    def expect_execution_error(test_name, iid_masks, frequency_mask_param, mask_start, error, error_msg):
        """Applying the op to a (CHANNEL, FREQ, TIME) input must raise `error` containing `error_msg`."""
        logger.info("Test FrequencyMasking with wrong params: {0}".format(test_name))
        with pytest.raises(error) as error_info:
            spectrogram = next(gen((CHANNEL, FREQ, TIME)))[0]
            _ = atf.FrequencyMasking(iid_masks, frequency_mask_param, mask_start)(spectrogram)
        assert error_msg in str(error_info.value)

    construction_cases = (
        ("invalid mask_start", True, 2, -10, ValueError,
         "Input mask_start is not within the required interval of [0, 16777216]."),
        ("invalid mask_param", True, -2, 10, ValueError,
         "Input mask_param is not within the required interval of [0, 16777216]."),
        ("invalid iid_masks", "True", 2, 10, TypeError,
         "Argument iid_masks with value True is not of type [<class 'bool'>], but got <class 'str'>."),
    )
    execution_cases = (
        ("invalid mask_start", False, 2, 100, RuntimeError,
         "MaskAlongAxis: mask_start should be less than the length of chosen dimension."),
        ("invalid mask_width", False, 200, 2, RuntimeError,
         "FrequencyMasking: frequency_mask_param should be less than the length of frequency dimension."),
    )

    for case in construction_cases:
        expect_construction_error(*case)
    for case in execution_cases:
        expect_execution_error(*case)
if __name__ == "__main__":
    # Run every test in declaration order when the module is executed directly.
    for _test_case in (
            test_func_frequency_masking_eager_random_input,
            test_func_frequency_masking_eager_precision,
            test_func_frequency_masking_pipeline,
            test_frequency_masking_invalid_input,
    ):
        _test_case()
| @@ -125,9 +125,9 @@ def test_time_masking_invalid_input(): | |||
| "Argument iid_masks with value True is not of type [<class 'bool'>], but got <class 'str'>.") | |||
| test_invalid_input("invalid mask_start", False, 2, 100, RuntimeError, | |||
| "MaskAlongAxis: mask_start should be smaller than the length of chosen dim.") | |||
| "MaskAlongAxis: mask_start should be less than the length of chosen dimension.") | |||
| test_invalid_input("invalid mask_width", False, 200, 2, RuntimeError, | |||
| "TimeMasking: input time_mask_param should be smaller than the length of time dim.") | |||
| "TimeMasking: time_mask_param should be less than the length of time dimension.") | |||
| if __name__ == "__main__": | |||