
Add noise filtering on whisper to be able to use speakers (#847)

This PR makes it possible to use speakers when talking to an AI by voice. It works by
filtering the TTS output out of the STT input using fuzzy matching, so the assistant
does not transcribe its own speech.
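
As a rough sketch of the idea (not the exact code in this PR, which does word-level matching in dora_distil_whisper/main.py below), an echo of the last TTS output can be flagged with Python's standard difflib; the 0.8 threshold here is an illustrative assumption:

    # Hypothetical echo detector, for illustration only.
    from difflib import SequenceMatcher

    def is_tts_echo(transcript: str, last_tts_text: str, threshold: float = 0.8) -> bool:
        """Return True when the STT transcript looks like an echo of the TTS output."""
        a, b = transcript.lower().strip(), last_tts_text.lower().strip()
        if not a or not b:
            return False
        return SequenceMatcher(None, a, b).ratio() >= threshold

    assert is_tts_echo("i am here to help", "I am here to help.")
    assert not is_tts_echo("what time is it", "I am here to help.")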
Tag: v0.3.11-rc1
Haixuan Xavier Tao, 10 months ago
commit 844e94a6be
10 changed files with 255 additions and 83 deletions
  1. .github/workflows/ci.yml (+8, -2)
  2. .github/workflows/node_hub_test.sh (+10, -21)
  3. _typos.toml (+1, -0)
  4. apis/c++/node/build.rs (+20, -12)
  5. examples/llm/qwen-dev-interruption.yml (+59, -0)
  6. examples/llm/qwen-dev.yml (+1, -2)
  7. node-hub/dora-distil-whisper/dora_distil_whisper/main.py (+123, -24)
  8. node-hub/dora-distil-whisper/pyproject.toml (+11, -11)
  9. node-hub/dora-qwen/dora_qwen/main.py (+21, -9)
  10. node-hub/dora-reachy1/dora_reachy1/main.py (+1, -2)

.github/workflows/ci.yml (+8, -2)

@@ -491,6 +491,8 @@ jobs:
target: aarch64-unknown-linux-musl
- runner: ubuntu-22.04
target: armv7-unknown-linux-musleabihf
- runner: ubuntu-22.04
target: x86_64-pc-windows-gnu
- runner: macos-13
target: aarch64-apple-darwin
- runner: macos-13
@@ -501,9 +503,13 @@ jobs:
- uses: r7kamura/rust-problem-matchers@v1.1.0
- name: "Add toolchains"
run: rustup target add ${{ matrix.platform.target }}
- name: "Build"
- name: Install system-level dependencies
if: runner.target == 'x86_64-pc-windows-gnu'
run: |
sudo apt install g++-mingw-w64-x86-64 gcc-mingw-w64-x86-64
- name: "Check"
uses: actions-rs/cargo@v1
with:
use-cross: true
command: check
args: --target ${{ matrix.platform.target }} -p dora-cli
args: --target ${{ matrix.platform.target }} --all --exclude dora-node-api-python --exclude dora-operator-api-python --exclude dora-ros2-bridge-python

.github/workflows/node_hub_test.sh (+10, -21)

@@ -27,8 +27,8 @@ else
cargo test

pip install "maturin[zig]"
maturin build --zig --release
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
maturin build --zig
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel on multiple platforms
if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
# Free up ubuntu space
sudo apt-get clean
@@ -37,29 +37,18 @@ else
sudo rm -rf /opt/ghc/

maturin publish --skip-existing --zig
fi

# aarch64-unknown-linux-gnu
rustup target add aarch64-unknown-linux-gnu
maturin build --target aarch64-unknown-linux-gnu --zig --release
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
# aarch64-unknown-linux-gnu
rustup target add aarch64-unknown-linux-gnu
maturin publish --target aarch64-unknown-linux-gnu --skip-existing --zig
fi
# armv7-unknown-linux-musleabihf
rustup target add armv7-unknown-linux-musleabihf
maturin build --target armv7-unknown-linux-musleabihf --zig --release
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
# armv7-unknown-linux-musleabihf
rustup target add armv7-unknown-linux-musleabihf
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
maturin publish --target armv7-unknown-linux-musleabihf --skip-existing --zig
fi

# x86_64-pc-windows-gnu
rustup target add x86_64-pc-windows-gnu
maturin build --target x86_64-pc-windows-gnu --release
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
# x86_64-pc-windows-gnu
rustup target add x86_64-pc-windows-gnu
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
maturin publish --target x86_64-pc-windows-gnu --skip-existing
fi



_typos.toml (+1, -0)

@@ -1,3 +1,4 @@
[default.extend-identifiers]
# *sigh* this just isn't worth the cost of fixing
DeviceNDArray = "DeviceNDArray"
Feedforward_2nd_Gain = "Feedforward_2nd_Gain"

apis/c++/node/build.rs (+20, -12)

@@ -9,10 +9,7 @@ fn main() {
println!("cargo:rerun-if-changed=src/lib.rs");

// rename header files
let src_dir = target_dir()
.join("cxxbridge")
.join("dora-node-api-cxx")
.join("src");
let src_dir = origin_dir();
let target_dir = src_dir.parent().unwrap();
std::fs::copy(src_dir.join("lib.rs.h"), target_dir.join("dora-node-api.h")).unwrap();
std::fs::copy(
@@ -28,8 +25,8 @@ fn main() {
bridge_files.clear();
}

fn target_dir() -> PathBuf {
std::env::var("CARGO_TARGET_DIR")
fn origin_dir() -> PathBuf {
let default_target = std::env::var("CARGO_TARGET_DIR")
.map(PathBuf::from)
.unwrap_or_else(|_| {
let root = Path::new(env!("CARGO_MANIFEST_DIR"))
@@ -37,12 +34,26 @@ fn target_dir() -> PathBuf {
.nth(3)
.unwrap();
root.join("target")
})
});
let cross_target = default_target
.join(std::env::var("TARGET").unwrap())
.join("cxxbridge")
.join("dora-node-api-cxx")
.join("src");

if cross_target.exists() {
cross_target
} else {
default_target
.join("cxxbridge")
.join("dora-node-api-cxx")
.join("src")
}
}

#[cfg(feature = "ros2-bridge")]
mod ros2 {
use super::target_dir;
use super::origin_dir;
use std::{
io::{BufRead, BufReader},
path::{Component, Path, PathBuf},
@@ -113,10 +124,7 @@ mod ros2 {
.join("ros2_bindings.rs.cc");

// copy message files to target directory
let target_path = target_dir()
.join("cxxbridge")
.join("dora-node-api-cxx")
.join("dora-ros2-bindings.h");
let target_path = origin_dir().parent().unwrap().join("dora-ros2-bindings.h");

std::fs::copy(&header_path, &target_path).unwrap();
println!("cargo:rerun-if-changed={}", header_path.display());


examples/llm/qwen-dev-interruption.yml (+59, -0)

@@ -0,0 +1,59 @@
nodes:
- id: dora-microphone
build: pip install -e ../../node-hub/dora-microphone
path: dora-microphone
inputs:
tick: dora/timer/millis/2000
outputs:
- audio

- id: dora-vad
build: pip install -e ../../node-hub/dora-vad
path: dora-vad
inputs:
audio: dora-microphone/audio
outputs:
- audio
- timestamp_start

- id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper
inputs:
input: dora-vad/audio
outputs:
- text
env:
TARGET_LANGUAGE: english

- id: dora-qwen
build: pip install -e ../../node-hub/dora-qwen
path: dora-qwen
inputs:
text: dora-distil-whisper/text
outputs:
- text

- id: plot
build: pip install -e ../../node-hub/dora-rerun
path: dora-rerun
inputs:
text_qwen: dora-qwen/text
text_whisper: dora-distil-whisper/text

- id: dora-kokoro-tts
build: pip install -e ../../node-hub/dora-kokoro-tts
path: dora-kokoro-tts
inputs:
text: dora-qwen/text
outputs:
- audio
env:
ACTIVATION_WORDS: you

- id: dora-pyaudio
build: pip install -e ../../node-hub/dora-pyaudio
path: dora-pyaudio
inputs:
audio: dora-kokoro-tts/audio
timestamp_start: dora-vad/timestamp_start
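
For reference, the timestamp_start output of dora-vad tells dora-pyaudio when the user started speaking, so playback of the assistant's reply can be cut short. A minimal consumer sketch, assuming the dora Python API used by the nodes in this diff; play and stop_playback are hypothetical stand-ins, not dora-pyaudio functions:

    import time

    from dora import Node  # same API as the nodes in this diff

    def stop_playback():
        """Hypothetical stand-in for cutting the speaker output."""

    def play(samples):
        """Hypothetical stand-in for queueing samples on the speaker."""

    def main():
        node = Node()
        playback_started_at = None  # wall-clock time the current TTS clip started

        for event in node:
            if event["type"] != "INPUT":
                continue
            if event["id"] == "timestamp_start":
                # Speech began while a reply was playing: treat it as an interruption.
                if playback_started_at is not None:
                    stop_playback()
                    playback_started_at = None
            elif event["id"] == "audio":
                playback_started_at = time.time()
                play(event["value"].to_numpy())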

examples/llm/qwen-dev.yml (+1, -2)

@@ -14,12 +14,12 @@ nodes:
audio: dora-microphone/audio
outputs:
- audio
- timestamp_start

- id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper
inputs:
text_noise: dora-qwen/text
input: dora-vad/audio
outputs:
- text
@@ -56,4 +56,3 @@ nodes:
path: dora-pyaudio
inputs:
audio: dora-kokoro-tts/audio
timestamp_start: dora-vad/timestamp_start

node-hub/dora-distil-whisper/dora_distil_whisper/main.py (+123, -24)

@@ -1,7 +1,9 @@
"""TODO: Add docstring."""

import os
import re
import sys
import time
from pathlib import Path

import pyarrow as pa
@@ -13,6 +15,79 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])


def remove_text_noise(text: str, text_noise="") -> str:
"""Remove noise from text.

Args:
text (str): Original text
text_noise (str): text to remove from the original text

Returns:
str: Cleaned text

"""
# Handle the case where text_noise is empty
if not text_noise.strip():
return (
text # Return the original text if text_noise is empty or just whitespace
)

# Helper function to normalize text (remove punctuation, make lowercase, and handle hyphens)
def normalize(s):
# Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent
s = re.sub(r"-", " ", s)
# Remove other punctuation and convert to lowercase
s = re.sub(r"[^\w\s]", "", s).lower()
return s

# Normalize both text and text_noise
normalized_text = normalize(text)
normalized_noise = normalize(text_noise)

# Split into words
text_words = normalized_text.split()
noise_words = normalized_noise.split()

# Function to find and remove noise sequence flexibly
def remove_flexible(text_list, noise_list):
i = 0
while i <= len(text_list) - len(noise_list):
match = True
extra_words = 0
for j, noise_word in enumerate(noise_list):
if i + j + extra_words >= len(text_list):
match = False
break
# Allow skipping extra words in text_list
while (
i + j + extra_words < len(text_list)
and text_list[i + j + extra_words] != noise_word
):
extra_words += 1
if i + j + extra_words >= len(text_list):
match = False
break
if not match:
break
if match:
# Remove matched part
del text_list[i : i + len(noise_list) + extra_words]
i = max(0, i - len(noise_list)) # Adjust index after removal
else:
i += 1
return text_list

# Only remove parts of text_noise that are found in text
cleaned_words = text_words[:]
for noise_word in noise_words:
if noise_word in cleaned_words:
cleaned_words.remove(noise_word)

# Reconstruct the cleaned text
cleaned_text = " ".join(cleaned_words)
return cleaned_text


def load_model():
"""TODO: Add docstring."""
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
@@ -69,6 +144,7 @@ BAD_SENTENCES = [
" Sous-titrage Société Radio-Canada",
" Sous",
" Sous-",
" i'm going to go to the next one.",
]


@@ -109,36 +185,59 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
def main():
"""TODO: Add docstring."""
node = Node()

text_noise = ""
noise_timestamp = time.time()
# For macos use mlx:
if sys.platform != "darwin":
pipe = load_model()

for event in node:
if event["type"] == "INPUT":
audio = event["value"].to_numpy()
confg = (
{"language": TARGET_LANGUAGE, "task": "translate"}
if TRANSLATE
else {
"language": TARGET_LANGUAGE,
}
)
if sys.platform == "darwin":
import mlx_whisper

result = mlx_whisper.transcribe(
audio,
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
append_punctuations=".",
if "text_noise" in event["id"]:
text_noise = event["value"][0].as_py()
text_noise = (
text_noise.replace("(", "")
.replace(")", "")
.replace("[", "")
.replace("]", "")
)
noise_timestamp = time.time()
else:
result = pipe(
audio,
generate_kwargs=confg,
audio = event["value"].to_numpy()
confg = (
{"language": TARGET_LANGUAGE, "task": "translate"}
if TRANSLATE
else {
"language": TARGET_LANGUAGE,
}
)
if sys.platform == "darwin":
import mlx_whisper

result = mlx_whisper.transcribe(
audio,
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
append_punctuations=".",
)

else:
result = pipe(
audio,
generate_kwargs=confg,
)
if result["text"] in BAD_SENTENCES:
continue
text = cut_repetition(result["text"])

# Remove noise filter after some time
if time.time() - noise_timestamp > (len(text_noise.split()) / 2): # WPS
text_noise = ""

## Remove text noise independently of casing
text = remove_text_noise(text, text_noise)

if text.strip() == "" or text.strip() == ".":
continue
node.send_output(
"text", pa.array([text]), {"language": TARGET_LANGUAGE}
)
if result["text"] in BAD_SENTENCES:
continue
text = cut_repetition(result["text"])
node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE})

node-hub/dora-distil-whisper/pyproject.toml (+11, -11)

@@ -2,8 +2,8 @@
name = "dora-distil-whisper"
version = "0.3.10"
authors = [
{ name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
{ name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
{ name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
{ name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
]
description = "Dora dora-distil-whisper"
license = { text = "MIT" }
@@ -11,14 +11,14 @@ readme = "README.md"
requires-python = ">=3.8"

dependencies = [
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"pyarrow >= 5.0.0",
"transformers >= 4.0.0",
"accelerate >= 0.29.2",
"torch >= 2.2.0",
"modelscope >= 1.18.1",
"mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"pyarrow >= 5.0.0",
"transformers >= 4.0.0",
"accelerate >= 0.29.2",
"torch >= 2.2.0",
"modelscope >= 1.18.1",
"mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
]


@@ -30,5 +30,5 @@ dora-distil-whisper = "dora_distil_whisper.main:main"

[tool.ruff.lint]
extend-select = [
"D", # pydocstyle
"D", # pydocstyle
]

node-hub/dora-qwen/dora_qwen/main.py (+21, -9)

@@ -18,7 +18,9 @@ def get_model_gguf():
from llama_cpp import Llama

llm = Llama.from_pretrained(
repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False,
repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
filename="*fp16.gguf",
verbose=False,
)
return llm

@@ -36,7 +38,9 @@ def get_model_huggingface():
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype="auto", device_map="auto",
model_name,
torch_dtype="auto",
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
return model, tokenizer
@@ -49,7 +53,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str:
"""TODO: Add docstring."""
history += [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(
history, tokenize=False, add_generation_prompt=True,
history,
tokenize=False,
add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=512)
@@ -66,9 +72,9 @@ def main():
"""TODO: Add docstring."""
history = []
# If OS is not Darwin, use Huggingface model
if sys.platform != "":
if sys.platform == "darwin":
model = get_model_gguf()
elif sys.platform == "huggingface":
elif sys.platform == "linux":
model, tokenizer = get_model_huggingface()
else:
model, tokenizer = get_model_darwin()
@@ -83,7 +89,7 @@ def main():

if any(word in ACTIVATION_WORDS for word in words):
# On linux, Windows
if sys.platform != "":
if sys.platform == "darwin":
response = model(
f"Q: {text} A: ", # Prompt
max_tokens=24,
@@ -92,17 +98,23 @@ def main():
"\n",
], # Stop generating just before the model would generate a new question
)["choices"][0]["text"]
elif sys.platform == "huggingface":
elif sys.platform == "linux":
response, history = generate_hf(model, tokenizer, text, history)
else:
from mlx_lm import generate

response = generate(
model, tokenizer, prompt=text, verbose=False, max_tokens=50,
model,
tokenizer,
prompt=text,
verbose=False,
max_tokens=50,
)

node.send_output(
output_id="text", data=pa.array([response]), metadata={},
output_id="text",
data=pa.array([response]),
metadata={},
)
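
The corrected branches compare against real sys.platform values: "darwin" on macOS, "linux" on Linux, "win32" on Windows (the old comparisons against "huggingface" and "" did not correspond to real platform names). A minimal sketch of the dispatch, with backend labels mirroring the helpers in this file:

    import sys

    # sys.platform values: "darwin" (macOS), "linux", "win32" (Windows).
    if sys.platform == "darwin":
        backend = "llama-cpp GGUF"  # get_model_gguf()
    elif sys.platform == "linux":
        backend = "transformers"    # get_model_huggingface()
    else:
        backend = "mlx"             # get_model_darwin()
    print(f"{sys.platform}: {backend}")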




node-hub/dora-reachy1/dora_reachy1/main.py (+1, -2)

@@ -47,7 +47,6 @@ def sad_antennas(reachy):


def main():

node = Node()

ROBOT_IP = os.getenv("ROBOT_IP", "10.42.0.24")
@@ -109,7 +108,7 @@ def main():
reachy.joints.r_gripper.goal_position = goal
time.sleep(0.02)

# When openning the gripper always go to default pose
# When opening the gripper always go to default pose
if action == -100:
goto(
{

