Add noise filtering on whisper to be able to use speakers

1 year ago · 0e3f89ac59
--- a/examples/llm/qwen-dev-interruption.yml
+++ b/examples/llm/qwen-dev-interruption.yml
@@ -0,0 +1,59 @@
 # Speech-to-speech dataflow:
 #   dora-microphone -> dora-vad -> dora-distil-whisper -> dora-qwen
 #   -> dora-kokoro-tts -> dora-pyaudio
 # The `plot` node (dora-rerun) additionally receives both text streams.
 nodes:
  # Audio source: emits `audio`; its `tick` input is bound to
  # dora/timer/millis/2000.
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    inputs:
      tick: dora/timer/millis/2000
    outputs:
      - audio

  # Consumes the microphone audio and emits `audio` plus `timestamp_start`
  # (the latter is forwarded to dora-pyaudio below).
  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio
      - timestamp_start

  # Receives the dora-vad audio as `input` and emits `text`.
  - id: dora-distil-whisper
    build: pip install -e ../../node-hub/dora-distil-whisper
    path: dora-distil-whisper
    inputs:
      input: dora-vad/audio
    outputs:
      - text
    env:
      TARGET_LANGUAGE: english

  # Receives the whisper text and emits its own `text` output.
  - id: dora-qwen
    build: pip install -e ../../node-hub/dora-qwen
    path: dora-qwen
    inputs:
      text: dora-distil-whisper/text
    outputs:
      - text

  # dora-rerun node fed with both the whisper text and the qwen text.
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      text_qwen: dora-qwen/text
      text_whisper: dora-distil-whisper/text

  # Receives the qwen text and emits `audio`.
  # NOTE(review): ACTIVATION_WORDS is set on this node's environment, while
  # dora-qwen's main.py is the code seen checking ACTIVATION_WORDS - confirm
  # this is the node that is meant to read it.
  - id: dora-kokoro-tts
    build: pip install -e ../../node-hub/dora-kokoro-tts
    path: dora-kokoro-tts
    inputs:
      text: dora-qwen/text
    outputs:
      - audio
    env:
      ACTIVATION_WORDS: you

  # Sink: receives the TTS audio and the VAD `timestamp_start`.
  # Presumably `timestamp_start` is what lets playback be interrupted when new
  # speech starts (per the file name) - TODO confirm against dora-pyaudio.
  - id: dora-pyaudio
    build: pip install -e ../../node-hub/dora-pyaudio
    path: dora-pyaudio
    inputs:
      audio: dora-kokoro-tts/audio
      timestamp_start: dora-vad/timestamp_start
--- a/examples/llm/qwen-dev.yml
+++ b/examples/llm/qwen-dev.yml
@@ -14,12 +14,12 @@ nodes:
      audio: dora-microphone/audio
    outputs:
      - audio
      - timestamp_start

  - id: dora-distil-whisper
    build: pip install -e ../../node-hub/dora-distil-whisper
    path: dora-distil-whisper
    inputs:
      text_noise: dora-qwen/text
      input: dora-vad/audio
    outputs:
      - text
@@ -56,4 +56,3 @@ nodes:
    path: dora-pyaudio
    inputs:
      audio: dora-kokoro-tts/audio
      timestamp_start: dora-vad/timestamp_start
--- a/node-hub/dora-distil-whisper/dora_distil_whisper/main.py
+++ b/node-hub/dora-distil-whisper/dora_distil_whisper/main.py
@@ -1,7 +1,9 @@
 """TODO: Add docstring."""

 import os
 import re
 import sys
 import time
 from pathlib import Path

 import pyarrow as pa
@@ -13,6 +15,69 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
 TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])


 def remove_text_noise(text, text_noise):
    """Remove the words of ``text_noise`` from ``text``.

    Both strings are normalized before comparison: hyphens become spaces,
    every other punctuation character is dropped and everything is
    lower-cased, so "Notre-Dame" matches "notre dame". Each word of
    ``text_noise`` removes at most one occurrence (the first one) from
    ``text``; noise words that do not appear in ``text`` are ignored.

    Args:
        text: Transcribed text to clean.
        text_noise: Text whose words should be filtered out of ``text``.

    Returns:
        ``text`` unchanged when ``text_noise`` is empty or whitespace only.
        Otherwise the remaining words joined by single spaces; note that this
        result is in normalized form (lower-case, no punctuation), and may be
        the empty string when every word was filtered out.

    """
    # No filter active: hand the transcription back untouched.
    if not text_noise.strip():
        return text

    def normalize(s):
        # Replace hyphens with spaces to treat "Notre-Dame" and "notre dame"
        # as equivalent.
        s = re.sub(r"-", " ", s)
        # Remove the remaining punctuation and convert to lowercase.
        return re.sub(r"[^\w\s]", "", s).lower()

    cleaned_words = normalize(text).split()

    # Only remove the noise words that are actually found in the text; each
    # noise word consumes a single occurrence, so repeated words in the text
    # survive unless they are repeated in the noise as well.
    for noise_word in normalize(text_noise).split():
        if noise_word in cleaned_words:
            cleaned_words.remove(noise_word)

    return " ".join(cleaned_words)


 def load_model():
    """TODO: Add docstring."""
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
@@ -69,6 +134,7 @@ BAD_SENTENCES = [
    " Sous-titrage Société Radio-Canada",
    " Sous",
    " Sous-",
    " i'm going to go to the next one.",
 ]


@@ -109,36 +175,59 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
 def main():
    """TODO: Add docstring."""
    node = Node()

    text_noise = ""
    noise_timestamp = time.time()
    # For macos use mlx:
    if sys.platform != "darwin":
        pipe = load_model()

    for event in node:
        if event["type"] == "INPUT":
            audio = event["value"].to_numpy()
            confg = (
                {"language": TARGET_LANGUAGE, "task": "translate"}
                if TRANSLATE
                else {
                    "language": TARGET_LANGUAGE,
                }
            )
            if sys.platform == "darwin":
                import mlx_whisper

                result = mlx_whisper.transcribe(
                    audio,
                    path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
                    append_punctuations=".",
            if "text_noise" in event["id"]:
                text_noise = event["value"][0].as_py()
                text_noise = (
                    text_noise.replace("(", "")
                    .replace(")", "")
                    .replace("[", "")
                    .replace("]", "")
                )

                noise_timestamp = time.time()
            else:
                result = pipe(
                    audio,
                    generate_kwargs=confg,
                audio = event["value"].to_numpy()
                confg = (
                    {"language": TARGET_LANGUAGE, "task": "translate"}
                    if TRANSLATE
                    else {
                        "language": TARGET_LANGUAGE,
                    }
                )
                if sys.platform == "darwin":
                    import mlx_whisper

                    result = mlx_whisper.transcribe(
                        audio,
                        path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
                        append_punctuations=".",
                    )

                else:
                    result = pipe(
                        audio,
                        generate_kwargs=confg,
                    )
                if result["text"] in BAD_SENTENCES:
                    continue
                text = cut_repetition(result["text"])

                # Remove noise filter after some time
                if time.time() - noise_timestamp > (len(text_noise.split()) / 2):  # WPS
                    text_noise = ""

                ## Remove text noise independently of casing
                text = remove_text_noise(text, text_noise)

                if text.strip() == "" or text.strip() == ".":
                    continue
                node.send_output(
                    "text", pa.array([text]), {"language": TARGET_LANGUAGE}
                )
            if result["text"] in BAD_SENTENCES:
                continue
            text = cut_repetition(result["text"])
            node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE})
--- a/node-hub/dora-distil-whisper/pyproject.toml
+++ b/node-hub/dora-distil-whisper/pyproject.toml
@@ -2,8 +2,8 @@
 name = "dora-distil-whisper"
 version = "0.3.10"
 authors = [
    { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
    { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
  { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
  { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
 ]
 description = "Dora dora-distil-whisper"
 license = { text = "MIT" }
@@ -11,14 +11,14 @@ readme = "README.md"
 requires-python = ">=3.8"

 dependencies = [
    "dora-rs >= 0.3.9",
    "numpy < 2.0.0",
    "pyarrow >= 5.0.0",
    "transformers >= 4.0.0",
    "accelerate >= 0.29.2",
    "torch >= 2.2.0",
    "modelscope >= 1.18.1",
    "mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
  "dora-rs >= 0.3.9",
  "numpy < 2.0.0",
  "pyarrow >= 5.0.0",
  "transformers >= 4.0.0",
  "accelerate >= 0.29.2",
  "torch >= 2.2.0",
  "modelscope >= 1.18.1",
  "mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
 ]


@@ -30,5 +30,5 @@ dora-distil-whisper = "dora_distil_whisper.main:main"

 [tool.ruff.lint]
 extend-select = [
  "D",   # pydocstyle
  "D", # pydocstyle
 ]
--- a/node-hub/dora-qwen/dora_qwen/main.py
+++ b/node-hub/dora-qwen/dora_qwen/main.py
@@ -18,7 +18,9 @@ def get_model_gguf():
    from llama_cpp import Llama

    llm = Llama.from_pretrained(
        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False,
        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
        filename="*fp16.gguf",
        verbose=False,
    )
    return llm

@@ -36,7 +38,9 @@ def get_model_huggingface():
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"

    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="auto",
        model_name,
        torch_dtype="auto",
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer
@@ -49,7 +53,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str:
    """TODO: Add docstring."""
    history += [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        history, tokenize=False, add_generation_prompt=True,
        history,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)
@@ -66,9 +72,9 @@ def main():
    """TODO: Add docstring."""
    history = []
    # If OS is not Darwin, use Huggingface model
    if sys.platform != "":
    if sys.platform == "darwin":
        model = get_model_gguf()
    elif sys.platform == "huggingface":
    elif sys.platform == "linux":
        model, tokenizer = get_model_huggingface()
    else:
        model, tokenizer = get_model_darwin()
@@ -83,7 +89,7 @@ def main():

            if any(word in ACTIVATION_WORDS for word in words):
                # On linux, Windows
                if sys.platform != "":
                if sys.platform == "darwin":
                    response = model(
                        f"Q: {text} A: ",  # Prompt
                        max_tokens=24,
@@ -92,17 +98,23 @@ def main():
                            "\n",
                        ],  # Stop generating just before the model would generate a new question
                    )["choices"][0]["text"]
                elif sys.platform == "huggingface":
                elif sys.platform == "linux":
                    response, history = generate_hf(model, tokenizer, text, history)
                else:
                    from mlx_lm import generate

                    response = generate(
                        model, tokenizer, prompt=text, verbose=False, max_tokens=50,
                        model,
                        tokenizer,
                        prompt=text,
                        verbose=False,
                        max_tokens=50,
                    )

                node.send_output(
                    output_id="text", data=pa.array([response]), metadata={},
                    output_id="text",
                    data=pa.array([response]),
                    metadata={},
                )