diff --git a/examples/llm/qwen-dev-interruption.yml b/examples/llm/qwen-dev-interruption.yml
new file mode 100755
index 00000000..128466c5
--- /dev/null
+++ b/examples/llm/qwen-dev-interruption.yml
@@ -0,0 +1,59 @@
+nodes:
+  - id: dora-microphone
+    build: pip install -e ../../node-hub/dora-microphone
+    path: dora-microphone
+    inputs:
+      tick: dora/timer/millis/2000
+    outputs:
+      - audio
+
+  - id: dora-vad
+    build: pip install -e ../../node-hub/dora-vad
+    path: dora-vad
+    inputs:
+      audio: dora-microphone/audio
+    outputs:
+      - audio
+      - timestamp_start
+
+  - id: dora-distil-whisper
+    build: pip install -e ../../node-hub/dora-distil-whisper
+    path: dora-distil-whisper
+    inputs:
+      input: dora-vad/audio
+    outputs:
+      - text
+    env:
+      TARGET_LANGUAGE: english
+
+  - id: dora-qwen
+    build: pip install -e ../../node-hub/dora-qwen
+    path: dora-qwen
+    inputs:
+      text: dora-distil-whisper/text
+    outputs:
+      - text
+
+  - id: plot
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    inputs:
+      text_qwen: dora-qwen/text
+      text_whisper: dora-distil-whisper/text
+
+  - id: dora-kokoro-tts
+    build: pip install -e ../../node-hub/dora-kokoro-tts
+    path: dora-kokoro-tts
+    inputs:
+      text: dora-qwen/text
+    outputs:
+      - audio
+    env:
+      ACTIVATION_WORDS: you
+
+  - id: dora-pyaudio
+    build: pip install -e ../../node-hub/dora-pyaudio
+    path: dora-pyaudio
+    inputs:
+      audio: dora-kokoro-tts/audio
+      timestamp_start: dora-vad/timestamp_start
diff --git a/examples/llm/qwen-dev.yml b/examples/llm/qwen-dev.yml
index 128466c5..d7860394 100755
--- a/examples/llm/qwen-dev.yml
+++ b/examples/llm/qwen-dev.yml
@@ -14,12 +14,12 @@ nodes:
       audio: dora-microphone/audio
     outputs:
       - audio
-      - timestamp_start
 
   - id: dora-distil-whisper
     build: pip install -e ../../node-hub/dora-distil-whisper
     path: dora-distil-whisper
     inputs:
+      text_noise: dora-qwen/text
       input: dora-vad/audio
     outputs:
       - text
@@ -56,4 +56,3 @@ nodes:
     path: dora-pyaudio
     inputs:
       audio: dora-kokoro-tts/audio
-      timestamp_start: dora-vad/timestamp_start
diff --git a/node-hub/dora-distil-whisper/dora_distil_whisper/main.py b/node-hub/dora-distil-whisper/dora_distil_whisper/main.py
index eb3c7df7..682e1884 100644
--- a/node-hub/dora-distil-whisper/dora_distil_whisper/main.py
+++ b/node-hub/dora-distil-whisper/dora_distil_whisper/main.py
@@ -1,7 +1,9 @@
 """TODO: Add docstring."""
 
 import os
+import re
 import sys
+import time
 from pathlib import Path
 
 import pyarrow as pa
@@ -13,6 +15,69 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
 TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])
 
 
+def remove_text_noise(text, text_noise):
+    # Handle the case where text_noise is empty
+    if not text_noise.strip():
+        return (
+            text  # Return the original text if text_noise is empty or just whitespace
+        )
+
+    # Helper function to normalize text (remove punctuation, make lowercase, and handle hyphens)
+    def normalize(s):
+        # Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent
+        s = re.sub(r"-", " ", s)
+        # Remove other punctuation and convert to lowercase
+        s = re.sub(r"[^\w\s]", "", s).lower()
+        return s
+
+    # Normalize both text and text_noise
+    normalized_text = normalize(text)
+    normalized_noise = normalize(text_noise)
+
+    # Split into words
+    text_words = normalized_text.split()
+    noise_words = normalized_noise.split()
+
+    # Function to find and remove noise sequence flexibly
+    def remove_flexible(text_list, noise_list):
+        i = 0
+        while i <= len(text_list) - len(noise_list):
+            match = True
+            extra_words = 0
+            for j, noise_word in enumerate(noise_list):
+                if i + j + extra_words >= len(text_list):
+                    match = False
+                    break
+                # Allow skipping extra words in text_list
+                while (
+                    i + j + extra_words < len(text_list)
+                    and text_list[i + j + extra_words] != noise_word
+                ):
+                    extra_words += 1
+                    if i + j + extra_words >= len(text_list):
+                        match = False
+                        break
+                if not match:
+                    break
+            if match:
+                # Remove matched part
+                del text_list[i : i + len(noise_list) + extra_words]
+                i = max(0, i - len(noise_list))  # Adjust index after removal
+            else:
+                i += 1
+        return text_list
+
+    # Only remove parts of text_noise that are found in text
+    cleaned_words = text_words[:]
+    for noise_word in noise_words:
+        if noise_word in cleaned_words:
+            cleaned_words.remove(noise_word)
+
+    # Reconstruct the cleaned text
+    cleaned_text = " ".join(cleaned_words)
+    return cleaned_text
+
+
 def load_model():
     """TODO: Add docstring."""
     from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
@@ -69,6 +134,7 @@ BAD_SENTENCES = [
     " Sous-titrage Société Radio-Canada",
     " Sous",
     " Sous-",
+    " i'm going to go to the next one.",
 ]
 
 
@@ -109,36 +175,59 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
 def main():
     """TODO: Add docstring."""
     node = Node()
-
+    text_noise = ""
+    noise_timestamp = time.time()
     # For macos use mlx:
     if sys.platform != "darwin":
         pipe = load_model()
     for event in node:
         if event["type"] == "INPUT":
-            audio = event["value"].to_numpy()
-            confg = (
-                {"language": TARGET_LANGUAGE, "task": "translate"}
-                if TRANSLATE
-                else {
-                    "language": TARGET_LANGUAGE,
-                }
-            )
-            if sys.platform == "darwin":
-                import mlx_whisper
-
-                result = mlx_whisper.transcribe(
-                    audio,
-                    path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
-                    append_punctuations=".",
+            if "text_noise" in event["id"]:
+                text_noise = event["value"][0].as_py()
+                text_noise = (
+                    text_noise.replace("(", "")
+                    .replace(")", "")
+                    .replace("[", "")
+                    .replace("]", "")
                 )
-
+                noise_timestamp = time.time()
             else:
-                result = pipe(
-                    audio,
-                    generate_kwargs=confg,
+                audio = event["value"].to_numpy()
+                confg = (
+                    {"language": TARGET_LANGUAGE, "task": "translate"}
+                    if TRANSLATE
+                    else {
+                        "language": TARGET_LANGUAGE,
+                    }
                 )
+                if sys.platform == "darwin":
+                    import mlx_whisper
+
+                    result = mlx_whisper.transcribe(
+                        audio,
+                        path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
+                        append_punctuations=".",
+                    )
+
+                else:
+                    result = pipe(
+                        audio,
+                        generate_kwargs=confg,
+                    )
+                if result["text"] in BAD_SENTENCES:
+                    continue
+                text = cut_repetition(result["text"])
+
+                # Remove noise filter after some time
+                if time.time() - noise_timestamp > (len(text_noise.split()) / 2):  # assumes ~2 words/s
+                    text_noise = ""
+
+                # Remove text noise independently of casing
+                text = remove_text_noise(text, text_noise)
+
+                if text.strip() == "" or text.strip() == ".":
+                    continue
+                node.send_output(
+                    "text", pa.array([text]), {"language": TARGET_LANGUAGE}
                 )
-            if result["text"] in BAD_SENTENCES:
-                continue
-            text = cut_repetition(result["text"])
-            node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE})
 
diff --git a/node-hub/dora-distil-whisper/pyproject.toml b/node-hub/dora-distil-whisper/pyproject.toml
index 87047d6f..7cabce79 100644
--- a/node-hub/dora-distil-whisper/pyproject.toml
+++ b/node-hub/dora-distil-whisper/pyproject.toml
@@ -2,8 +2,8 @@
 name = "dora-distil-whisper"
 version = "0.3.10"
 authors = [
-    { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
-    { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
+  { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
"tao.xavier@outlook.com" }, + { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" }, ] description = "Dora dora-distil-whisper" license = { text = "MIT" } @@ -11,14 +11,14 @@ readme = "README.md" requires-python = ">=3.8" dependencies = [ - "dora-rs >= 0.3.9", - "numpy < 2.0.0", - "pyarrow >= 5.0.0", - "transformers >= 4.0.0", - "accelerate >= 0.29.2", - "torch >= 2.2.0", - "modelscope >= 1.18.1", - "mlx-whisper >= 0.4.1; sys_platform == 'darwin'", + "dora-rs >= 0.3.9", + "numpy < 2.0.0", + "pyarrow >= 5.0.0", + "transformers >= 4.0.0", + "accelerate >= 0.29.2", + "torch >= 2.2.0", + "modelscope >= 1.18.1", + "mlx-whisper >= 0.4.1; sys_platform == 'darwin'", ] @@ -30,5 +30,5 @@ dora-distil-whisper = "dora_distil_whisper.main:main" [tool.ruff.lint] extend-select = [ - "D", # pydocstyle + "D", # pydocstyle ] diff --git a/node-hub/dora-qwen/dora_qwen/main.py b/node-hub/dora-qwen/dora_qwen/main.py index 348666df..71ef92b5 100644 --- a/node-hub/dora-qwen/dora_qwen/main.py +++ b/node-hub/dora-qwen/dora_qwen/main.py @@ -18,7 +18,9 @@ def get_model_gguf(): from llama_cpp import Llama llm = Llama.from_pretrained( - repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False, + repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", + filename="*fp16.gguf", + verbose=False, ) return llm @@ -36,7 +38,9 @@ def get_model_huggingface(): model_name = "Qwen/Qwen2.5-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", device_map="auto", + model_name, + torch_dtype="auto", + device_map="auto", ) tokenizer = AutoTokenizer.from_pretrained(model_name) return model, tokenizer @@ -49,7 +53,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str: """TODO: Add docstring.""" history += [{"role": "user", "content": prompt}] text = tokenizer.apply_chat_template( - history, tokenize=False, add_generation_prompt=True, + history, + tokenize=False, + add_generation_prompt=True, ) model_inputs = tokenizer([text], return_tensors="pt").to(model.device) generated_ids = model.generate(**model_inputs, max_new_tokens=512) @@ -66,9 +72,9 @@ def main(): """TODO: Add docstring.""" history = [] # If OS is not Darwin, use Huggingface model - if sys.platform != "": + if sys.platform == "darwin": model = get_model_gguf() - elif sys.platform == "huggingface": + elif sys.platform == "linux": model, tokenizer = get_model_huggingface() else: model, tokenizer = get_model_darwin() @@ -83,7 +89,7 @@ def main(): if any(word in ACTIVATION_WORDS for word in words): # On linux, Windows - if sys.platform != "": + if sys.platform == "darwin": response = model( f"Q: {text} A: ", # Prompt max_tokens=24, @@ -92,17 +98,23 @@ def main(): "\n", ], # Stop generating just before the model would generate a new question )["choices"][0]["text"] - elif sys.platform == "huggingface": + elif sys.platform == "linux": response, history = generate_hf(model, tokenizer, text, history) else: from mlx_lm import generate response = generate( - model, tokenizer, prompt=text, verbose=False, max_tokens=50, + model, + tokenizer, + prompt=text, + verbose=False, + max_tokens=50, ) node.send_output( - output_id="text", data=pa.array([response]), metadata={}, + output_id="text", + data=pa.array([response]), + metadata={}, )