| @@ -0,0 +1,59 @@ | |||
| nodes: | |||
| - id: dora-microphone | |||
| build: pip install -e ../../node-hub/dora-microphone | |||
| path: dora-microphone | |||
| inputs: | |||
| tick: dora/timer/millis/2000 | |||
| outputs: | |||
| - audio | |||
| - id: dora-vad | |||
| build: pip install -e ../../node-hub/dora-vad | |||
| path: dora-vad | |||
| inputs: | |||
| audio: dora-microphone/audio | |||
| outputs: | |||
| - audio | |||
| - timestamp_start | |||
| - id: dora-distil-whisper | |||
| build: pip install -e ../../node-hub/dora-distil-whisper | |||
| path: dora-distil-whisper | |||
| inputs: | |||
| input: dora-vad/audio | |||
| outputs: | |||
| - text | |||
| env: | |||
| TARGET_LANGUAGE: english | |||
| - id: dora-qwen | |||
| build: pip install -e ../../node-hub/dora-qwen | |||
| path: dora-qwen | |||
| inputs: | |||
| text: dora-distil-whisper/text | |||
| outputs: | |||
| - text | |||
| - id: plot | |||
| build: pip install -e ../../node-hub/dora-rerun | |||
| path: dora-rerun | |||
| inputs: | |||
| text_qwen: dora-qwen/text | |||
| text_whisper: dora-distil-whisper/text | |||
| - id: dora-kokoro-tts | |||
| build: pip install -e ../../node-hub/dora-kokoro-tts | |||
| path: dora-kokoro-tts | |||
| inputs: | |||
| text: dora-qwen/text | |||
| outputs: | |||
| - audio | |||
| env: | |||
| ACTIVATION_WORDS: you | |||
| - id: dora-pyaudio | |||
| build: pip install -e ../../node-hub/dora-pyaudio | |||
| path: dora-pyaudio | |||
| inputs: | |||
| audio: dora-kokoro-tts/audio | |||
| timestamp_start: dora-vad/timestamp_start | |||
| @@ -14,12 +14,12 @@ nodes: | |||
| audio: dora-microphone/audio | |||
| outputs: | |||
| - audio | |||
| - timestamp_start | |||
| - id: dora-distil-whisper | |||
| build: pip install -e ../../node-hub/dora-distil-whisper | |||
| path: dora-distil-whisper | |||
| inputs: | |||
| text_noise: dora-qwen/text | |||
| input: dora-vad/audio | |||
| outputs: | |||
| - text | |||
| @@ -56,4 +56,3 @@ nodes: | |||
| path: dora-pyaudio | |||
| inputs: | |||
| audio: dora-kokoro-tts/audio | |||
| timestamp_start: dora-vad/timestamp_start | |||
| @@ -1,7 +1,9 @@ | |||
| """TODO: Add docstring.""" | |||
| import os | |||
| import re | |||
| import sys | |||
| import time | |||
| from pathlib import Path | |||
| import pyarrow as pa | |||
| @@ -13,6 +15,69 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english") | |||
| TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"]) | |||
def remove_text_noise(text: str, text_noise: str) -> str:
    """Remove the words of a known noise sentence from a transcription.

    Both strings are normalized (hyphens become spaces, punctuation is
    dropped, everything is lowercased) and split on whitespace. Every word
    of ``text_noise`` then removes at most one matching word -- the first
    remaining occurrence -- from ``text``. The order of the surviving words
    is preserved.

    Args:
        text: Transcription to clean.
        text_noise: Text whose words should be filtered out of ``text``.

    Returns:
        ``text`` unchanged when ``text_noise`` is empty or whitespace only.
        Otherwise the normalized words of ``text`` that were not matched by
        a noise word, joined with single spaces. Note that in this case the
        result is lowercased and stripped of punctuation even if no word
        was actually removed.

    """
    # No active noise filter: hand the transcription back untouched.
    if not text_noise.strip():
        return text

    def normalize(s: str) -> str:
        """Lowercase ``s``, turn hyphens into spaces and drop punctuation."""
        # Hyphens become spaces so "Notre-Dame" and "notre dame" compare equal.
        s = s.replace("-", " ")
        return re.sub(r"[^\w\s]", "", s).lower()

    cleaned_words = normalize(text).split()

    # Each noise word consumes a single occurrence, so a word that appears
    # more often in the transcription than in the noise is only partly removed.
    for noise_word in normalize(text_noise).split():
        if noise_word in cleaned_words:
            cleaned_words.remove(noise_word)

    return " ".join(cleaned_words)
| def load_model(): | |||
| """TODO: Add docstring.""" | |||
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |||
| @@ -69,6 +134,7 @@ BAD_SENTENCES = [ | |||
| " Sous-titrage Société Radio-Canada", | |||
| " Sous", | |||
| " Sous-", | |||
| " i'm going to go to the next one.", | |||
| ] | |||
| @@ -109,36 +175,59 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50): | |||
| def main(): | |||
| """TODO: Add docstring.""" | |||
| node = Node() | |||
| text_noise = "" | |||
| noise_timestamp = time.time() | |||
| # For macos use mlx: | |||
| if sys.platform != "darwin": | |||
| pipe = load_model() | |||
| for event in node: | |||
| if event["type"] == "INPUT": | |||
| audio = event["value"].to_numpy() | |||
| confg = ( | |||
| {"language": TARGET_LANGUAGE, "task": "translate"} | |||
| if TRANSLATE | |||
| else { | |||
| "language": TARGET_LANGUAGE, | |||
| } | |||
| ) | |||
| if sys.platform == "darwin": | |||
| import mlx_whisper | |||
| result = mlx_whisper.transcribe( | |||
| audio, | |||
| path_or_hf_repo="mlx-community/whisper-large-v3-turbo", | |||
| append_punctuations=".", | |||
| if "text_noise" in event["id"]: | |||
| text_noise = event["value"][0].as_py() | |||
| text_noise = ( | |||
| text_noise.replace("(", "") | |||
| .replace(")", "") | |||
| .replace("[", "") | |||
| .replace("]", "") | |||
| ) | |||
| noise_timestamp = time.time() | |||
| else: | |||
| result = pipe( | |||
| audio, | |||
| generate_kwargs=confg, | |||
| audio = event["value"].to_numpy() | |||
| confg = ( | |||
| {"language": TARGET_LANGUAGE, "task": "translate"} | |||
| if TRANSLATE | |||
| else { | |||
| "language": TARGET_LANGUAGE, | |||
| } | |||
| ) | |||
| if sys.platform == "darwin": | |||
| import mlx_whisper | |||
| result = mlx_whisper.transcribe( | |||
| audio, | |||
| path_or_hf_repo="mlx-community/whisper-large-v3-turbo", | |||
| append_punctuations=".", | |||
| ) | |||
| else: | |||
| result = pipe( | |||
| audio, | |||
| generate_kwargs=confg, | |||
| ) | |||
| if result["text"] in BAD_SENTENCES: | |||
| continue | |||
| text = cut_repetition(result["text"]) | |||
| # Remove noise filter after some time | |||
| if time.time() - noise_timestamp > (len(text_noise.split()) / 2): # WPS | |||
| text_noise = "" | |||
| ## Remove text noise independently of casing | |||
| text = remove_text_noise(text, text_noise) | |||
| if text.strip() == "" or text.strip() == ".": | |||
| continue | |||
| node.send_output( | |||
| "text", pa.array([text]), {"language": TARGET_LANGUAGE} | |||
| ) | |||
| if result["text"] in BAD_SENTENCES: | |||
| continue | |||
| text = cut_repetition(result["text"]) | |||
| node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE}) | |||
| @@ -2,8 +2,8 @@ | |||
| name = "dora-distil-whisper" | |||
| version = "0.3.10" | |||
| authors = [ | |||
| { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" }, | |||
| { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" }, | |||
| { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" }, | |||
| { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" }, | |||
| ] | |||
| description = "Dora dora-distil-whisper" | |||
| license = { text = "MIT" } | |||
| @@ -11,14 +11,14 @@ readme = "README.md" | |||
| requires-python = ">=3.8" | |||
| dependencies = [ | |||
| "dora-rs >= 0.3.9", | |||
| "numpy < 2.0.0", | |||
| "pyarrow >= 5.0.0", | |||
| "transformers >= 4.0.0", | |||
| "accelerate >= 0.29.2", | |||
| "torch >= 2.2.0", | |||
| "modelscope >= 1.18.1", | |||
| "mlx-whisper >= 0.4.1; sys_platform == 'darwin'", | |||
| "dora-rs >= 0.3.9", | |||
| "numpy < 2.0.0", | |||
| "pyarrow >= 5.0.0", | |||
| "transformers >= 4.0.0", | |||
| "accelerate >= 0.29.2", | |||
| "torch >= 2.2.0", | |||
| "modelscope >= 1.18.1", | |||
| "mlx-whisper >= 0.4.1; sys_platform == 'darwin'", | |||
| ] | |||
| @@ -30,5 +30,5 @@ dora-distil-whisper = "dora_distil_whisper.main:main" | |||
| [tool.ruff.lint] | |||
| extend-select = [ | |||
| "D", # pydocstyle | |||
| "D", # pydocstyle | |||
| ] | |||
| @@ -18,7 +18,9 @@ def get_model_gguf(): | |||
| from llama_cpp import Llama | |||
| llm = Llama.from_pretrained( | |||
| repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False, | |||
| repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", | |||
| filename="*fp16.gguf", | |||
| verbose=False, | |||
| ) | |||
| return llm | |||
| @@ -36,7 +38,9 @@ def get_model_huggingface(): | |||
| model_name = "Qwen/Qwen2.5-0.5B-Instruct" | |||
| model = AutoModelForCausalLM.from_pretrained( | |||
| model_name, torch_dtype="auto", device_map="auto", | |||
| model_name, | |||
| torch_dtype="auto", | |||
| device_map="auto", | |||
| ) | |||
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |||
| return model, tokenizer | |||
| @@ -49,7 +53,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str: | |||
| """TODO: Add docstring.""" | |||
| history += [{"role": "user", "content": prompt}] | |||
| text = tokenizer.apply_chat_template( | |||
| history, tokenize=False, add_generation_prompt=True, | |||
| history, | |||
| tokenize=False, | |||
| add_generation_prompt=True, | |||
| ) | |||
| model_inputs = tokenizer([text], return_tensors="pt").to(model.device) | |||
| generated_ids = model.generate(**model_inputs, max_new_tokens=512) | |||
| @@ -66,9 +72,9 @@ def main(): | |||
| """TODO: Add docstring.""" | |||
| history = [] | |||
| # If OS is not Darwin, use Huggingface model | |||
| if sys.platform != "": | |||
| if sys.platform == "darwin": | |||
| model = get_model_gguf() | |||
| elif sys.platform == "huggingface": | |||
| elif sys.platform == "linux": | |||
| model, tokenizer = get_model_huggingface() | |||
| else: | |||
| model, tokenizer = get_model_darwin() | |||
| @@ -83,7 +89,7 @@ def main(): | |||
| if any(word in ACTIVATION_WORDS for word in words): | |||
| # On linux, Windows | |||
| if sys.platform != "": | |||
| if sys.platform == "darwin": | |||
| response = model( | |||
| f"Q: {text} A: ", # Prompt | |||
| max_tokens=24, | |||
| @@ -92,17 +98,23 @@ def main(): | |||
| "\n", | |||
| ], # Stop generating just before the model would generate a new question | |||
| )["choices"][0]["text"] | |||
| elif sys.platform == "huggingface": | |||
| elif sys.platform == "linux": | |||
| response, history = generate_hf(model, tokenizer, text, history) | |||
| else: | |||
| from mlx_lm import generate | |||
| response = generate( | |||
| model, tokenizer, prompt=text, verbose=False, max_tokens=50, | |||
| model, | |||
| tokenizer, | |||
| prompt=text, | |||
| verbose=False, | |||
| max_tokens=50, | |||
| ) | |||
| node.send_output( | |||
| output_id="text", data=pa.array([response]), metadata={}, | |||
| output_id="text", | |||
| data=pa.array([response]), | |||
| metadata={}, | |||
| ) | |||