@@ -0,0 +1,59 @@
+nodes:
+  - id: dora-microphone
+    build: pip install -e ../../node-hub/dora-microphone
+    path: dora-microphone
+    inputs:
+      tick: dora/timer/millis/2000
+    outputs:
+      - audio
+
+  - id: dora-vad
+    build: pip install -e ../../node-hub/dora-vad
+    path: dora-vad
+    inputs:
+      audio: dora-microphone/audio
+    outputs:
+      - audio
+      - timestamp_start
+
+  - id: dora-distil-whisper
+    build: pip install -e ../../node-hub/dora-distil-whisper
+    path: dora-distil-whisper
+    inputs:
+      input: dora-vad/audio
+    outputs:
+      - text
+    env:
+      TARGET_LANGUAGE: english
+
+  - id: dora-qwen
+    build: pip install -e ../../node-hub/dora-qwen
+    path: dora-qwen
+    inputs:
+      text: dora-distil-whisper/text
+    outputs:
+      - text
+
+  - id: plot
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    inputs:
+      text_qwen: dora-qwen/text
+      text_whisper: dora-distil-whisper/text
+
+  - id: dora-kokoro-tts
+    build: pip install -e ../../node-hub/dora-kokoro-tts
+    path: dora-kokoro-tts
+    inputs:
+      text: dora-qwen/text
+    outputs:
+      - audio
+    env:
+      ACTIVATION_WORDS: you
+
+  - id: dora-pyaudio
+    build: pip install -e ../../node-hub/dora-pyaudio
+    path: dora-pyaudio
+    inputs:
+      audio: dora-kokoro-tts/audio
+      timestamp_start: dora-vad/timestamp_start
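
Every node-hub package wired above follows the same dora Python event-loop pattern: open a `Node`, iterate over incoming events, and emit Arrow arrays on named outputs that downstream nodes subscribe to as `<node-id>/<output-id>`. As orientation, here is a minimal sketch of such a node; the output name and payload are hypothetical, not any specific node-hub implementation:

```python
"""Minimal sketch of a dora node event loop (illustrative only)."""

import pyarrow as pa
from dora import Node


def main():
    node = Node()  # connects to the dataflow under the id from the YAML

    for event in node:
        if event["type"] == "INPUT":
            # Inputs arrive as Arrow arrays, keyed by the input name
            # declared in the YAML (e.g. "audio" or "text").
            value = event["value"]
            # Emit on a declared output; downstream nodes receive it
            # via "<node-id>/<output-id>" as in the YAML above.
            node.send_output("text", pa.array(["hello"]), metadata={})


if __name__ == "__main__":
    main()
```
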
@@ -14,12 +14,12 @@ nodes:
       audio: dora-microphone/audio
     outputs:
       - audio
-      - timestamp_start
 
   - id: dora-distil-whisper
     build: pip install -e ../../node-hub/dora-distil-whisper
     path: dora-distil-whisper
     inputs:
+      text_noise: dora-qwen/text
       input: dora-vad/audio
     outputs:
       - text
@@ -56,4 +56,3 @@ nodes:
     path: dora-pyaudio
     inputs:
       audio: dora-kokoro-tts/audio
-      timestamp_start: dora-vad/timestamp_start
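
Wiring `dora-qwen/text` back into `dora-distil-whisper` as `text_noise` adds a feedback edge: the transcriber is told what the assistant just said, so it can filter out TTS audio it re-captures from the speakers. Inside a node, multiple YAML inputs are distinguished by event id, roughly as sketched here (simplified from the real handler in the Python diff below; `transcribe` is a hypothetical stand-in for the Whisper call):

```python
"""Sketch: dispatching on event id when a node has several YAML inputs."""

from dora import Node


def transcribe(audio):  # hypothetical stand-in for the Whisper pipeline
    return "..."


node = Node()
text_noise = ""

for event in node:
    if event["type"] == "INPUT":
        if event["id"] == "text_noise":
            # Feedback edge: the assistant's last reply, kept as a noise filter.
            text_noise = event["value"][0].as_py()
        else:  # "input": speech segments from dora-vad
            text = transcribe(event["value"].to_numpy())
```
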
@@ -1,7 +1,9 @@
 """TODO: Add docstring."""
 
 import os
+import re
 import sys
+import time
 from pathlib import Path
 
 import pyarrow as pa
@@ -13,6 +15,69 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
 TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])
+
+
+def remove_text_noise(text, text_noise):
+    # Handle the case where text_noise is empty
+    if not text_noise.strip():
+        return (
+            text  # Return the original text if text_noise is empty or just whitespace
+        )
+
+    # Helper function to normalize text (remove punctuation, make lowercase, and handle hyphens)
+    def normalize(s):
+        # Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent
+        s = re.sub(r"-", " ", s)
+        # Remove other punctuation and convert to lowercase
+        s = re.sub(r"[^\w\s]", "", s).lower()
+        return s
+
+    # Normalize both text and text_noise
+    normalized_text = normalize(text)
+    normalized_noise = normalize(text_noise)
+
+    # Split into words
+    text_words = normalized_text.split()
+    noise_words = normalized_noise.split()
+
+    # Function to find and remove noise sequence flexibly
+    def remove_flexible(text_list, noise_list):
+        i = 0
+        while i <= len(text_list) - len(noise_list):
+            match = True
+            extra_words = 0
+            for j, noise_word in enumerate(noise_list):
+                if i + j + extra_words >= len(text_list):
+                    match = False
+                    break
+                # Allow skipping extra words in text_list
+                while (
+                    i + j + extra_words < len(text_list)
+                    and text_list[i + j + extra_words] != noise_word
+                ):
+                    extra_words += 1
+                if i + j + extra_words >= len(text_list):
+                    match = False
+                    break
+                if not match:
+                    break
+            if match:
+                # Remove matched part
+                del text_list[i : i + len(noise_list) + extra_words]
+                i = max(0, i - len(noise_list))  # Adjust index after removal
+            else:
+                i += 1
+        return text_list
+
+    # Only remove parts of text_noise that are found in text
+    cleaned_words = text_words[:]
+    for noise_word in noise_words:
+        if noise_word in cleaned_words:
+            cleaned_words.remove(noise_word)
+
+    # Reconstruct the cleaned text
+    cleaned_text = " ".join(cleaned_words)
+    return cleaned_text
 
 
 def load_model():
     """TODO: Add docstring."""
     from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
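
For intuition about what the new `remove_text_noise` helper returns: normalization lowercases, strips punctuation, and splits hyphens, and each noise word is then removed at most once, so cleaned text comes back lowercased and unpunctuated. Two hand-worked checks (not part of the diff; they assume the function is in scope, e.g. pasted next to its definition):

```python
# Hand-worked examples for remove_text_noise:
#
#   normalize("Hello, how are you?") -> "hello how are you"
#   normalize("How are")             -> "how are"
#
# Each noise word is then removed once from the transcription:
assert remove_text_noise("Hello, how are you?", "How are") == "hello you"

# An empty or whitespace-only noise string returns the text unchanged,
# original casing and punctuation included:
assert remove_text_noise("Hello there.", "   ") == "Hello there."
```
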
@@ -69,6 +134,7 @@ BAD_SENTENCES = [
     " Sous-titrage Société Radio-Canada",
     " Sous",
     " Sous-",
+    " i'm going to go to the next one.",
 ]
 
@@ -109,36 +175,59 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
 def main():
     """TODO: Add docstring."""
     node = Node()
+    text_noise = ""
+    noise_timestamp = time.time()
 
     # For macos use mlx:
     if sys.platform != "darwin":
         pipe = load_model()
 
     for event in node:
         if event["type"] == "INPUT":
-            audio = event["value"].to_numpy()
-            confg = (
-                {"language": TARGET_LANGUAGE, "task": "translate"}
-                if TRANSLATE
-                else {
-                    "language": TARGET_LANGUAGE,
-                }
-            )
-            if sys.platform == "darwin":
-                import mlx_whisper
-
-                result = mlx_whisper.transcribe(
-                    audio,
-                    path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
-                    append_punctuations=".",
+            if "text_noise" in event["id"]:
+                text_noise = event["value"][0].as_py()
+                text_noise = (
+                    text_noise.replace("(", "")
+                    .replace(")", "")
+                    .replace("[", "")
+                    .replace("]", "")
                 )
+                noise_timestamp = time.time()
             else:
-                result = pipe(
-                    audio,
-                    generate_kwargs=confg,
+                audio = event["value"].to_numpy()
+                confg = (
+                    {"language": TARGET_LANGUAGE, "task": "translate"}
+                    if TRANSLATE
+                    else {
+                        "language": TARGET_LANGUAGE,
+                    }
+                )
+                if sys.platform == "darwin":
+                    import mlx_whisper
+
+                    result = mlx_whisper.transcribe(
+                        audio,
+                        path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
+                        append_punctuations=".",
+                    )
+                else:
+                    result = pipe(
+                        audio,
+                        generate_kwargs=confg,
+                    )
+                if result["text"] in BAD_SENTENCES:
+                    continue
+                text = cut_repetition(result["text"])
+
+                # Remove noise filter after some time
+                if time.time() - noise_timestamp > (len(text_noise.split()) / 2):  # WPS
+                    text_noise = ""
+
+                # Remove text noise independently of casing
+                text = remove_text_noise(text, text_noise)
+
+                if text.strip() == "" or text.strip() == ".":
+                    continue
+
+                node.send_output(
+                    "text", pa.array([text]), {"language": TARGET_LANGUAGE}
                 )
-            if result["text"] in BAD_SENTENCES:
-                continue
-            text = cut_repetition(result["text"])
-
-            node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE})
@@ -2,8 +2,8 @@
 name = "dora-distil-whisper"
 version = "0.3.10"
 authors = [
-  { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
-  { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
+    { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
+    { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
 ]
 description = "Dora dora-distil-whisper"
 license = { text = "MIT" }
@@ -11,14 +11,14 @@ readme = "README.md"
 requires-python = ">=3.8"
 
 dependencies = [
-  "dora-rs >= 0.3.9",
-  "numpy < 2.0.0",
-  "pyarrow >= 5.0.0",
-  "transformers >= 4.0.0",
-  "accelerate >= 0.29.2",
-  "torch >= 2.2.0",
-  "modelscope >= 1.18.1",
-  "mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
+    "dora-rs >= 0.3.9",
+    "numpy < 2.0.0",
+    "pyarrow >= 5.0.0",
+    "transformers >= 4.0.0",
+    "accelerate >= 0.29.2",
+    "torch >= 2.2.0",
+    "modelscope >= 1.18.1",
+    "mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
 ]
@@ -30,5 +30,5 @@ dora-distil-whisper = "dora_distil_whisper.main:main"
 
 [tool.ruff.lint]
 extend-select = [
-  "D", # pydocstyle
+    "D", # pydocstyle
 ]
@@ -18,7 +18,9 @@ def get_model_gguf():
     from llama_cpp import Llama
 
     llm = Llama.from_pretrained(
-        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False,
+        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
+        filename="*fp16.gguf",
+        verbose=False,
     )
     return llm
 
@@ -36,7 +38,9 @@ def get_model_huggingface():
 
     model_name = "Qwen/Qwen2.5-0.5B-Instruct"
     model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype="auto", device_map="auto",
+        model_name,
+        torch_dtype="auto",
+        device_map="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     return model, tokenizer
@@ -49,7 +53,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str:
     """TODO: Add docstring."""
     history += [{"role": "user", "content": prompt}]
     text = tokenizer.apply_chat_template(
-        history, tokenize=False, add_generation_prompt=True,
+        history,
+        tokenize=False,
+        add_generation_prompt=True,
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
     generated_ids = model.generate(**model_inputs, max_new_tokens=512)
@@ -66,9 +72,9 @@ def main():
     """TODO: Add docstring."""
     history = []
     # If OS is not Darwin, use Huggingface model
-    if sys.platform != "":
+    if sys.platform == "darwin":
         model = get_model_gguf()
-    elif sys.platform == "huggingface":
+    elif sys.platform == "linux":
         model, tokenizer = get_model_huggingface()
     else:
         model, tokenizer = get_model_darwin()
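
The old conditions could never select the Hugging Face path: `sys.platform` holds an OS identifier, so `sys.platform != ""` was always true (every request took the GGUF branch) and `sys.platform == "huggingface"` was always false. A quick check of the values involved:

```python
import sys

# sys.platform is an OS identifier, never "huggingface":
#   Linux   -> "linux"
#   macOS   -> "darwin"
#   Windows -> "win32"
print(sys.platform)

assert sys.platform != ""              # always true: old first branch always won
assert sys.platform != "huggingface"   # so the old elif branch was dead code
```
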
@@ -83,7 +89,7 @@ def main():
 
         if any(word in ACTIVATION_WORDS for word in words):
             # On linux, Windows
-            if sys.platform != "":
+            if sys.platform == "darwin":
                 response = model(
                     f"Q: {text} A: ",  # Prompt
                     max_tokens=24,
@@ -92,17 +98,23 @@ def main():
                         "\n",
                     ],  # Stop generating just before the model would generate a new question
                 )["choices"][0]["text"]
-            elif sys.platform == "huggingface":
+            elif sys.platform == "linux":
                 response, history = generate_hf(model, tokenizer, text, history)
             else:
                 from mlx_lm import generate
 
                 response = generate(
-                    model, tokenizer, prompt=text, verbose=False, max_tokens=50,
+                    model,
+                    tokenizer,
+                    prompt=text,
+                    verbose=False,
+                    max_tokens=50,
                 )
 
             node.send_output(
-                output_id="text", data=pa.array([response]), metadata={},
+                output_id="text",
+                data=pa.array([response]),
+                metadata={},
             )
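
For reference, the activation gate at the top of this hunk is a plain word-membership test: with the dataflow's `ACTIVATION_WORDS: you`, the node only responds when the transcription contains one of the configured words. A minimal reproduction of that gate (the env parsing sketched here is an assumption based on the pattern in this diff, not the exact node-hub code):

```python
import os

# Same pattern as the gate in main(): split the env var into words,
# then respond only if the transcription mentions any of them.
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "you").split()


def should_respond(text: str) -> bool:
    words = text.lower().split()
    return any(word in ACTIVATION_WORDS for word in words)


assert should_respond("how are you today")        # "you" is present
assert not should_respond("hello there general")  # no activation word
```
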