This PR makes it possible to use speakers when talking to an AI by voice. It works by filtering the text-to-speech (TTS) output out of the speech-to-text (STT) input using fuzzy matching.
tags/v0.3.11-rc1
| @@ -491,6 +491,8 @@ jobs: | |||
| target: aarch64-unknown-linux-musl | |||
| - runner: ubuntu-22.04 | |||
| target: armv7-unknown-linux-musleabihf | |||
| - runner: ubuntu-22.04 | |||
| target: x86_64-pc-windows-gnu | |||
| - runner: macos-13 | |||
| target: aarch64-apple-darwin | |||
| - runner: macos-13 | |||
| @@ -501,9 +503,13 @@ jobs: | |||
| - uses: r7kamura/rust-problem-matchers@v1.1.0 | |||
| - name: "Add toolchains" | |||
| run: rustup target add ${{ matrix.platform.target }} | |||
| - name: "Build" | |||
| - name: Install system-level dependencies | |||
| if: runner.target == 'x86_64-pc-windows-gnu' | |||
| run: | | |||
| sudo apt install g++-mingw-w64-x86-64 gcc-mingw-w64-x86-64 | |||
| - name: "Check" | |||
| uses: actions-rs/cargo@v1 | |||
| with: | |||
| use-cross: true | |||
| command: check | |||
| args: --target ${{ matrix.platform.target }} -p dora-cli | |||
| args: --target ${{ matrix.platform.target }} --all --exclude dora-node-api-python --exclude dora-operator-api-python --exclude dora-ros2-bridge-python | |||
| @@ -27,8 +27,8 @@ else | |||
| cargo test | |||
| pip install "maturin[zig]" | |||
| maturin build --zig --release | |||
| # If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel | |||
| maturin build --zig | |||
| # If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel on multiple platforms | |||
| if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then | |||
| # Free up ubuntu space | |||
| sudo apt-get clean | |||
| @@ -37,29 +37,18 @@ else | |||
| sudo rm -rf /opt/ghc/ | |||
| maturin publish --skip-existing --zig | |||
| fi | |||
| # aarch64-unknown-linux-gnu | |||
| rustup target add aarch64-unknown-linux-gnu | |||
| maturin build --target aarch64-unknown-linux-gnu --zig --release | |||
| # If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel | |||
| if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then | |||
| # aarch64-unknown-linux-gnu | |||
| rustup target add aarch64-unknown-linux-gnu | |||
| maturin publish --target aarch64-unknown-linux-gnu --skip-existing --zig | |||
| fi | |||
| # armv7-unknown-linux-musleabihf | |||
| rustup target add armv7-unknown-linux-musleabihf | |||
| maturin build --target armv7-unknown-linux-musleabihf --zig --release | |||
| # If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel | |||
| if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then | |||
| # armv7-unknown-linux-musleabihf | |||
| rustup target add armv7-unknown-linux-musleabihf | |||
| # If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel | |||
| maturin publish --target armv7-unknown-linux-musleabihf --skip-existing --zig | |||
| fi | |||
| # x86_64-pc-windows-gnu | |||
| rustup target add x86_64-pc-windows-gnu | |||
| maturin build --target x86_64-pc-windows-gnu --release | |||
| # If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel | |||
| if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then | |||
| # x86_64-pc-windows-gnu | |||
| rustup target add x86_64-pc-windows-gnu | |||
| # If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel | |||
| maturin publish --target x86_64-pc-windows-gnu --skip-existing | |||
| fi | |||
| @@ -1,3 +1,4 @@ | |||
| [default.extend-identifiers] | |||
| # *sigh* this just isn't worth the cost of fixing | |||
| DeviceNDArray = "DeviceNDArray" | |||
| Feedforward_2nd_Gain = "Feedforward_2nd_Gain" | |||
| @@ -9,10 +9,7 @@ fn main() { | |||
| println!("cargo:rerun-if-changed=src/lib.rs"); | |||
| // rename header files | |||
| let src_dir = target_dir() | |||
| .join("cxxbridge") | |||
| .join("dora-node-api-cxx") | |||
| .join("src"); | |||
| let src_dir = origin_dir(); | |||
| let target_dir = src_dir.parent().unwrap(); | |||
| std::fs::copy(src_dir.join("lib.rs.h"), target_dir.join("dora-node-api.h")).unwrap(); | |||
| std::fs::copy( | |||
| @@ -28,8 +25,8 @@ fn main() { | |||
| bridge_files.clear(); | |||
| } | |||
| fn target_dir() -> PathBuf { | |||
| std::env::var("CARGO_TARGET_DIR") | |||
| fn origin_dir() -> PathBuf { | |||
| let default_target = std::env::var("CARGO_TARGET_DIR") | |||
| .map(PathBuf::from) | |||
| .unwrap_or_else(|_| { | |||
| let root = Path::new(env!("CARGO_MANIFEST_DIR")) | |||
| @@ -37,12 +34,26 @@ fn target_dir() -> PathBuf { | |||
| .nth(3) | |||
| .unwrap(); | |||
| root.join("target") | |||
| }) | |||
| }); | |||
| let cross_target = default_target | |||
| .join(std::env::var("TARGET").unwrap()) | |||
| .join("cxxbridge") | |||
| .join("dora-node-api-cxx") | |||
| .join("src"); | |||
| if cross_target.exists() { | |||
| cross_target | |||
| } else { | |||
| default_target | |||
| .join("cxxbridge") | |||
| .join("dora-node-api-cxx") | |||
| .join("src") | |||
| } | |||
| } | |||
| #[cfg(feature = "ros2-bridge")] | |||
| mod ros2 { | |||
| use super::target_dir; | |||
| use super::origin_dir; | |||
| use std::{ | |||
| io::{BufRead, BufReader}, | |||
| path::{Component, Path, PathBuf}, | |||
| @@ -113,10 +124,7 @@ mod ros2 { | |||
| .join("ros2_bindings.rs.cc"); | |||
| // copy message files to target directory | |||
| let target_path = target_dir() | |||
| .join("cxxbridge") | |||
| .join("dora-node-api-cxx") | |||
| .join("dora-ros2-bindings.h"); | |||
| let target_path = origin_dir().parent().unwrap().join("dora-ros2-bindings.h"); | |||
| std::fs::copy(&header_path, &target_path).unwrap(); | |||
| println!("cargo:rerun-if-changed={}", header_path.display()); | |||
| @@ -0,0 +1,59 @@ | |||
| nodes: | |||
| - id: dora-microphone | |||
| build: pip install -e ../../node-hub/dora-microphone | |||
| path: dora-microphone | |||
| inputs: | |||
| tick: dora/timer/millis/2000 | |||
| outputs: | |||
| - audio | |||
| - id: dora-vad | |||
| build: pip install -e ../../node-hub/dora-vad | |||
| path: dora-vad | |||
| inputs: | |||
| audio: dora-microphone/audio | |||
| outputs: | |||
| - audio | |||
| - timestamp_start | |||
| - id: dora-distil-whisper | |||
| build: pip install -e ../../node-hub/dora-distil-whisper | |||
| path: dora-distil-whisper | |||
| inputs: | |||
| input: dora-vad/audio | |||
| outputs: | |||
| - text | |||
| env: | |||
| TARGET_LANGUAGE: english | |||
| - id: dora-qwen | |||
| build: pip install -e ../../node-hub/dora-qwen | |||
| path: dora-qwen | |||
| inputs: | |||
| text: dora-distil-whisper/text | |||
| outputs: | |||
| - text | |||
| - id: plot | |||
| build: pip install -e ../../node-hub/dora-rerun | |||
| path: dora-rerun | |||
| inputs: | |||
| text_qwen: dora-qwen/text | |||
| text_whisper: dora-distil-whisper/text | |||
| - id: dora-kokoro-tts | |||
| build: pip install -e ../../node-hub/dora-kokoro-tts | |||
| path: dora-kokoro-tts | |||
| inputs: | |||
| text: dora-qwen/text | |||
| outputs: | |||
| - audio | |||
| env: | |||
| ACTIVATION_WORDS: you | |||
| - id: dora-pyaudio | |||
| build: pip install -e ../../node-hub/dora-pyaudio | |||
| path: dora-pyaudio | |||
| inputs: | |||
| audio: dora-kokoro-tts/audio | |||
| timestamp_start: dora-vad/timestamp_start | |||
| @@ -14,12 +14,12 @@ nodes: | |||
| audio: dora-microphone/audio | |||
| outputs: | |||
| - audio | |||
| - timestamp_start | |||
| - id: dora-distil-whisper | |||
| build: pip install -e ../../node-hub/dora-distil-whisper | |||
| path: dora-distil-whisper | |||
| inputs: | |||
| text_noise: dora-qwen/text | |||
| input: dora-vad/audio | |||
| outputs: | |||
| - text | |||
| @@ -56,4 +56,3 @@ nodes: | |||
| path: dora-pyaudio | |||
| inputs: | |||
| audio: dora-kokoro-tts/audio | |||
| timestamp_start: dora-vad/timestamp_start | |||
| @@ -1,7 +1,9 @@ | |||
| """TODO: Add docstring.""" | |||
| import os | |||
| import re | |||
| import sys | |||
| import time | |||
| from pathlib import Path | |||
| import pyarrow as pa | |||
| @@ -13,6 +15,79 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english") | |||
| TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"]) | |||
def remove_text_noise(text: str, text_noise="") -> str:
    """Remove the words of ``text_noise`` from ``text``.

    Both strings are normalized before comparison: hyphens become spaces,
    every other punctuation character is dropped and the result is
    lowercased, so "Notre-Dame" and "notre dame" are treated as equivalent.
    Each word of ``text_noise`` then removes at most one occurrence (the
    first one) of the same word from ``text``. Word order and adjacency are
    not taken into account.

    Args:
        text (str): Original text
        text_noise (str): text to remove from the original text

    Returns:
        str: ``text`` unchanged when ``text_noise`` is empty or only
            whitespace. Otherwise the remaining words of the *normalized*
            text joined by single spaces, which means the original casing
            and punctuation are not preserved.

    """
    # Nothing to filter: hand back the original text untouched (no normalization).
    if not text_noise.strip():
        return text

    def normalize(s):
        # Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent
        s = s.replace("-", " ")
        # Remove other punctuation and convert to lowercase
        return re.sub(r"[^\w\s]", "", s).lower()

    cleaned_words = normalize(text).split()

    # Only remove the noise words that are actually present in the text;
    # list.remove drops the first matching occurrence only.
    for noise_word in normalize(text_noise).split():
        if noise_word in cleaned_words:
            cleaned_words.remove(noise_word)

    # Reconstruct the cleaned text
    return " ".join(cleaned_words)
| def load_model(): | |||
| """TODO: Add docstring.""" | |||
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |||
| @@ -69,6 +144,7 @@ BAD_SENTENCES = [ | |||
| " Sous-titrage Société Radio-Canada", | |||
| " Sous", | |||
| " Sous-", | |||
| " i'm going to go to the next one.", | |||
| ] | |||
| @@ -109,36 +185,59 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50): | |||
| def main(): | |||
| """TODO: Add docstring.""" | |||
| node = Node() | |||
| text_noise = "" | |||
| noise_timestamp = time.time() | |||
| # For macos use mlx: | |||
| if sys.platform != "darwin": | |||
| pipe = load_model() | |||
| for event in node: | |||
| if event["type"] == "INPUT": | |||
| audio = event["value"].to_numpy() | |||
| confg = ( | |||
| {"language": TARGET_LANGUAGE, "task": "translate"} | |||
| if TRANSLATE | |||
| else { | |||
| "language": TARGET_LANGUAGE, | |||
| } | |||
| ) | |||
| if sys.platform == "darwin": | |||
| import mlx_whisper | |||
| result = mlx_whisper.transcribe( | |||
| audio, | |||
| path_or_hf_repo="mlx-community/whisper-large-v3-turbo", | |||
| append_punctuations=".", | |||
| if "text_noise" in event["id"]: | |||
| text_noise = event["value"][0].as_py() | |||
| text_noise = ( | |||
| text_noise.replace("(", "") | |||
| .replace(")", "") | |||
| .replace("[", "") | |||
| .replace("]", "") | |||
| ) | |||
| noise_timestamp = time.time() | |||
| else: | |||
| result = pipe( | |||
| audio, | |||
| generate_kwargs=confg, | |||
| audio = event["value"].to_numpy() | |||
| confg = ( | |||
| {"language": TARGET_LANGUAGE, "task": "translate"} | |||
| if TRANSLATE | |||
| else { | |||
| "language": TARGET_LANGUAGE, | |||
| } | |||
| ) | |||
| if sys.platform == "darwin": | |||
| import mlx_whisper | |||
| result = mlx_whisper.transcribe( | |||
| audio, | |||
| path_or_hf_repo="mlx-community/whisper-large-v3-turbo", | |||
| append_punctuations=".", | |||
| ) | |||
| else: | |||
| result = pipe( | |||
| audio, | |||
| generate_kwargs=confg, | |||
| ) | |||
| if result["text"] in BAD_SENTENCES: | |||
| continue | |||
| text = cut_repetition(result["text"]) | |||
| # Remove noise filter after some time | |||
| if time.time() - noise_timestamp > (len(text_noise.split()) / 2): # WPS | |||
| text_noise = "" | |||
| ## Remove text noise independently of casing | |||
| text = remove_text_noise(text, text_noise) | |||
| if text.strip() == "" or text.strip() == ".": | |||
| continue | |||
| node.send_output( | |||
| "text", pa.array([text]), {"language": TARGET_LANGUAGE} | |||
| ) | |||
| if result["text"] in BAD_SENTENCES: | |||
| continue | |||
| text = cut_repetition(result["text"]) | |||
| node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE}) | |||
| @@ -2,8 +2,8 @@ | |||
| name = "dora-distil-whisper" | |||
| version = "0.3.10" | |||
| authors = [ | |||
| { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" }, | |||
| { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" }, | |||
| { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" }, | |||
| { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" }, | |||
| ] | |||
| description = "Dora dora-distil-whisper" | |||
| license = { text = "MIT" } | |||
| @@ -11,14 +11,14 @@ readme = "README.md" | |||
| requires-python = ">=3.8" | |||
| dependencies = [ | |||
| "dora-rs >= 0.3.9", | |||
| "numpy < 2.0.0", | |||
| "pyarrow >= 5.0.0", | |||
| "transformers >= 4.0.0", | |||
| "accelerate >= 0.29.2", | |||
| "torch >= 2.2.0", | |||
| "modelscope >= 1.18.1", | |||
| "mlx-whisper >= 0.4.1; sys_platform == 'darwin'", | |||
| "dora-rs >= 0.3.9", | |||
| "numpy < 2.0.0", | |||
| "pyarrow >= 5.0.0", | |||
| "transformers >= 4.0.0", | |||
| "accelerate >= 0.29.2", | |||
| "torch >= 2.2.0", | |||
| "modelscope >= 1.18.1", | |||
| "mlx-whisper >= 0.4.1; sys_platform == 'darwin'", | |||
| ] | |||
| @@ -30,5 +30,5 @@ dora-distil-whisper = "dora_distil_whisper.main:main" | |||
| [tool.ruff.lint] | |||
| extend-select = [ | |||
| "D", # pydocstyle | |||
| "D", # pydocstyle | |||
| ] | |||
| @@ -18,7 +18,9 @@ def get_model_gguf(): | |||
| from llama_cpp import Llama | |||
| llm = Llama.from_pretrained( | |||
| repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False, | |||
| repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", | |||
| filename="*fp16.gguf", | |||
| verbose=False, | |||
| ) | |||
| return llm | |||
| @@ -36,7 +38,9 @@ def get_model_huggingface(): | |||
| model_name = "Qwen/Qwen2.5-0.5B-Instruct" | |||
| model = AutoModelForCausalLM.from_pretrained( | |||
| model_name, torch_dtype="auto", device_map="auto", | |||
| model_name, | |||
| torch_dtype="auto", | |||
| device_map="auto", | |||
| ) | |||
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |||
| return model, tokenizer | |||
| @@ -49,7 +53,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str: | |||
| """TODO: Add docstring.""" | |||
| history += [{"role": "user", "content": prompt}] | |||
| text = tokenizer.apply_chat_template( | |||
| history, tokenize=False, add_generation_prompt=True, | |||
| history, | |||
| tokenize=False, | |||
| add_generation_prompt=True, | |||
| ) | |||
| model_inputs = tokenizer([text], return_tensors="pt").to(model.device) | |||
| generated_ids = model.generate(**model_inputs, max_new_tokens=512) | |||
| @@ -66,9 +72,9 @@ def main(): | |||
| """TODO: Add docstring.""" | |||
| history = [] | |||
| # If OS is not Darwin, use Huggingface model | |||
| if sys.platform != "": | |||
| if sys.platform == "darwin": | |||
| model = get_model_gguf() | |||
| elif sys.platform == "huggingface": | |||
| elif sys.platform == "linux": | |||
| model, tokenizer = get_model_huggingface() | |||
| else: | |||
| model, tokenizer = get_model_darwin() | |||
| @@ -83,7 +89,7 @@ def main(): | |||
| if any(word in ACTIVATION_WORDS for word in words): | |||
| # On linux, Windows | |||
| if sys.platform != "": | |||
| if sys.platform == "darwin": | |||
| response = model( | |||
| f"Q: {text} A: ", # Prompt | |||
| max_tokens=24, | |||
| @@ -92,17 +98,23 @@ def main(): | |||
| "\n", | |||
| ], # Stop generating just before the model would generate a new question | |||
| )["choices"][0]["text"] | |||
| elif sys.platform == "huggingface": | |||
| elif sys.platform == "linux": | |||
| response, history = generate_hf(model, tokenizer, text, history) | |||
| else: | |||
| from mlx_lm import generate | |||
| response = generate( | |||
| model, tokenizer, prompt=text, verbose=False, max_tokens=50, | |||
| model, | |||
| tokenizer, | |||
| prompt=text, | |||
| verbose=False, | |||
| max_tokens=50, | |||
| ) | |||
| node.send_output( | |||
| output_id="text", data=pa.array([response]), metadata={}, | |||
| output_id="text", | |||
| data=pa.array([response]), | |||
| metadata={}, | |||
| ) | |||
| @@ -47,7 +47,6 @@ def sad_antennas(reachy): | |||
| def main(): | |||
| node = Node() | |||
| ROBOT_IP = os.getenv("ROBOT_IP", "10.42.0.24") | |||
| @@ -109,7 +108,7 @@ def main(): | |||
| reachy.joints.r_gripper.goal_position = goal | |||
| time.sleep(0.02) | |||
| # When openning the gripper always go to default pose | |||
| # When opening the gripper always go to default pose | |||
| if action == -100: | |||
| goto( | |||
| { | |||