
Add noise filtering on whisper to be able to use speakers (#847)

This PR makes it possible to use speakers when talking to an AI by voice. It works by
filtering the TTS output out of the STT input using fuzzy matching, so the assistant
does not transcribe its own speech.
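
As a rough sketch of the idea (not the exact code in this PR, which does word-level matching in dora_distil_whisper/main.py below), an echo of the last TTS output can be flagged with Python's standard difflib; the 0.8 threshold here is an illustrative assumption:

    # Hypothetical echo detector, for illustration only.
    from difflib import SequenceMatcher

    def is_tts_echo(transcript: str, last_tts_text: str, threshold: float = 0.8) -> bool:
        """Return True when the STT transcript looks like an echo of the TTS output."""
        a, b = transcript.lower().strip(), last_tts_text.lower().strip()
        if not a or not b:
            return False
        return SequenceMatcher(None, a, b).ratio() >= threshold

    assert is_tts_echo("i am here to help", "I am here to help.")
    assert not is_tts_echo("what time is it", "I am here to help.")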
Tag: v0.3.11-rc1
Haixuan Xavier Tao, 10 months ago
commit 844e94a6be
10 changed files with 255 additions and 83 deletions
  1. .github/workflows/ci.yml (+8, -2)
  2. .github/workflows/node_hub_test.sh (+10, -21)
  3. _typos.toml (+1, -0)
  4. apis/c++/node/build.rs (+20, -12)
  5. examples/llm/qwen-dev-interruption.yml (+59, -0)
  6. examples/llm/qwen-dev.yml (+1, -2)
  7. node-hub/dora-distil-whisper/dora_distil_whisper/main.py (+123, -24)
  8. node-hub/dora-distil-whisper/pyproject.toml (+11, -11)
  9. node-hub/dora-qwen/dora_qwen/main.py (+21, -9)
  10. node-hub/dora-reachy1/dora_reachy1/main.py (+1, -2)

.github/workflows/ci.yml (+8, -2)

@@ -491,6 +491,8 @@ jobs:
target: aarch64-unknown-linux-musl
- runner: ubuntu-22.04
target: armv7-unknown-linux-musleabihf
- runner: ubuntu-22.04
target: x86_64-pc-windows-gnu
- runner: macos-13
target: aarch64-apple-darwin
- runner: macos-13
@@ -501,9 +503,13 @@ jobs:
- uses: r7kamura/rust-problem-matchers@v1.1.0
- name: "Add toolchains"
run: rustup target add ${{ matrix.platform.target }}
- name: "Build"
- name: Install system-level dependencies
if: runner.target == 'x86_64-pc-windows-gnu'
run: |
sudo apt install g++-mingw-w64-x86-64 gcc-mingw-w64-x86-64
- name: "Check"
uses: actions-rs/cargo@v1
with:
use-cross: true
command: check
args: --target ${{ matrix.platform.target }} -p dora-cli
args: --target ${{ matrix.platform.target }} --all --exclude dora-node-api-python --exclude dora-operator-api-python --exclude dora-ros2-bridge-python

.github/workflows/node_hub_test.sh (+10, -21)

@@ -27,8 +27,8 @@ else
cargo test

pip install "maturin[zig]"
maturin build --zig --release
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
maturin build --zig
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel on multiple platforms
if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
# Free up ubuntu space
sudo apt-get clean
@@ -37,29 +37,18 @@ else
sudo rm -rf /opt/ghc/

maturin publish --skip-existing --zig
fi

# aarch64-unknown-linux-gnu
rustup target add aarch64-unknown-linux-gnu
maturin build --target aarch64-unknown-linux-gnu --zig --release
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
# aarch64-unknown-linux-gnu
rustup target add aarch64-unknown-linux-gnu
maturin publish --target aarch64-unknown-linux-gnu --skip-existing --zig
fi
# armv7-unknown-linux-musleabihf
rustup target add armv7-unknown-linux-musleabihf
maturin build --target armv7-unknown-linux-musleabihf --zig --release
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
# armv7-unknown-linux-musleabihf
rustup target add armv7-unknown-linux-musleabihf
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
maturin publish --target armv7-unknown-linux-musleabihf --skip-existing --zig
fi

# x86_64-pc-windows-gnu
rustup target add x86_64-pc-windows-gnu
maturin build --target x86_64-pc-windows-gnu --release
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
if [ "$GITHUB_EVENT_NAME" == "release" ] || [ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]; then
# x86_64-pc-windows-gnu
rustup target add x86_64-pc-windows-gnu
# If GITHUB_EVENT_NAME is release or workflow_dispatch, publish the wheel
maturin publish --target x86_64-pc-windows-gnu --skip-existing
fi



_typos.toml (+1, -0)

@@ -1,3 +1,4 @@
[default.extend-identifiers]
# *sigh* this just isn't worth the cost of fixing
DeviceNDArray = "DeviceNDArray"
Feedforward_2nd_Gain = "Feedforward_2nd_Gain"

apis/c++/node/build.rs (+20, -12)

@@ -9,10 +9,7 @@ fn main() {
println!("cargo:rerun-if-changed=src/lib.rs");

// rename header files
let src_dir = target_dir()
.join("cxxbridge")
.join("dora-node-api-cxx")
.join("src");
let src_dir = origin_dir();
let target_dir = src_dir.parent().unwrap();
std::fs::copy(src_dir.join("lib.rs.h"), target_dir.join("dora-node-api.h")).unwrap();
std::fs::copy(
@@ -28,8 +25,8 @@ fn main() {
bridge_files.clear();
}

fn target_dir() -> PathBuf {
std::env::var("CARGO_TARGET_DIR")
fn origin_dir() -> PathBuf {
let default_target = std::env::var("CARGO_TARGET_DIR")
.map(PathBuf::from)
.unwrap_or_else(|_| {
let root = Path::new(env!("CARGO_MANIFEST_DIR"))
@@ -37,12 +34,26 @@ fn target_dir() -> PathBuf {
.nth(3)
.unwrap();
root.join("target")
})
});
let cross_target = default_target
.join(std::env::var("TARGET").unwrap())
.join("cxxbridge")
.join("dora-node-api-cxx")
.join("src");

if cross_target.exists() {
cross_target
} else {
default_target
.join("cxxbridge")
.join("dora-node-api-cxx")
.join("src")
}
}

#[cfg(feature = "ros2-bridge")]
mod ros2 {
use super::target_dir;
use super::origin_dir;
use std::{
io::{BufRead, BufReader},
path::{Component, Path, PathBuf},
@@ -113,10 +124,7 @@ mod ros2 {
.join("ros2_bindings.rs.cc");

// copy message files to target directory
let target_path = target_dir()
.join("cxxbridge")
.join("dora-node-api-cxx")
.join("dora-ros2-bindings.h");
let target_path = origin_dir().parent().unwrap().join("dora-ros2-bindings.h");

std::fs::copy(&header_path, &target_path).unwrap();
println!("cargo:rerun-if-changed={}", header_path.display());


examples/llm/qwen-dev-interruption.yml (+59, -0)

@@ -0,0 +1,59 @@
nodes:
- id: dora-microphone
build: pip install -e ../../node-hub/dora-microphone
path: dora-microphone
inputs:
tick: dora/timer/millis/2000
outputs:
- audio

- id: dora-vad
build: pip install -e ../../node-hub/dora-vad
path: dora-vad
inputs:
audio: dora-microphone/audio
outputs:
- audio
- timestamp_start

- id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper
inputs:
input: dora-vad/audio
outputs:
- text
env:
TARGET_LANGUAGE: english

- id: dora-qwen
build: pip install -e ../../node-hub/dora-qwen
path: dora-qwen
inputs:
text: dora-distil-whisper/text
outputs:
- text

- id: plot
build: pip install -e ../../node-hub/dora-rerun
path: dora-rerun
inputs:
text_qwen: dora-qwen/text
text_whisper: dora-distil-whisper/text

- id: dora-kokoro-tts
build: pip install -e ../../node-hub/dora-kokoro-tts
path: dora-kokoro-tts
inputs:
text: dora-qwen/text
outputs:
- audio
env:
ACTIVATION_WORDS: you

- id: dora-pyaudio
build: pip install -e ../../node-hub/dora-pyaudio
path: dora-pyaudio
inputs:
audio: dora-kokoro-tts/audio
timestamp_start: dora-vad/timestamp_start
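
For reference, the timestamp_start output of dora-vad tells dora-pyaudio when the user started speaking, so playback of the assistant's reply can be cut short. A minimal consumer sketch, assuming the dora Python API used by the nodes in this diff; play and stop_playback are hypothetical stand-ins, not dora-pyaudio functions:

    import time

    from dora import Node  # same API as the nodes in this diff

    def stop_playback():
        """Hypothetical stand-in for cutting the speaker output."""

    def play(samples):
        """Hypothetical stand-in for queueing samples on the speaker."""

    def main():
        node = Node()
        playback_started_at = None  # wall-clock time the current TTS clip started

        for event in node:
            if event["type"] != "INPUT":
                continue
            if event["id"] == "timestamp_start":
                # Speech began while a reply was playing: treat it as an interruption.
                if playback_started_at is not None:
                    stop_playback()
                    playback_started_at = None
            elif event["id"] == "audio":
                playback_started_at = time.time()
                play(event["value"].to_numpy())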

examples/llm/qwen-dev.yml (+1, -2)

@@ -14,12 +14,12 @@ nodes:
audio: dora-microphone/audio
outputs:
- audio
- timestamp_start

- id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper
inputs:
text_noise: dora-qwen/text
input: dora-vad/audio
outputs:
- text
@@ -56,4 +56,3 @@ nodes:
path: dora-pyaudio
inputs:
audio: dora-kokoro-tts/audio
timestamp_start: dora-vad/timestamp_start

node-hub/dora-distil-whisper/dora_distil_whisper/main.py (+123, -24)

@@ -1,7 +1,9 @@
"""TODO: Add docstring."""

import os
import re
import sys
import time
from pathlib import Path

import pyarrow as pa
@@ -13,6 +15,79 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])


def remove_text_noise(text: str, text_noise="") -> str:
"""Remove noise from text.

Args:
text (str): Original text
text_noise (str): text to remove from the original text

Returns:
str: Cleaned text

"""
# Handle the case where text_noise is empty
if not text_noise.strip():
return (
text # Return the original text if text_noise is empty or just whitespace
)

# Helper function to normalize text (remove punctuation, make lowercase, and handle hyphens)
def normalize(s):
# Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent
s = re.sub(r"-", " ", s)
# Remove other punctuation and convert to lowercase
s = re.sub(r"[^\w\s]", "", s).lower()
return s

# Normalize both text and text_noise
normalized_text = normalize(text)
normalized_noise = normalize(text_noise)

# Split into words
text_words = normalized_text.split()
noise_words = normalized_noise.split()

# Function to find and remove noise sequence flexibly
def remove_flexible(text_list, noise_list):
i = 0
while i <= len(text_list) - len(noise_list):
match = True
extra_words = 0
for j, noise_word in enumerate(noise_list):
if i + j + extra_words >= len(text_list):
match = False
break
# Allow skipping extra words in text_list
while (
i + j + extra_words < len(text_list)
and text_list[i + j + extra_words] != noise_word
):
extra_words += 1
if i + j + extra_words >= len(text_list):
match = False
break
if not match:
break
if match:
# Remove matched part
del text_list[i : i + len(noise_list) + extra_words]
i = max(0, i - len(noise_list)) # Adjust index after removal
else:
i += 1
return text_list

# Only remove parts of text_noise that are found in text
cleaned_words = text_words[:]
for noise_word in noise_words:
if noise_word in cleaned_words:
cleaned_words.remove(noise_word)

# Reconstruct the cleaned text
cleaned_text = " ".join(cleaned_words)
return cleaned_text


def load_model():
"""TODO: Add docstring."""
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
@@ -69,6 +144,7 @@ BAD_SENTENCES = [
" Sous-titrage Société Radio-Canada",
" Sous",
" Sous-",
" i'm going to go to the next one.",
]


@@ -109,36 +185,59 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
def main():
"""TODO: Add docstring."""
node = Node()

text_noise = ""
noise_timestamp = time.time()
# For macos use mlx:
if sys.platform != "darwin":
pipe = load_model()

for event in node:
if event["type"] == "INPUT":
audio = event["value"].to_numpy()
confg = (
{"language": TARGET_LANGUAGE, "task": "translate"}
if TRANSLATE
else {
"language": TARGET_LANGUAGE,
}
)
if sys.platform == "darwin":
import mlx_whisper

result = mlx_whisper.transcribe(
audio,
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
append_punctuations=".",
if "text_noise" in event["id"]:
text_noise = event["value"][0].as_py()
text_noise = (
text_noise.replace("(", "")
.replace(")", "")
.replace("[", "")
.replace("]", "")
)
noise_timestamp = time.time()
else:
result = pipe(
audio,
generate_kwargs=confg,
audio = event["value"].to_numpy()
confg = (
{"language": TARGET_LANGUAGE, "task": "translate"}
if TRANSLATE
else {
"language": TARGET_LANGUAGE,
}
)
if sys.platform == "darwin":
import mlx_whisper

result = mlx_whisper.transcribe(
audio,
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
append_punctuations=".",
)

else:
result = pipe(
audio,
generate_kwargs=confg,
)
if result["text"] in BAD_SENTENCES:
continue
text = cut_repetition(result["text"])

# Remove noise filter after some time
if time.time() - noise_timestamp > (len(text_noise.split()) / 2): # WPS
text_noise = ""

## Remove text noise independently of casing
text = remove_text_noise(text, text_noise)

if text.strip() == "" or text.strip() == ".":
continue
node.send_output(
"text", pa.array([text]), {"language": TARGET_LANGUAGE}
)
if result["text"] in BAD_SENTENCES:
continue
text = cut_repetition(result["text"])
node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE})

node-hub/dora-distil-whisper/pyproject.toml (+11, -11)

@@ -2,8 +2,8 @@
name = "dora-distil-whisper"
version = "0.3.10"
authors = [
{ name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
{ name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
{ name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
{ name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
]
description = "Dora dora-distil-whisper"
license = { text = "MIT" }
@@ -11,14 +11,14 @@ readme = "README.md"
requires-python = ">=3.8"

dependencies = [
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"pyarrow >= 5.0.0",
"transformers >= 4.0.0",
"accelerate >= 0.29.2",
"torch >= 2.2.0",
"modelscope >= 1.18.1",
"mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"pyarrow >= 5.0.0",
"transformers >= 4.0.0",
"accelerate >= 0.29.2",
"torch >= 2.2.0",
"modelscope >= 1.18.1",
"mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
]


@@ -30,5 +30,5 @@ dora-distil-whisper = "dora_distil_whisper.main:main"

[tool.ruff.lint]
extend-select = [
"D", # pydocstyle
"D", # pydocstyle
]

node-hub/dora-qwen/dora_qwen/main.py (+21, -9)

@@ -18,7 +18,9 @@ def get_model_gguf():
from llama_cpp import Llama

llm = Llama.from_pretrained(
repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False,
repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
filename="*fp16.gguf",
verbose=False,
)
return llm

@@ -36,7 +38,9 @@ def get_model_huggingface():
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype="auto", device_map="auto",
model_name,
torch_dtype="auto",
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
return model, tokenizer
@@ -49,7 +53,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str:
"""TODO: Add docstring."""
history += [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(
history, tokenize=False, add_generation_prompt=True,
history,
tokenize=False,
add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=512)
@@ -66,9 +72,9 @@ def main():
"""TODO: Add docstring."""
history = []
# If OS is not Darwin, use Huggingface model
if sys.platform != "":
if sys.platform == "darwin":
model = get_model_gguf()
elif sys.platform == "huggingface":
elif sys.platform == "linux":
model, tokenizer = get_model_huggingface()
else:
model, tokenizer = get_model_darwin()
@@ -83,7 +89,7 @@ def main():

if any(word in ACTIVATION_WORDS for word in words):
# On linux, Windows
if sys.platform != "":
if sys.platform == "darwin":
response = model(
f"Q: {text} A: ", # Prompt
max_tokens=24,
@@ -92,17 +98,23 @@ def main():
"\n",
], # Stop generating just before the model would generate a new question
)["choices"][0]["text"]
elif sys.platform == "huggingface":
elif sys.platform == "linux":
response, history = generate_hf(model, tokenizer, text, history)
else:
from mlx_lm import generate

response = generate(
model, tokenizer, prompt=text, verbose=False, max_tokens=50,
model,
tokenizer,
prompt=text,
verbose=False,
max_tokens=50,
)

node.send_output(
output_id="text", data=pa.array([response]), metadata={},
output_id="text",
data=pa.array([response]),
metadata={},
)
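
The corrected branches compare against real sys.platform values: "darwin" on macOS, "linux" on Linux, "win32" on Windows (the old comparisons against "huggingface" and "" did not correspond to real platform names). A minimal sketch of the dispatch, with backend labels mirroring the helpers in this file:

    import sys

    # sys.platform values: "darwin" (macOS), "linux", "win32" (Windows).
    if sys.platform == "darwin":
        backend = "llama-cpp GGUF"  # get_model_gguf()
    elif sys.platform == "linux":
        backend = "transformers"    # get_model_huggingface()
    else:
        backend = "mlx"             # get_model_darwin()
    print(f"{sys.platform}: {backend}")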




node-hub/dora-reachy1/dora_reachy1/main.py (+1, -2)

@@ -47,7 +47,6 @@ def sad_antennas(reachy):


def main():

node = Node()

ROBOT_IP = os.getenv("ROBOT_IP", "10.42.0.24")
@@ -109,7 +108,7 @@ def main():
reachy.joints.r_gripper.goal_position = goal
time.sleep(0.02)

# When openning the gripper always go to default pose
# When opening the gripper always go to default pose
if action == -100:
goto(
{

