Browse Source

Add noise filtering on whisper to be able to use speakers

tags/v0.3.11-rc1
haixuanTao 10 months ago
parent
commit
0e3f89ac59
5 changed files with 205 additions and 46 deletions
  1. +59
    -0
      examples/llm/qwen-dev-interruption.yml
  2. +1
    -2
      examples/llm/qwen-dev.yml
  3. +113
    -24
      node-hub/dora-distil-whisper/dora_distil_whisper/main.py
  4. +11
    -11
      node-hub/dora-distil-whisper/pyproject.toml
  5. +21
    -9
      node-hub/dora-qwen/dora_qwen/main.py

+ 59
- 0
examples/llm/qwen-dev-interruption.yml View File

@@ -0,0 +1,59 @@
nodes:
- id: dora-microphone
build: pip install -e ../../node-hub/dora-microphone
path: dora-microphone
inputs:
tick: dora/timer/millis/2000
outputs:
- audio

- id: dora-vad
build: pip install -e ../../node-hub/dora-vad
path: dora-vad
inputs:
audio: dora-microphone/audio
outputs:
- audio
- timestamp_start

- id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper
inputs:
input: dora-vad/audio
outputs:
- text
env:
TARGET_LANGUAGE: english

- id: dora-qwen
build: pip install -e ../../node-hub/dora-qwen
path: dora-qwen
inputs:
text: dora-distil-whisper/text
outputs:
- text

- id: plot
build: pip install -e ../../node-hub/dora-rerun
path: dora-rerun
inputs:
text_qwen: dora-qwen/text
text_whisper: dora-distil-whisper/text

- id: dora-kokoro-tts
build: pip install -e ../../node-hub/dora-kokoro-tts
path: dora-kokoro-tts
inputs:
text: dora-qwen/text
outputs:
- audio
env:
ACTIVATION_WORDS: you

- id: dora-pyaudio
build: pip install -e ../../node-hub/dora-pyaudio
path: dora-pyaudio
inputs:
audio: dora-kokoro-tts/audio
timestamp_start: dora-vad/timestamp_start

+ 1
- 2
examples/llm/qwen-dev.yml View File

@@ -14,12 +14,12 @@ nodes:
audio: dora-microphone/audio audio: dora-microphone/audio
outputs: outputs:
- audio - audio
- timestamp_start


- id: dora-distil-whisper - id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper path: dora-distil-whisper
inputs: inputs:
text_noise: dora-qwen/text
input: dora-vad/audio input: dora-vad/audio
outputs: outputs:
- text - text
@@ -56,4 +56,3 @@ nodes:
path: dora-pyaudio path: dora-pyaudio
inputs: inputs:
audio: dora-kokoro-tts/audio audio: dora-kokoro-tts/audio
timestamp_start: dora-vad/timestamp_start

+ 113
- 24
node-hub/dora-distil-whisper/dora_distil_whisper/main.py View File

@@ -1,7 +1,9 @@
"""TODO: Add docstring.""" """TODO: Add docstring."""


import os import os
import re
import sys import sys
import time
from pathlib import Path from pathlib import Path


import pyarrow as pa import pyarrow as pa
@@ -13,6 +15,69 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"]) TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])




def remove_text_noise(text: str, text_noise: str) -> str:
    """Remove the words of ``text_noise`` from ``text``.

    Both strings are normalized before comparison: hyphens become spaces,
    any other character that is neither a word character nor whitespace is
    dropped, and everything is lowercased. The match is therefore
    insensitive to casing and punctuation.

    Each word of ``text_noise`` cancels at most one occurrence of the same
    word in ``text`` (the earliest one still present). Words that appear
    more often in ``text`` than in ``text_noise`` keep their surplus
    occurrences, and the order of the remaining words is preserved.

    Args:
        text: The transcription to clean.
        text_noise: The text to filter out of ``text``.

    Returns:
        ``text`` unchanged when ``text_noise`` is empty or whitespace only.
        Otherwise the remaining words joined by single spaces; note that
        this result is in normalized form (lowercase, punctuation removed),
        and may be the empty string when every word was filtered out.

    """
    # Nothing to filter: hand the transcription back untouched.
    if not text_noise.strip():
        return text

    def normalize(s: str) -> str:
        """Lowercase ``s``, turn hyphens into spaces and drop punctuation."""
        # Replace hyphens with spaces to treat "Notre-Dame" and "notre dame"
        # as equivalent.
        s = re.sub(r"-", " ", s)
        # Remove other punctuation and convert to lowercase.
        return re.sub(r"[^\w\s]", "", s).lower()

    cleaned_words = normalize(text).split()
    for noise_word in normalize(text_noise).split():
        # Only remove noise words that are actually present; each one
        # consumes a single occurrence so repeated words are not over-removed.
        if noise_word in cleaned_words:
            cleaned_words.remove(noise_word)

    # Reconstruct the cleaned text.
    return " ".join(cleaned_words)


def load_model(): def load_model():
"""TODO: Add docstring.""" """TODO: Add docstring."""
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
@@ -69,6 +134,7 @@ BAD_SENTENCES = [
" Sous-titrage Société Radio-Canada", " Sous-titrage Société Radio-Canada",
" Sous", " Sous",
" Sous-", " Sous-",
" i'm going to go to the next one.",
] ]




@@ -109,36 +175,59 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
def main(): def main():
"""TODO: Add docstring.""" """TODO: Add docstring."""
node = Node() node = Node()

text_noise = ""
noise_timestamp = time.time()
# For macos use mlx: # For macos use mlx:
if sys.platform != "darwin": if sys.platform != "darwin":
pipe = load_model() pipe = load_model()


for event in node: for event in node:
if event["type"] == "INPUT": if event["type"] == "INPUT":
audio = event["value"].to_numpy()
confg = (
{"language": TARGET_LANGUAGE, "task": "translate"}
if TRANSLATE
else {
"language": TARGET_LANGUAGE,
}
)
if sys.platform == "darwin":
import mlx_whisper

result = mlx_whisper.transcribe(
audio,
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
append_punctuations=".",
if "text_noise" in event["id"]:
text_noise = event["value"][0].as_py()
text_noise = (
text_noise.replace("(", "")
.replace(")", "")
.replace("[", "")
.replace("]", "")
) )
noise_timestamp = time.time()
else: else:
result = pipe(
audio,
generate_kwargs=confg,
audio = event["value"].to_numpy()
confg = (
{"language": TARGET_LANGUAGE, "task": "translate"}
if TRANSLATE
else {
"language": TARGET_LANGUAGE,
}
)
if sys.platform == "darwin":
import mlx_whisper

result = mlx_whisper.transcribe(
audio,
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
append_punctuations=".",
)

else:
result = pipe(
audio,
generate_kwargs=confg,
)
if result["text"] in BAD_SENTENCES:
continue
text = cut_repetition(result["text"])

# Remove noise filter after some time
if time.time() - noise_timestamp > (len(text_noise.split()) / 2): # WPS
text_noise = ""

## Remove text noise independently of casing
text = remove_text_noise(text, text_noise)

if text.strip() == "" or text.strip() == ".":
continue
node.send_output(
"text", pa.array([text]), {"language": TARGET_LANGUAGE}
) )
if result["text"] in BAD_SENTENCES:
continue
text = cut_repetition(result["text"])
node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE})

+ 11
- 11
node-hub/dora-distil-whisper/pyproject.toml View File

@@ -2,8 +2,8 @@
name = "dora-distil-whisper" name = "dora-distil-whisper"
version = "0.3.10" version = "0.3.10"
authors = [ authors = [
{ name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
{ name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
{ name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
{ name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
] ]
description = "Dora dora-distil-whisper" description = "Dora dora-distil-whisper"
license = { text = "MIT" } license = { text = "MIT" }
@@ -11,14 +11,14 @@ readme = "README.md"
requires-python = ">=3.8" requires-python = ">=3.8"


dependencies = [ dependencies = [
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"pyarrow >= 5.0.0",
"transformers >= 4.0.0",
"accelerate >= 0.29.2",
"torch >= 2.2.0",
"modelscope >= 1.18.1",
"mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
"dora-rs >= 0.3.9",
"numpy < 2.0.0",
"pyarrow >= 5.0.0",
"transformers >= 4.0.0",
"accelerate >= 0.29.2",
"torch >= 2.2.0",
"modelscope >= 1.18.1",
"mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
] ]




@@ -30,5 +30,5 @@ dora-distil-whisper = "dora_distil_whisper.main:main"


[tool.ruff.lint] [tool.ruff.lint]
extend-select = [ extend-select = [
"D", # pydocstyle
"D", # pydocstyle
] ]

+ 21
- 9
node-hub/dora-qwen/dora_qwen/main.py View File

@@ -18,7 +18,9 @@ def get_model_gguf():
from llama_cpp import Llama from llama_cpp import Llama


llm = Llama.from_pretrained( llm = Llama.from_pretrained(
repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False,
repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
filename="*fp16.gguf",
verbose=False,
) )
return llm return llm


@@ -36,7 +38,9 @@ def get_model_huggingface():
model_name = "Qwen/Qwen2.5-0.5B-Instruct" model_name = "Qwen/Qwen2.5-0.5B-Instruct"


model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype="auto", device_map="auto",
model_name,
torch_dtype="auto",
device_map="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
return model, tokenizer return model, tokenizer
@@ -49,7 +53,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str:
"""TODO: Add docstring.""" """TODO: Add docstring."""
history += [{"role": "user", "content": prompt}] history += [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template( text = tokenizer.apply_chat_template(
history, tokenize=False, add_generation_prompt=True,
history,
tokenize=False,
add_generation_prompt=True,
) )
model_inputs = tokenizer([text], return_tensors="pt").to(model.device) model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=512) generated_ids = model.generate(**model_inputs, max_new_tokens=512)
@@ -66,9 +72,9 @@ def main():
"""TODO: Add docstring.""" """TODO: Add docstring."""
history = [] history = []
# If OS is not Darwin, use Huggingface model # If OS is not Darwin, use Huggingface model
if sys.platform != "":
if sys.platform == "darwin":
model = get_model_gguf() model = get_model_gguf()
elif sys.platform == "huggingface":
elif sys.platform == "linux":
model, tokenizer = get_model_huggingface() model, tokenizer = get_model_huggingface()
else: else:
model, tokenizer = get_model_darwin() model, tokenizer = get_model_darwin()
@@ -83,7 +89,7 @@ def main():


if any(word in ACTIVATION_WORDS for word in words): if any(word in ACTIVATION_WORDS for word in words):
# On linux, Windows # On linux, Windows
if sys.platform != "":
if sys.platform == "darwin":
response = model( response = model(
f"Q: {text} A: ", # Prompt f"Q: {text} A: ", # Prompt
max_tokens=24, max_tokens=24,
@@ -92,17 +98,23 @@ def main():
"\n", "\n",
], # Stop generating just before the model would generate a new question ], # Stop generating just before the model would generate a new question
)["choices"][0]["text"] )["choices"][0]["text"]
elif sys.platform == "huggingface":
elif sys.platform == "linux":
response, history = generate_hf(model, tokenizer, text, history) response, history = generate_hf(model, tokenizer, text, history)
else: else:
from mlx_lm import generate from mlx_lm import generate


response = generate( response = generate(
model, tokenizer, prompt=text, verbose=False, max_tokens=50,
model,
tokenizer,
prompt=text,
verbose=False,
max_tokens=50,
) )


node.send_output( node.send_output(
output_id="text", data=pa.array([response]), metadata={},
output_id="text",
data=pa.array([response]),
metadata={},
) )






Loading…
Cancel
Save