From 2d12aa0f940a007ee12daaa9a9f1820c40e28301 Mon Sep 17 00:00:00 2001
From: haixuantao
Date: Sun, 2 Mar 2025 13:45:52 +0100
Subject: [PATCH] Fix kokoro demo with GGUF-based inference of Qwen2.5 0.5B

---
 examples/speech-to-speech/README.md              |  7 ++--
 .../vlm/qwen2-5-vl-speech-to-speech-dev.yml      | 14 +++----
 .../dora-kokoro-tts/dora_kokoro_tts/main.py      |  2 -
 node-hub/dora-qwen/dora_qwen/main.py             | 41 +++++++++++++------
 node-hub/dora-qwen/pyproject.toml                | 17 ++++++++
 5 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/examples/speech-to-speech/README.md b/examples/speech-to-speech/README.md
index 1c1da030..39817609 100644
--- a/examples/speech-to-speech/README.md
+++ b/examples/speech-to-speech/README.md
@@ -3,8 +3,7 @@
 Make sure to have dora, pip, and cargo installed.
 
 ```bash
-dora build https://raw.githubusercontent.com/dora-rs/dora/main/examples/speech-to-speech/outtetts.yml
-dora run https://raw.githubusercontent.com/dora-rs/dora/main/examples/speech-to-speech/outtetts.yml
-
-# Wait for models to download which can takes a bit of time.
+uv venv --seed -p 3.11
+dora build kokoro-dev.yml
+dora run kokoro-dev.yml
 ```
diff --git a/examples/vlm/qwen2-5-vl-speech-to-speech-dev.yml b/examples/vlm/qwen2-5-vl-speech-to-speech-dev.yml
index 7e16d802..141cb4ea 100755
--- a/examples/vlm/qwen2-5-vl-speech-to-speech-dev.yml
+++ b/examples/vlm/qwen2-5-vl-speech-to-speech-dev.yml
@@ -3,7 +3,7 @@ nodes:
     build: pip install opencv-video-capture
     path: opencv-video-capture
     inputs:
-      tick: dora/timer/millis/100
+      tick: dora/timer/millis/1000
     outputs:
       - image
     env:
@@ -37,7 +37,7 @@ nodes:
   - id: dora-qwenvl
     build: pip install -e ../../node-hub/dora-qwen2-5-vl
-    path: dora-qwenvl
+    path: dora-qwen2-5-vl
     inputs:
       image: camera/image
       text: dora-distil-whisper/text
     outputs:
       - text
     env:
       DEFAULT_QUESTION: Describe the image in three words.
-      HISTORY: True
+      IMAGE_RESIZE_RATIO: "0.5"
 
   - id: plot
     build: pip install dora-rerun
@@ -55,9 +55,9 @@ nodes:
     inputs:
       text_qwenvl: dora-qwenvl/text
       text_whisper: dora-distil-whisper/text
-  - id: dora-outtetts
-    build: pip install -e ../../node-hub/dora-outtetts
-    path: dora-outtetts
+  - id: dora-kokoro-tts
+    build: pip install -e ../../node-hub/dora-kokoro-tts
+    path: dora-kokoro-tts
     inputs:
       text: dora-qwenvl/text
     outputs:
@@ -67,4 +67,4 @@ nodes:
     build: pip install -e ../../node-hub/dora-pyaudio
     path: dora-pyaudio
    inputs:
-      audio: dora-outtetts/audio
+      audio: dora-kokoro-tts/audio
diff --git a/node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py b/node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py
index 1e4591d6..cccd7ebb 100644
--- a/node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py
+++ b/node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py
@@ -21,8 +21,6 @@ def main():
         )
         for i, (gs, ps, audio) in enumerate(generator):
             audio = audio.numpy()
-            print("audio detected")
-            sf.write(f"text_{i}.wav", audio, 24000)  # save each audio file
             node.send_output("audio", pa.array(audio), {"sample_rate": 24000})
diff --git a/node-hub/dora-qwen/dora_qwen/main.py b/node-hub/dora-qwen/dora_qwen/main.py
index d8fe3b0b..7ca76f82 100644
--- a/node-hub/dora-qwen/dora_qwen/main.py
+++ b/node-hub/dora-qwen/dora_qwen/main.py
@@ -11,6 +11,15 @@ SYSTEM_PROMPT = os.getenv(
 )
 
 
+def get_model_gguf():
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False
+    )
+    return llm
+
+
 def get_model_darwin():
     from mlx_lm import load  # noqa
 
@@ -28,7 +37,7 @@ def get_model_huggingface():
     return model, tokenizer
 
 
-TRIGGER_WORDS = ["you", "wh", "tu"]
+ACTIVATION_WORDS = ["what", "how", "who", "where", "you"]
 
 
 def generate_hf(model, tokenizer, prompt: str, history) -> str:
@@ -48,16 +57,11 @@ def main():
-    if SYSTEM_PROMPT != "":
-        history = [
-            {
-                "role": "system",
-                "content": SYSTEM_PROMPT,
-            },
-        ]
-
+    history = []
-    # If OS is not Darwin, use Huggingface model
-    if sys.platform != "darwin":
+    # GGUF inference is used on every platform for now; the branches below are unreachable.
+    if sys.platform != "":
+        model = get_model_gguf()
+    elif sys.platform == "huggingface":
         model, tokenizer = get_model_huggingface()
     else:
         model, tokenizer = get_model_darwin()
@@ -68,10 +72,21 @@ def main():
         if event["type"] == "INPUT":
             # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
             text = event["value"][0].as_py()
-            if True:
-                # On linux, Windows
-                if sys.platform != "darwin":
-                    response, history = generate_hf(text, history)
+            words = text.lower().split()
+
+            if any(word in ACTIVATION_WORDS for word in words):
+                # GGUF inference path (used on every platform for now)
+                if sys.platform != "":
+                    response = model(
+                        f"Q: {text} A: ",  # prompt
+                        max_tokens=24,
+                        stop=[
+                            "Q:",
+                            "\n",
+                        ],  # stop just before the model would generate a new question
+                    )["choices"][0]["text"]
+                elif sys.platform == "huggingface":
+                    response, history = generate_hf(model, tokenizer, text, history)
                 else:
                     from mlx_lm import generate
diff --git a/node-hub/dora-qwen/pyproject.toml b/node-hub/dora-qwen/pyproject.toml
index 24dfab42..ff490d4b 100644
--- a/node-hub/dora-qwen/pyproject.toml
+++ b/node-hub/dora-qwen/pyproject.toml
@@ -17,10 +17,27 @@ dependencies = [
     "accelerate>=1.3.0",
     "transformers",
     "mlx-lm>=0.21.1; sys_platform == 'darwin'",
+    "llama-cpp-python",
+]
+
+[tool.uv.sources]
+llama-cpp-python = [
+    { index = "llama_cpp_python_metal", marker = "sys_platform == 'darwin'" },
+    { index = "llama_cpp_python_cu121", marker = "sys_platform == 'linux'" },
 ]
 
 [dependency-groups]
 dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
 
+[[tool.uv.index]]
+name = "llama_cpp_python_cu121"
+url = "https://abetlen.github.io/llama-cpp-python/whl/cu121"
+explicit = true
+
+[[tool.uv.index]]
+name = "llama_cpp_python_metal"
+url = "https://abetlen.github.io/llama-cpp-python/whl/metal"
+explicit = true
+
 [project.scripts]
 dora-qwen = "dora_qwen.main:main"
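
Reviewer note: the snippet below is not part of the patch. It is a minimal standalone sketch for sanity-checking the GGUF path this commit introduces, mirroring `get_model_gguf()` and the completion call above outside of a dora dataflow. The `query` string is just an example input; the first run downloads the fp16 GGUF weights from Hugging Face.

```python
# Standalone sanity check mirroring the llama-cpp-python path added above.
from llama_cpp import Llama

# Same model repo and file pattern as get_model_gguf(); downloads on first run.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
    filename="*fp16.gguf",
    verbose=False,
)

query = "What is a dataflow?"  # example input; contains the activation word "what"
result = llm(
    f"Q: {query} A: ",  # same Q/A prompt format as the node
    max_tokens=24,
    stop=["Q:", "\n"],  # stop before the model starts a new question
)
print(result["choices"][0]["text"])
```

If this prints a sensible answer, the model download and the uv wheel-index configuration in `pyproject.toml` are working.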