From 2d12aa0f940a007ee12daaa9a9f1820c40e28301 Mon Sep 17 00:00:00 2001
From: haixuantao
Date: Sun, 2 Mar 2025 13:45:52 +0100
Subject: [PATCH] Fix kokoro demo with GGUF-based inference of Qwen2.5 0.5B

---
 examples/speech-to-speech/README.md              |  7 ++--
 .../vlm/qwen2-5-vl-speech-to-speech-dev.yml      | 14 +++----
 .../dora-kokoro-tts/dora_kokoro_tts/main.py      |  2 -
 node-hub/dora-qwen/dora_qwen/main.py             | 41 +++++++++++++------
 node-hub/dora-qwen/pyproject.toml                | 17 ++++++++
 5 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/examples/speech-to-speech/README.md b/examples/speech-to-speech/README.md
index 1c1da030..39817609 100644
--- a/examples/speech-to-speech/README.md
+++ b/examples/speech-to-speech/README.md
@@ -3,8 +3,7 @@
 Make sure to have dora, pip, and cargo installed.
 
 ```bash
-dora build https://raw.githubusercontent.com/dora-rs/dora/main/examples/speech-to-speech/outtetts.yml
-dora run https://raw.githubusercontent.com/dora-rs/dora/main/examples/speech-to-speech/outtetts.yml
-
-# Wait for models to download which can takes a bit of time.
+uv venv --seed -p 3.11
+dora build kokoro-dev.yml
+dora run kokoro-dev.yml
 ```
diff --git a/examples/vlm/qwen2-5-vl-speech-to-speech-dev.yml b/examples/vlm/qwen2-5-vl-speech-to-speech-dev.yml
index 7e16d802..141cb4ea 100755
--- a/examples/vlm/qwen2-5-vl-speech-to-speech-dev.yml
+++ b/examples/vlm/qwen2-5-vl-speech-to-speech-dev.yml
@@ -3,7 +3,7 @@ nodes:
     build: pip install opencv-video-capture
     path: opencv-video-capture
     inputs:
-      tick: dora/timer/millis/100
+      tick: dora/timer/millis/1000
     outputs:
       - image
     env:
@@ -37,7 +37,7 @@ nodes:
   - id: dora-qwenvl
     build: pip install -e ../../node-hub/dora-qwen2-5-vl
-    path: dora-qwenvl
+    path: dora-qwen2-5-vl
     inputs:
       image: camera/image
       text: dora-distil-whisper/text
     outputs:
       - text
     env:
       DEFAULT_QUESTION: Describe the image in three words.
-      HISTORY: True
+      IMAGE_RESIZE_RATIO: "0.5"
 
   - id: plot
     build: pip install dora-rerun
@@ -55,9 +55,9 @@ nodes:
     inputs:
       text_qwenvl: dora-qwenvl/text
       text_whisper: dora-distil-whisper/text
-  - id: dora-outtetts
-    build: pip install -e ../../node-hub/dora-outtetts
-    path: dora-outtetts
+  - id: dora-kokoro-tts
+    build: pip install -e ../../node-hub/dora-kokoro-tts
+    path: dora-kokoro-tts
     inputs:
       text: dora-qwenvl/text
     outputs:
@@ -67,4 +67,4 @@ nodes:
     build: pip install -e ../../node-hub/dora-pyaudio
     path: dora-pyaudio
    inputs:
-      audio: dora-outtetts/audio
+      audio: dora-kokoro-tts/audio
diff --git a/node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py b/node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py
index 1e4591d6..cccd7ebb 100644
--- a/node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py
+++ b/node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py
@@ -21,8 +21,6 @@ def main():
         )
         for i, (gs, ps, audio) in enumerate(generator):
             audio = audio.numpy()
-            print("audio detected")
-            sf.write(f"text_{i}.wav", audio, 24000)  # save each audio file
             node.send_output("audio", pa.array(audio), {"sample_rate": 24000})
diff --git a/node-hub/dora-qwen/dora_qwen/main.py b/node-hub/dora-qwen/dora_qwen/main.py
index d8fe3b0b..7ca76f82 100644
--- a/node-hub/dora-qwen/dora_qwen/main.py
+++ b/node-hub/dora-qwen/dora_qwen/main.py
@@ -11,6 +11,15 @@ SYSTEM_PROMPT = os.getenv(
 )
 
 
+def get_model_gguf():
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False
+    )
+    return llm
+
+
 def get_model_darwin():
     from mlx_lm import load  # noqa
 
@@ -28,7 +37,7 @@ def get_model_huggingface():
     return model, tokenizer
 
 
-TRIGGER_WORDS = ["you", "wh", "tu"]
+ACTIVATION_WORDS = ["what", "how", "who", "where", "you"]
 
 
 def generate_hf(model, tokenizer, prompt: str, history) -> str:
@@ -48,16 +57,11 @@ def main():
-    if SYSTEM_PROMPT != "":
-        history = [
-            {
-                "role": "system",
-                "content": SYSTEM_PROMPT,
-            },
-        ]
-
+    history = []
-    # If OS is not Darwin, use Huggingface model
-    if sys.platform != "darwin":
+    # GGUF inference is used on every platform for now; the branches below are unreachable.
+    if sys.platform != "":
+        model = get_model_gguf()
+    elif sys.platform == "huggingface":
         model, tokenizer = get_model_huggingface()
     else:
         model, tokenizer = get_model_darwin()
@@ -68,10 +72,21 @@ def main():
         if event["type"] == "INPUT":
             # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
             text = event["value"][0].as_py()
-            if True:
-                # On linux, Windows
-                if sys.platform != "darwin":
-                    response, history = generate_hf(text, history)
+            words = text.lower().split()
+
+            if any(word in ACTIVATION_WORDS for word in words):
+                # GGUF inference path (used on every platform for now)
+                if sys.platform != "":
+                    response = model(
+                        f"Q: {text} A: ",  # prompt
+                        max_tokens=24,
+                        stop=[
+                            "Q:",
+                            "\n",
+                        ],  # stop just before the model would generate a new question
+                    )["choices"][0]["text"]
+                elif sys.platform == "huggingface":
+                    response, history = generate_hf(model, tokenizer, text, history)
                 else:
                     from mlx_lm import generate
diff --git a/node-hub/dora-qwen/pyproject.toml b/node-hub/dora-qwen/pyproject.toml
index 24dfab42..ff490d4b 100644
--- a/node-hub/dora-qwen/pyproject.toml
+++ b/node-hub/dora-qwen/pyproject.toml
@@ -17,10 +17,27 @@ dependencies = [
     "accelerate>=1.3.0",
     "transformers",
     "mlx-lm>=0.21.1; sys_platform == 'darwin'",
+    "llama-cpp-python",
+]
+
+[tool.uv.sources]
+llama-cpp-python = [
+    { index = "llama_cpp_python_metal", marker = "sys_platform == 'darwin'" },
+    { index = "llama_cpp_python_cu121", marker = "sys_platform == 'linux'" },
 ]
 
 [dependency-groups]
 dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
 
+[[tool.uv.index]]
+name = "llama_cpp_python_cu121"
+url = "https://abetlen.github.io/llama-cpp-python/whl/cu121"
+explicit = true
+
+[[tool.uv.index]]
+name = "llama_cpp_python_metal"
+url = "https://abetlen.github.io/llama-cpp-python/whl/metal"
+explicit = true
+
 [project.scripts]
 dora-qwen = "dora_qwen.main:main"
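
Reviewer note: the snippet below is not part of the patch. It is a minimal standalone sketch for sanity-checking the GGUF path this commit introduces, mirroring `get_model_gguf()` and the completion call above outside of a dora dataflow. The `query` string is just an example input; the first run downloads the fp16 GGUF weights from Hugging Face.

```python
# Standalone sanity check mirroring the llama-cpp-python path added above.
from llama_cpp import Llama

# Same model repo and file pattern as get_model_gguf(); downloads on first run.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
    filename="*fp16.gguf",
    verbose=False,
)

query = "What is a dataflow?"  # example input; contains the activation word "what"
result = llm(
    f"Q: {query} A: ",  # same Q/A prompt format as the node
    max_tokens=24,
    stop=["Q:", "\n"],  # stop before the model starts a new question
)
print(result["choices"][0]["text"])
```

If this prints a sensible answer, the model download and the uv wheel-index configuration in `pyproject.toml` are working.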