@@ -3,8 +3,7 @@
 Make sure to have dora, pip, and cargo installed.
 ```bash
-dora build https://raw.githubusercontent.com/dora-rs/dora/main/examples/speech-to-speech/outtetts.yml
-dora run https://raw.githubusercontent.com/dora-rs/dora/main/examples/speech-to-speech/outtetts.yml
-# Wait for models to download, which can take a bit of time.
+uv venv --seed -p 3.11
+dora build kokoro-dev.yml
+dora run kokoro-dev.yml
 ```
@@ -3,7 +3,7 @@ nodes:
     build: pip install opencv-video-capture
     path: opencv-video-capture
     inputs:
-      tick: dora/timer/millis/100
+      tick: dora/timer/millis/1000
     outputs:
       - image
     env:
| @@ -37,7 +37,7 @@ nodes: | |||
| - id: dora-qwenvl | |||
| build: pip install -e ../../node-hub/dora-qwen2-5-vl | |||
| path: dora-qwenvl | |||
| path: dora-qwen2-5-vl | |||
| inputs: | |||
| image: camera/image | |||
| text: dora-distil-whisper/text | |||
@@ -45,7 +45,7 @@ nodes:
       - text
     env:
       DEFAULT_QUESTION: Describe the image in three words.
-      HISTORY: True
+      IMAGE_RESIZE_RATIO: "0.5"
   - id: plot
     build: pip install dora-rerun
@@ -55,9 +55,9 @@ nodes:
       text_qwenvl: dora-qwenvl/text
       text_whisper: dora-distil-whisper/text
-  - id: dora-outtetts
-    build: pip install -e ../../node-hub/dora-outtetts
-    path: dora-outtetts
+  - id: dora-kokoro-tts
+    build: pip install -e ../../node-hub/dora-kokoro-tts
+    path: dora-kokoro-tts
     inputs:
       text: dora-qwenvl/text
     outputs:
@@ -67,4 +67,4 @@ nodes:
     build: pip install -e ../../node-hub/dora-pyaudio
     path: dora-pyaudio
     inputs:
-      audio: dora-outtetts/audio
+      audio: dora-kokoro-tts/audio
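For orientation, every node wired up above follows the same dora Python node pattern: iterate over events, react to `INPUT`s, and emit results with `send_output`. A minimal sketch of a text-in/audio-out node in the shape of dora-kokoro-tts — the `synthesize` helper here is a placeholder tone generator, not the actual kokoro pipeline:

```python
import numpy as np
import pyarrow as pa
from dora import Node

SAMPLE_RATE = 24000


def synthesize(text: str) -> np.ndarray:
    """Placeholder for a real TTS call: emit a short tone so the sketch runs standalone."""
    t = np.linspace(0, 0.5, SAMPLE_RATE // 2, endpoint=False)
    return (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)


def main():
    node = Node()
    for event in node:
        # Text arrives as a one-element Arrow array, matching `text: dora-qwenvl/text`
        if event["type"] == "INPUT" and event["id"] == "text":
            audio = synthesize(event["value"][0].as_py())
            # Downstream nodes (dora-pyaudio) read the sample rate from metadata
            node.send_output("audio", pa.array(audio), {"sample_rate": SAMPLE_RATE})


if __name__ == "__main__":
    main()
```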
@@ -21,8 +21,6 @@ def main():
             )
             for i, (gs, ps, audio) in enumerate(generator):
                 audio = audio.numpy()
-                print("audio detected")
-                sf.write(f"text_{i}.wav", audio, 24000)  # save each audio file
                 node.send_output("audio", pa.array(audio), {"sample_rate": 24000})
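On the consuming side, dora-pyaudio receives that Arrow array together with the `sample_rate` metadata. A rough sketch of how such a playback node can be written — an illustration of the pattern, not the actual dora-pyaudio source:

```python
import pyaudio
from dora import Node


def main():
    node = Node()
    audio_api = pyaudio.PyAudio()
    stream = None
    for event in node:
        if event["type"] == "INPUT" and event["id"] == "audio":
            samples = event["value"].to_numpy()  # float32 PCM sent by the TTS node
            rate = int(event["metadata"].get("sample_rate", 24000))
            if stream is None:
                # Open the output stream lazily, once the first chunk tells us the rate
                stream = audio_api.open(
                    format=pyaudio.paFloat32, channels=1, rate=rate, output=True
                )
            stream.write(samples.tobytes())


if __name__ == "__main__":
    main()
```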
@@ -11,6 +11,15 @@ SYSTEM_PROMPT = os.getenv(
 )
 
+
+def get_model_gguf():
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False
+    )
+    return llm
+
 
 def get_model_darwin():
     from mlx_lm import load  # noqa
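The `Llama` object returned by `get_model_gguf` is used further down in this diff as a plain completion function. Pulled out as a standalone snippet, the call pattern looks like this (same prompt format and decoding parameters as the hunk below; only the example question is made up):

```python
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False
)

out = llm(
    "Q: What is the capital of France? A: ",  # completion-style prompt
    max_tokens=24,
    stop=["Q:", "\n"],  # stop before the model starts a new question
)
print(out["choices"][0]["text"])
```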
@@ -28,7 +37,7 @@ def get_model_huggingface():
     return model, tokenizer
 
-TRIGGER_WORDS = ["you", "wh", "tu"]
+ACTIVATION_WORDS = ["what", "how", "who", "where", "you"]
 
 def generate_hf(model, tokenizer, prompt: str, history) -> str:
@@ -48,16 +57,11 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str:
 def main():
-    if SYSTEM_PROMPT != "":
-        history = [
-            {
-                "role": "system",
-                "content": SYSTEM_PROMPT,
-            },
-        ]
+    history = []
 
-    # If OS is not Darwin, use Huggingface model
-    if sys.platform != "darwin":
-        model, tokenizer = get_model_huggingface()
+    # On Linux and Windows, load the GGUF model through llama-cpp;
+    # on macOS, load the mlx model instead.
+    if sys.platform != "darwin":
+        model = get_model_gguf()
     else:
         model, tokenizer = get_model_darwin()
@@ -68,10 +72,21 @@ def main():
             if event["type"] == "INPUT":
                 # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
                 text = event["value"][0].as_py()
-                if True:
-                    if sys.platform != "darwin":
-                        response, history = generate_hf(text, history)
+                words = text.lower().split()
+                # Only respond when the transcript contains an activation word
+                if any(word in ACTIVATION_WORDS for word in words):
+                    # On Linux and Windows, run the GGUF model through llama-cpp
+                    if sys.platform != "darwin":
+                        response = model(
+                            f"Q: {text} A: ",  # completion-style prompt
+                            max_tokens=24,
+                            # Stop just before the model would generate a new question
+                            stop=["Q:", "\n"],
+                        )["choices"][0]["text"]
                     else:
                         from mlx_lm import generate
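The macOS branch is cut off by the hunk boundary. Assuming the standard mlx-lm API, a standalone sketch of that path could look like the following — the model id and the chat-template step are assumptions, not taken from this diff:

```python
from mlx_lm import generate, load

# Assumed model id; get_model_darwin() may load a different one
model, tokenizer = load("mlx-community/Qwen2.5-0.5B-Instruct-4bit")

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What do you see?"}],
    tokenize=False,
    add_generation_prompt=True,
)
response = generate(model, tokenizer, prompt=prompt, max_tokens=64)
print(response)
```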
@@ -17,10 +17,27 @@ dependencies = [
     "accelerate>=1.3.0",
     "transformers",
     "mlx-lm>=0.21.1; sys_platform == 'darwin'",
+    "llama-cpp-python",
 ]
 
+[tool.uv.sources]
+llama-cpp-python = [
+    { index = "llama_cpp_python_metal", marker = "sys_platform == 'darwin'" },
+    { index = "llama_cpp_python_cu121", marker = "sys_platform == 'linux'" },
+]
+
 [dependency-groups]
 dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
 
+[[tool.uv.index]]
+name = "llama_cpp_python_cu121"
+url = "https://abetlen.github.io/llama-cpp-python/whl/cu121"
+explicit = true
+
+[[tool.uv.index]]
+name = "llama_cpp_python_metal"
+url = "https://abetlen.github.io/llama-cpp-python/whl/metal"
+explicit = true
+
 [project.scripts]
 dora-qwen = "dora_qwen.main:main"