@@ -1,15 +1,15 @@
# dora-llama-cpp-python
A Dora node that provides access to LLaMA-based models using either llama.cpp or Hugging Face backends for text generation.
A Dora node that provides access to LLaMA models using llama-cpp-python for efficient CPU/GPU inference.
## Features
- Supports both llama.cpp (CPU) and Hugging Face (CPU/GPU) backends
- GPU acceleration support (CUDA on Linux, Metal on macOS)
- Easy integration with speech-to-text and text-to-speech pipelines
- Configurable system prompts and activation words
- Chat history support with Hugging Face models
- Lightweight CPU inference with GGUF models
- Multiple model support (Qwen, LLaMA, etc.)
- Thread-level CPU optimization
- Adjustable context window size
## Getting started
@@ -20,6 +20,7 @@ uv venv -p 3.11 --seed
uv pip install -e .
```
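To check that the llama-cpp-python wheel you ended up with was actually built with GPU support (CUDA or Metal), a quick sanity check like the one below can help. This is a minimal sketch and assumes a recent llama-cpp-python release that still exposes `llama_supports_gpu_offload`:

```python
# Minimal install check for llama-cpp-python (names assumed from recent releases).
import llama_cpp

print("llama-cpp-python version:", llama_cpp.__version__)
# True if the installed wheel was compiled with CUDA/Metal offloading enabled.
print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())
```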
## Usage
The node can be configured in your dataflow YAML file:
@@ -33,27 +34,26 @@ The node can be configured in your dataflow YAML file:
  outputs:
    - text # Generated response text
  env:
    MODEL_BACKEND: "llama-cpp" # or "huggingface"
    MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF" # For llama-cpp backend
    MODEL_FILENAME: "*fp16.gguf" # For llama-cpp backend
    HF_MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # For huggingface backend
    MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
    SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
    ACTIVATION_WORDS: "what how who where you"
    MAX_TOKENS: "512"
    N_GPU_LAYERS: "35" # Enable GPU acceleration
    N_THREADS: "4" # CPU threads
    CONTEXT_SIZE: "4096" # Maximum context window
```
### Configuration Options
- `MODEL_BACKEND`: Choose between:
  - `llama-cpp`: Uses GGUF models via llama.cpp (CPU-optimized, default)
  - `huggingface`: Uses Hugging Face Transformers models
- `MODEL_PATH`: Path to your GGUF model file (default: "./models/llama-2-7b-chat.Q4_K_M.gguf"); a download sketch is shown after this list
- `SYSTEM_PROMPT`: Customize the AI assistant's personality/behavior
- `ACTIVATION_WORDS`: Space-separated list of words that trigger a model response
- `MAX_TOKENS`: Maximum number of tokens to generate (default: 512)
- `N_GPU_LAYERS`: Number of layers to offload to GPU (default: 0; set to 35 for GPU acceleration)
- `N_THREADS`: Number of CPU threads to use (default: 4)
- `CONTEXT_SIZE`: Maximum context window size (default: 4096)
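The `MODEL_PATH` file must exist before the node starts. As a rough sketch (not part of the node itself), the snippet below downloads the default GGUF file with `huggingface_hub` — already available as a transformers dependency — and loads it with the settings the options above map to; the repo id and filename are assumptions based on the default path:

```python
import os
from pathlib import Path

from huggingface_hub import hf_hub_download  # available via the transformers dependency
from llama_cpp import Llama

# Fetch the default GGUF model into ./models (cached on subsequent runs).
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)
model_file = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf",
    local_dir=models_dir,
)

# Mirror the node's environment-driven configuration.
llm = Llama(
    model_path=str(model_file),
    n_gpu_layers=int(os.getenv("N_GPU_LAYERS", "0")),
    n_ctx=int(os.getenv("CONTEXT_SIZE", "4096")),
    n_threads=int(os.getenv("N_THREADS", "4")),
    verbose=False,
)
print(llm("Q: What can you do? A:", max_tokens=64, stop=["Q:", "\n"])["choices"][0]["text"])
```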
## Example yml
### Basic Speech-to-Text-to-Speech Pipeline
## Example: Speech Assistant Pipeline
This example shows how to create a conversational AI pipeline that:
1. Captures audio from microphone
@@ -96,11 +96,13 @@ nodes:
    outputs:
      - text
    env:
      MODEL_BACKEND: llama-cpp
      MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
      MODEL_FILENAME: "*fp16.gguf"
      MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
      SYSTEM_PROMPT: "You're a helpful assistant."
      ACTIVATION_WORDS: "hey help what how"
      MAX_TOKENS: "512"
      N_GPU_LAYERS: "35"
      N_THREADS: "4"
      CONTEXT_SIZE: "4096"
  - id: dora-tts
    build: pip install dora-kokoro-tts
@@ -142,4 +144,4 @@ uv run pytest . # Test
## License
dora-llama-cpp-python's code is released under the MIT License
dora-llama-cpp-python is released under the MIT License
@@ -1,91 +1,64 @@
import os
import pyarrow as pa
from dora import Node
from transformers import AutoModelForCausalLM, AutoTokenizer
from pathlib import Path
# Environment variables for model configuration
SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You're a very succinct AI assistant with short answers.",
)
MODEL_BACKEND = os.getenv("MODEL_BACKEND", "llama-cpp")
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
MODEL_FILENAME = os.getenv("MODEL_FILENAME", "*fp16.gguf")
HF_MODEL_NAME = os.getenv("HF_MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")
MODEL_PATH = os.getenv("MODEL_PATH", "./models/llama-2-7b-chat.Q4_K_M.gguf")
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))  # Number of layers to offload to GPU
N_THREADS = int(os.getenv("N_THREADS", "4"))  # Number of CPU threads
CONTEXT_SIZE = int(os.getenv("CONTEXT_SIZE", "4096"))
def get_model_llama_cpp():
    """Load a GGUF model using llama-cpp-python (CPU by default)."""
def get_model():
    """Load a GGUF model using llama-cpp-python with optional GPU acceleration."""
    from llama_cpp import Llama
    llm = Llama.from_pretrained(
        repo_id=MODEL_REPO_ID,
        filename=MODEL_FILENAME,
    model_path = Path(MODEL_PATH)
    if not model_path.exists():
        raise FileNotFoundError(
            f"Model file not found at {MODEL_PATH}. "
            "Download it using: wget -O models/llama-2-7b-chat.Q4_K_M.gguf "
            "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
        )
    llm = Llama(
        model_path=str(model_path),
        n_gpu_layers=N_GPU_LAYERS,  # Enable GPU acceleration if > 0
        n_ctx=CONTEXT_SIZE,  # Maximum context size
        n_threads=N_THREADS,  # Control CPU threading
        verbose=False
    )
    return llm
def get_model_huggingface():
    """Load a Hugging Face transformers model."""
    model = AutoModelForCausalLM.from_pretrained(
        HF_MODEL_NAME,
        torch_dtype="auto",
        device_map="cpu"
    )
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
    return model, tokenizer
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
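# Only incoming text containing at least one of these activation words triggers generation (see the main loop below).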
def generate_hf(model, tokenizer, prompt: str, history) -> str:
    """Generates text using a Hugging Face model."""
    history += [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        history, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to("cpu")
    generated_ids = model.generate(**model_inputs, max_new_tokens=MAX_TOKENS)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    history += [{"role": "assistant", "content": response}]
    return response, history
def main():
    history = []
    # Select model backend
    if MODEL_BACKEND == "llama-cpp":
        model = get_model_llama_cpp()
    else:
        model, tokenizer = get_model_huggingface()
    # Initialize model
    model = get_model()
    node = Node()
    for event in node:
        if event["type"] == "INPUT":
            text = event["value"][0].as_py()
            words = text.lower().split()
            # print(f"Input text: {text}")
            if any(word in ACTIVATION_WORDS for word in words):
                if MODEL_BACKEND == "llama-cpp":
                    response = model(
                        f"Q: {text} A: ",
                        max_tokens=MAX_TOKENS,
                        stop=["Q:", "\n"],
                    )["choices"][0]["text"]
                else:
                    response, history = generate_hf(model, tokenizer, text, history)
                # Generate response using system prompt
                prompt = f"{SYSTEM_PROMPT}\nQ: {text}\nA:"
                response = model(
                    prompt,
                    max_tokens=MAX_TOKENS,
                    stop=["Q:", "\n"],
                )["choices"][0]["text"]
                # print(f"Generated response: {response}")
                node.send_output(
                    output_id="text", data=pa.array([response]), metadata={}
                )
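# Note: the loop above uses plain "Q:/A:" completion prompts and stops at the first
# newline, so answers stay single-line and no history is kept. A hypothetical
# multi-turn variant (not part of this node) could use llama-cpp-python's
# OpenAI-style chat API on the same Llama object, roughly like this sketch:
def generate_chat(model, system_prompt: str, user_text: str, max_tokens: int = MAX_TOKENS) -> str:
    result = model.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ],
        max_tokens=max_tokens,
    )
    # OpenAI-compatible response layout: choices[0].message.content holds the reply.
    return result["choices"][0]["message"]["content"]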
@@ -1,6 +1,6 @@
[project]
name = "dora-llama-cpp-python"
version = "0.0.0"
version = "0.0.1"
authors = [{ name = "Shashwat Patil", email = "email@email.com" }]
description = "dora-llama-cpp-python"
license = { text = "MIT" }
@@ -16,9 +16,26 @@ dependencies = [
    "modelscope >= 1.18.1",
    "accelerate >= 1.3.0",
    "transformers",
    "mlx-lm>=0.21.1; sys_platform == 'darwin'",
    "llama-cpp-python",
]
[tool.uv.sources]
llama-cpp-python = [
    { index = "llama_cpp_python_metal", marker = "sys_platform == 'darwin'" },
    { index = "llama_cpp_python_cu121", marker = "sys_platform == 'linux'" },
]
[[tool.uv.index]]
name = "llama_cpp_python_cu121"
url = "https://abetlen.github.io/llama-cpp-python/whl/cu121"
explicit = true
[[tool.uv.index]]
name = "llama_cpp_python_metal"
url = "https://abetlen.github.io/llama-cpp-python/whl/metal"
explicit = true
[dependency-groups]
dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
@@ -34,13 +34,13 @@ nodes:
    outputs:
      - text
    env:
      MODEL_BACKEND: "llama-cpp" # or "huggingface"
      MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF" # For llama-cpp backend
      MODEL_FILENAME: "*fp16.gguf" # For llama-cpp backend
      HF_MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # For huggingface backend
      MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
      ACTIVATION_WORDS: "what how who where you"
      MAX_TOKENS: "512"
      N_GPU_LAYERS: "35" # Enable GPU acceleration
      N_THREADS: "4" # CPU threads
      CONTEXT_SIZE: "4096" # Maximum context window
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun