@@ -1,15 +1,15 @@
# dora-llama-cpp-python
A Dora node that provides access to LLaMA-based models using either llama.cpp or Hugging Face backends for text generation.
A Dora node that provides access to LLaMA models using llama-cpp-python for efficient CPU/GPU inference.
## Features
- Supports both llama.cpp (CPU) and Hugging Face (CPU/GPU) backends
- GPU acceleration support (CUDA on Linux, Metal on macOS)
- Easy integration with speech-to-text and text-to-speech pipelines
- Configurable system prompts and activation words
- Chat history support with Hugging Face models
- Lightweight CPU inference with GGUF models
- Multiple model support (Qwen, LLaMA, etc.)
- Thread-level CPU optimization
- Adjustable context window size
## Getting started
@@ -20,6 +20,7 @@ uv venv -p 3.11 --seed
uv pip install -e .
```
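To check that the llama-cpp-python wheel you ended up with was actually built with GPU support (CUDA or Metal), a quick sanity check like the one below can help. This is a minimal sketch and assumes a recent llama-cpp-python release that still exposes `llama_supports_gpu_offload`:

```python
# Minimal install check for llama-cpp-python (names assumed from recent releases).
import llama_cpp

print("llama-cpp-python version:", llama_cpp.__version__)
# True if the installed wheel was compiled with CUDA/Metal offloading enabled.
print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())
```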
## Usage
The node can be configured in your dataflow YAML file:
@@ -33,27 +34,26 @@ The node can be configured in your dataflow YAML file:
  outputs:
    - text # Generated response text
  env:
    MODEL_BACKEND: "llama-cpp" # or "huggingface"
    MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF" # For llama-cpp backend
    MODEL_FILENAME: "*fp16.gguf" # For llama-cpp backend
    HF_MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # For huggingface backend
    MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
    SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
    ACTIVATION_WORDS: "what how who where you"
    MAX_TOKENS: "512"
    N_GPU_LAYERS: "35" # Enable GPU acceleration
    N_THREADS: "4" # CPU threads
    CONTEXT_SIZE: "4096" # Maximum context window
```
### Configuration Options
- `MODEL_BACKEND`: Choose between:
  - `llama-cpp`: Uses GGUF models via llama.cpp (CPU-optimized, default)
  - `huggingface`: Uses Hugging Face Transformers models
- `MODEL_PATH`: Path to your GGUF model file (default: "./models/llama-2-7b-chat.Q4_K_M.gguf"); a download sketch is shown after this list
- `SYSTEM_PROMPT`: Customize the AI assistant's personality/behavior
- `ACTIVATION_WORDS`: Space-separated list of words that trigger a model response
- `MAX_TOKENS`: Maximum number of tokens to generate (default: 512)
- `N_GPU_LAYERS`: Number of layers to offload to GPU (default: 0; set to 35 for GPU acceleration)
- `N_THREADS`: Number of CPU threads to use (default: 4)
- `CONTEXT_SIZE`: Maximum context window size (default: 4096)
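The `MODEL_PATH` file must exist before the node starts. As a rough sketch (not part of the node itself), the snippet below downloads the default GGUF file with `huggingface_hub` — already available as a transformers dependency — and loads it with the settings the options above map to; the repo id and filename are assumptions based on the default path:

```python
import os
from pathlib import Path

from huggingface_hub import hf_hub_download  # available via the transformers dependency
from llama_cpp import Llama

# Fetch the default GGUF model into ./models (cached on subsequent runs).
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)
model_file = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf",
    local_dir=models_dir,
)

# Mirror the node's environment-driven configuration.
llm = Llama(
    model_path=str(model_file),
    n_gpu_layers=int(os.getenv("N_GPU_LAYERS", "0")),
    n_ctx=int(os.getenv("CONTEXT_SIZE", "4096")),
    n_threads=int(os.getenv("N_THREADS", "4")),
    verbose=False,
)
print(llm("Q: What can you do? A:", max_tokens=64, stop=["Q:", "\n"])["choices"][0]["text"])
```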
## Example yml
### Basic Speech-to-Text-to-Speech Pipeline
## Example: Speech Assistant Pipeline
This example shows how to create a conversational AI pipeline that:
1. Captures audio from microphone
@@ -96,11 +96,13 @@ nodes:
    outputs:
      - text
    env:
      MODEL_BACKEND: llama-cpp
      MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
      MODEL_FILENAME: "*fp16.gguf"
      MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
      SYSTEM_PROMPT: "You're a helpful assistant."
      ACTIVATION_WORDS: "hey help what how"
      MAX_TOKENS: "512"
      N_GPU_LAYERS: "35"
      N_THREADS: "4"
      CONTEXT_SIZE: "4096"
  - id: dora-tts
    build: pip install dora-kokoro-tts
@@ -142,4 +144,4 @@ uv run pytest . # Test
## License
dora-llama-cpp-python's code is released under the MIT License
dora-llama-cpp-python is released under the MIT License
@@ -1,91 +1,64 @@
import os
import pyarrow as pa
from dora import Node
from transformers import AutoModelForCausalLM, AutoTokenizer
from pathlib import Path
# Environment variables for model configuration
SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You're a very succinct AI assistant with short answers.",
)
MODEL_BACKEND = os.getenv("MODEL_BACKEND", "llama-cpp")
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
MODEL_FILENAME = os.getenv("MODEL_FILENAME", "*fp16.gguf")
HF_MODEL_NAME = os.getenv("HF_MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")
MODEL_PATH = os.getenv("MODEL_PATH", "./models/llama-2-7b-chat.Q4_K_M.gguf")
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))  # Number of layers to offload to GPU
N_THREADS = int(os.getenv("N_THREADS", "4"))  # Number of CPU threads
CONTEXT_SIZE = int(os.getenv("CONTEXT_SIZE", "4096"))
def get_model_llama_cpp():
    """Load a GGUF model using llama-cpp-python (CPU by default)."""
def get_model():
    """Load a GGUF model using llama-cpp-python with optional GPU acceleration."""
    from llama_cpp import Llama
    llm = Llama.from_pretrained(
        repo_id=MODEL_REPO_ID,
        filename=MODEL_FILENAME,
    model_path = Path(MODEL_PATH)
    if not model_path.exists():
        raise FileNotFoundError(
            f"Model file not found at {MODEL_PATH}. "
            "Download it using: wget -O models/llama-2-7b-chat.Q4_K_M.gguf "
            "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
        )
    llm = Llama(
        model_path=str(model_path),
        n_gpu_layers=N_GPU_LAYERS,  # Enable GPU acceleration if > 0
        n_ctx=CONTEXT_SIZE,  # Maximum context size
        n_threads=N_THREADS,  # Control CPU threading
        verbose=False
    )
    return llm
def get_model_huggingface():
    """Load a Hugging Face transformers model."""
    model = AutoModelForCausalLM.from_pretrained(
        HF_MODEL_NAME,
        torch_dtype="auto",
        device_map="cpu"
    )
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
    return model, tokenizer
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
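# Only incoming text containing at least one of these activation words triggers generation (see the main loop below).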
def generate_hf(model, tokenizer, prompt: str, history) -> str:
    """Generates text using a Hugging Face model."""
    history += [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        history, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to("cpu")
    generated_ids = model.generate(**model_inputs, max_new_tokens=MAX_TOKENS)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    history += [{"role": "assistant", "content": response}]
    return response, history
def main():
    history = []
    # Select model backend
    if MODEL_BACKEND == "llama-cpp":
        model = get_model_llama_cpp()
    else:
        model, tokenizer = get_model_huggingface()
    # Initialize model
    model = get_model()
    node = Node()
    for event in node:
        if event["type"] == "INPUT":
            text = event["value"][0].as_py()
            words = text.lower().split()
            # print(f"Input text: {text}")
            if any(word in ACTIVATION_WORDS for word in words):
                if MODEL_BACKEND == "llama-cpp":
                    response = model(
                        f"Q: {text} A: ",
                        max_tokens=MAX_TOKENS,
                        stop=["Q:", "\n"],
                    )["choices"][0]["text"]
                else:
                    response, history = generate_hf(model, tokenizer, text, history)
                # Generate response using system prompt
                prompt = f"{SYSTEM_PROMPT}\nQ: {text}\nA:"
                response = model(
                    prompt,
                    max_tokens=MAX_TOKENS,
                    stop=["Q:", "\n"],
                )["choices"][0]["text"]
                # print(f"Generated response: {response}")
                node.send_output(
                    output_id="text", data=pa.array([response]), metadata={}
                )
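# Note: the loop above uses plain "Q:/A:" completion prompts and stops at the first
# newline, so answers stay single-line and no history is kept. A hypothetical
# multi-turn variant (not part of this node) could use llama-cpp-python's
# OpenAI-style chat API on the same Llama object, roughly like this sketch:
def generate_chat(model, system_prompt: str, user_text: str, max_tokens: int = MAX_TOKENS) -> str:
    result = model.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ],
        max_tokens=max_tokens,
    )
    # OpenAI-compatible response layout: choices[0].message.content holds the reply.
    return result["choices"][0]["message"]["content"]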
@@ -1,6 +1,6 @@
[project]
name = "dora-llama-cpp-python"
version = "0.0.0"
version = "0.0.1"
authors = [{ name = "Shashwat Patil", email = "email@email.com" }]
description = "dora-llama-cpp-python"
license = { text = "MIT" }
@@ -16,9 +16,26 @@ dependencies = [
    "modelscope >= 1.18.1",
    "accelerate >= 1.3.0",
    "transformers",
    "mlx-lm>=0.21.1; sys_platform == 'darwin'",
    "llama-cpp-python",
]
[tool.uv.sources]
llama-cpp-python = [
    { index = "llama_cpp_python_metal", marker = "sys_platform == 'darwin'" },
    { index = "llama_cpp_python_cu121", marker = "sys_platform == 'linux'" },
]
[[tool.uv.index]]
name = "llama_cpp_python_cu121"
url = "https://abetlen.github.io/llama-cpp-python/whl/cu121"
explicit = true
[[tool.uv.index]]
name = "llama_cpp_python_metal"
url = "https://abetlen.github.io/llama-cpp-python/whl/metal"
explicit = true
[dependency-groups]
dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
@@ -34,13 +34,13 @@ nodes:
    outputs:
      - text
    env:
      MODEL_BACKEND: "llama-cpp" # or "huggingface"
      MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF" # For llama-cpp backend
      MODEL_FILENAME: "*fp16.gguf" # For llama-cpp backend
      HF_MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # For huggingface backend
      MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
      ACTIVATION_WORDS: "what how who where you"
      MAX_TOKENS: "512"
      N_GPU_LAYERS: "35" # Enable GPU acceleration
      N_THREADS: "4" # CPU threads
      CONTEXT_SIZE: "4096" # Maximum context window
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun