diff --git a/node-hub/dora-llama-cpp-python/README.md b/node-hub/dora-llama-cpp-python/README.md
index 22e4e04c..2acac9df 100644
--- a/node-hub/dora-llama-cpp-python/README.md
+++ b/node-hub/dora-llama-cpp-python/README.md
@@ -26,6 +26,28 @@ uv pip install -e .
 The node can be configured in your dataflow YAML file:
 
 ```yaml
+
+# Using a local model
+
+- id: dora-llama-cpp-python
+  build: pip install -e path/to/dora-llama-cpp-python
+  path: dora-llama-cpp-python
+  inputs:
+    text: source_node/text # Input text to generate response for
+  outputs:
+    - text # Generated response text
+  env:
+    MODEL_LOCAL_PATH: "./models/my-local-model.gguf"
+    SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
+    ACTIVATION_WORDS: "what how who where you"
+    MAX_TOKENS: "512"
+    N_GPU_LAYERS: "35" # Enable GPU acceleration
+    N_THREADS: "4" # CPU threads
+    CONTEXT_SIZE: "4096" # Maximum context window
+
+
+
+# Using a HuggingFace model
 - id: dora-llama-cpp-python
   build: pip install -e path/to/dora-llama-cpp-python
   path: dora-llama-cpp-python
@@ -34,7 +56,8 @@ The node can be configured in your dataflow YAML file:
   outputs:
     - text # Generated response text
   env:
-    MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
+    MODEL_NAME: "TheBloke/Llama-2-7B-Chat-GGUF"
+    MODEL_FILE_PATTERN: "*Q4_K_M.gguf"
     SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
     ACTIVATION_WORDS: "what how who where you"
     MAX_TOKENS: "512"
@@ -96,7 +119,8 @@ nodes:
     outputs:
       - text
     env:
-      MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
+      MODEL_NAME: "TheBloke/Llama-2-7B-Chat-GGUF"
+      MODEL_FILE_PATTERN: "*Q4_K_M.gguf"
       SYSTEM_PROMPT: "You're a helpful assistant."
       ACTIVATION_WORDS: "hey help what how"
       MAX_TOKENS: "512"
diff --git a/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py b/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
index 843a32bf..18016f46 100644
--- a/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
+++ b/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
@@ -2,16 +2,21 @@ import os
 import pyarrow as pa
 from dora import Node
 from pathlib import Path
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO)
 
 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
     "You're a very succinct AI assistant with short answers.",
 )
-MODEL_PATH = os.getenv("MODEL_PATH", "./models/llama-2-7b-chat.Q4_K_M.gguf")
+MODEL_LOCAL_PATH = os.getenv("MODEL_LOCAL_PATH", "")  # Local model path takes precedence
+MODEL_NAME = os.getenv("MODEL_NAME", "TheBloke/Llama-2-7B-Chat-GGUF")  # HF repo as fallback
+MODEL_FILE_PATTERN = os.getenv("MODEL_FILE_PATTERN", "*Q4_K_M.gguf")
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
-N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))  # Number of layers to offload to GPU
-N_THREADS = int(os.getenv("N_THREADS", "4"))  # Number of CPU threads
+N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))
+N_THREADS = int(os.getenv("N_THREADS", "4"))
 CONTEXT_SIZE = int(os.getenv("CONTEXT_SIZE", "4096"))
 
 
@@ -19,22 +24,39 @@ def get_model():
     """Load a GGUF model using llama-cpp-python with optional GPU acceleration."""
     from llama_cpp import Llama
 
-    model_path = Path(MODEL_PATH)
-    if not model_path.exists():
-        raise FileNotFoundError(
-            f"Model file not found at {MODEL_PATH}. "
" - "Download it using: wget -O models/llama-2-7b-chat.Q4_K_M.gguf " - "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf" - ) - - llm = Llama( - model_path=str(model_path), - n_gpu_layers=N_GPU_LAYERS, # Enable GPU acceleration if > 0 - n_ctx=CONTEXT_SIZE, # Maximum context size - n_threads=N_THREADS, # Control CPU threading - verbose=False - ) - return llm + try: + # Check if local path is provided + if MODEL_LOCAL_PATH: + model_path = Path(MODEL_LOCAL_PATH) + if not model_path.exists(): + raise FileNotFoundError(f"Local model not found at {MODEL_LOCAL_PATH}") + + logging.info(f"Loading local model from {MODEL_LOCAL_PATH}") + llm = Llama( + model_path=str(model_path), + n_gpu_layers=N_GPU_LAYERS, + n_ctx=CONTEXT_SIZE, + n_threads=N_THREADS, + verbose=False + ) + else: + # Load from HuggingFace if no local path + logging.info(f"Downloading model {MODEL_NAME} with pattern {MODEL_FILE_PATTERN}") + llm = Llama.from_pretrained( + repo_id=MODEL_NAME, + filename=MODEL_FILE_PATTERN, + n_gpu_layers=N_GPU_LAYERS, + n_ctx=CONTEXT_SIZE, + n_threads=N_THREADS, + verbose=False + ) + + logging.info("Model loaded successfully") + return llm + + except Exception as e: + logging.error(f"Error loading model: {e}") + raise ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split() diff --git a/node-hub/dora-llama-cpp-python/pyproject.toml b/node-hub/dora-llama-cpp-python/pyproject.toml index 3663bf3b..a6006fd1 100644 --- a/node-hub/dora-llama-cpp-python/pyproject.toml +++ b/node-hub/dora-llama-cpp-python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dora-llama-cpp-python" -version = "0.0.1" +version = "1.0.0" authors = [{ name = "Shashwat Patil", email = "email@email.com" }] description = "dora-llama-cpp-python" license = { text = "MIT" } diff --git a/node-hub/dora-llama-cpp-python/test.yml b/node-hub/dora-llama-cpp-python/test.yml index 375dc01e..893cbfb4 100644 --- a/node-hub/dora-llama-cpp-python/test.yml +++ b/node-hub/dora-llama-cpp-python/test.yml @@ -34,7 +34,8 @@ nodes: outputs: - text env: - MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf" + MODEL_NAME: "TheBloke/Llama-2-7B-Chat-GGUF" # Llama 2.7B model pull from Hugging Face + MODEL_FILE_PATTERN: "*Q4_K_M.gguf" SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers." ACTIVATION_WORDS: "what how who where you" MAX_TOKENS: "512"