@@ -2,16 +2,21 @@ import os
 
 import pyarrow as pa
 from dora import Node
 from pathlib import Path
 import logging
 
+# Configure logging
 logging.basicConfig(level=logging.INFO)
+
+# Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
     "You're a very succinct AI assistant with short answers.",
 )
-MODEL_PATH = os.getenv("MODEL_PATH", "./models/llama-2-7b-chat.Q4_K_M.gguf")
+MODEL_LOCAL_PATH = os.getenv("MODEL_LOCAL_PATH", "")  # Local model path takes precedence
+MODEL_NAME = os.getenv("MODEL_NAME", "TheBloke/Llama-2-7B-Chat-GGUF")  # HF repo as fallback
+MODEL_FILE_PATTERN = os.getenv("MODEL_FILE_PATTERN", "*Q4_K_M.gguf")
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
-N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))  # Number of layers to offload to GPU
-N_THREADS = int(os.getenv("N_THREADS", "4"))  # Number of CPU threads
+N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))
+N_THREADS = int(os.getenv("N_THREADS", "4"))
 CONTEXT_SIZE = int(os.getenv("CONTEXT_SIZE", "4096"))
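
For context (editorial note, not part of the patch): get_model() below is driven entirely by these environment variables, and a non-empty MODEL_LOCAL_PATH takes precedence over the Hugging Face fallback. A minimal sketch of how a launcher might configure the node before the dataflow starts — the path and values are illustrative assumptions, not taken from this patch:

    import os

    # Illustrative values only (assumptions, not from the patch):
    # prefer a GGUF file that is already on disk...
    os.environ["MODEL_LOCAL_PATH"] = "/models/llama-2-7b-chat.Q4_K_M.gguf"
    # ...and offload layers to the GPU; the default of 0 keeps inference CPU-only.
    os.environ["N_GPU_LAYERS"] = "35"
    os.environ["N_THREADS"] = "8"
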
@@ -19,22 +24,39 @@ def get_model():
     """Load a GGUF model using llama-cpp-python with optional GPU acceleration."""
     from llama_cpp import Llama
 
-    model_path = Path(MODEL_PATH)
-    if not model_path.exists():
-        raise FileNotFoundError(
-            f"Model file not found at {MODEL_PATH}. "
-            "Download it using: wget -O models/llama-2-7b-chat.Q4_K_M.gguf "
-            "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
-        )
-
-    llm = Llama(
-        model_path=str(model_path),
-        n_gpu_layers=N_GPU_LAYERS,  # Enable GPU acceleration if > 0
-        n_ctx=CONTEXT_SIZE,  # Maximum context size
-        n_threads=N_THREADS,  # Control CPU threading
-        verbose=False
-    )
-    return llm
+    try:
+        # Check if local path is provided
+        if MODEL_LOCAL_PATH:
+            model_path = Path(MODEL_LOCAL_PATH)
+            if not model_path.exists():
+                raise FileNotFoundError(f"Local model not found at {MODEL_LOCAL_PATH}")
+
+            logging.info(f"Loading local model from {MODEL_LOCAL_PATH}")
+            llm = Llama(
+                model_path=str(model_path),
+                n_gpu_layers=N_GPU_LAYERS,
+                n_ctx=CONTEXT_SIZE,
+                n_threads=N_THREADS,
+                verbose=False
+            )
+        else:
+            # Load from HuggingFace if no local path
+            logging.info(f"Downloading model {MODEL_NAME} with pattern {MODEL_FILE_PATTERN}")
+            llm = Llama.from_pretrained(
+                repo_id=MODEL_NAME,
+                filename=MODEL_FILE_PATTERN,
+                n_gpu_layers=N_GPU_LAYERS,
+                n_ctx=CONTEXT_SIZE,
+                n_threads=N_THREADS,
+                verbose=False
+            )
+
+        logging.info("Model loaded successfully")
+        return llm
+
+    except Exception as e:
+        logging.error(f"Error loading model: {e}")
+        raise
 
 
 ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
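
For context (editorial note, not part of the patch): ACTIVATION_WORDS gates when the node actually queries the model. A minimal sketch of the event loop that typically surrounds this configuration in a dora node — the input/output id "text" and the loop itself are assumptions, not shown in this diff:

    def main():
        llm = get_model()
        node = Node()
        for event in node:
            if event["type"] == "INPUT":
                text = event["value"][0].as_py()
                # Respond only when an activation word appears in the input.
                if any(word in ACTIVATION_WORDS for word in text.lower().split()):
                    result = llm.create_chat_completion(
                        messages=[
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": text},
                        ],
                        max_tokens=MAX_TOKENS,
                    )
                    reply = result["choices"][0]["message"]["content"]
                    node.send_output("text", pa.array([reply]))  # "text" id is an assumption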