Added support for remote model pulling from Hugging Face

tags/v0.3.11-rc1
ShashwatPatil committed 10 months ago
commit 84641f93e4
4 changed files with 70 additions and 23 deletions

1. node-hub/dora-llama-cpp-python/README.md (+26, -2)
2. node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py (+41, -19)
3. node-hub/dora-llama-cpp-python/pyproject.toml (+1, -1)
4. node-hub/dora-llama-cpp-python/test.yml (+2, -1)

node-hub/dora-llama-cpp-python/README.md (+26, -2)

@@ -26,6 +26,28 @@ uv pip install -e .
 The node can be configured in your dataflow YAML file:

 ```yaml
+# Using a local model
+- id: dora-llama-cpp-python
+  build: pip install -e path/to/dora-llama-cpp-python
+  path: dora-llama-cpp-python
+  inputs:
+    text: source_node/text # Input text to generate response for
+  outputs:
+    - text # Generated response text
+  env:
+    MODEL_LOCAL_PATH: "./models/my-local-model.gguf"
+    SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
+    ACTIVATION_WORDS: "what how who where you"
+    MAX_TOKENS: "512"
+    N_GPU_LAYERS: "35" # Enable GPU acceleration
+    N_THREADS: "4" # CPU threads
+    CONTEXT_SIZE: "4096" # Maximum context window
+
+# Using a HuggingFace model
 - id: dora-llama-cpp-python
   build: pip install -e path/to/dora-llama-cpp-python
   path: dora-llama-cpp-python
@@ -34,7 +56,8 @@ The node can be configured in your dataflow YAML file:
   outputs:
     - text # Generated response text
   env:
-    MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
+    MODEL_NAME: "TheBloke/Llama-2-7B-Chat-GGUF"
+    MODEL_FILE_PATTERN: "*Q4_K_M.gguf"
     SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
     ACTIVATION_WORDS: "what how who where you"
     MAX_TOKENS: "512"

@@ -96,7 +119,8 @@ nodes:
     outputs:
       - text
     env:
-      MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
+      MODEL_NAME: "TheBloke/Llama-2-7B-Chat-GGUF"
+      MODEL_FILE_PATTERN: "*Q4_K_M.gguf"
       SYSTEM_PROMPT: "You're a helpful assistant."
       ACTIVATION_WORDS: "hey help what how"
       MAX_TOKENS: "512"

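If you want to warm the Hugging Face cache before starting a dataflow that uses the remote configuration above, a sketch like the following should work. It assumes `huggingface_hub` is installed (which `Llama.from_pretrained` also relies on); the repo id and glob simply mirror the defaults shown above.

```python
# Pre-fetch the GGUF referenced by MODEL_NAME / MODEL_FILE_PATTERN so the
# first dataflow start does not block on a multi-GB download.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    allow_patterns=["*Q4_K_M.gguf"],  # same glob as MODEL_FILE_PATTERN
)
print(f"GGUF cached under: {local_dir}")
```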

node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py (+41, -19)

@@ -2,16 +2,21 @@ import os
 import pyarrow as pa
 from dora import Node
 from pathlib import Path
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)

 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
     "You're a very succinct AI assistant with short answers.",
 )
-MODEL_PATH = os.getenv("MODEL_PATH", "./models/llama-2-7b-chat.Q4_K_M.gguf")
+MODEL_LOCAL_PATH = os.getenv("MODEL_LOCAL_PATH", "")  # Local model path takes precedence
+MODEL_NAME = os.getenv("MODEL_NAME", "TheBloke/Llama-2-7B-Chat-GGUF")  # HF repo as fallback
+MODEL_FILE_PATTERN = os.getenv("MODEL_FILE_PATTERN", "*Q4_K_M.gguf")
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
-N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))  # Number of layers to offload to GPU
-N_THREADS = int(os.getenv("N_THREADS", "4"))  # Number of CPU threads
+N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))
+N_THREADS = int(os.getenv("N_THREADS", "4"))
 CONTEXT_SIZE = int(os.getenv("CONTEXT_SIZE", "4096"))


@@ -19,22 +24,39 @@ def get_model():
     """Load a GGUF model using llama-cpp-python with optional GPU acceleration."""
     from llama_cpp import Llama
-    model_path = Path(MODEL_PATH)
-    if not model_path.exists():
-        raise FileNotFoundError(
-            f"Model file not found at {MODEL_PATH}. "
-            "Download it using: wget -O models/llama-2-7b-chat.Q4_K_M.gguf "
-            "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
-        )
-
-    llm = Llama(
-        model_path=str(model_path),
-        n_gpu_layers=N_GPU_LAYERS,  # Enable GPU acceleration if > 0
-        n_ctx=CONTEXT_SIZE,  # Maximum context size
-        n_threads=N_THREADS,  # Control CPU threading
-        verbose=False
-    )
-    return llm
+    try:
+        # Check if local path is provided
+        if MODEL_LOCAL_PATH:
+            model_path = Path(MODEL_LOCAL_PATH)
+            if not model_path.exists():
+                raise FileNotFoundError(f"Local model not found at {MODEL_LOCAL_PATH}")
+            logging.info(f"Loading local model from {MODEL_LOCAL_PATH}")
+            llm = Llama(
+                model_path=str(model_path),
+                n_gpu_layers=N_GPU_LAYERS,
+                n_ctx=CONTEXT_SIZE,
+                n_threads=N_THREADS,
+                verbose=False
+            )
+        else:
+            # Load from HuggingFace if no local path
+            logging.info(f"Downloading model {MODEL_NAME} with pattern {MODEL_FILE_PATTERN}")
+            llm = Llama.from_pretrained(
+                repo_id=MODEL_NAME,
+                filename=MODEL_FILE_PATTERN,
+                n_gpu_layers=N_GPU_LAYERS,
+                n_ctx=CONTEXT_SIZE,
+                n_threads=N_THREADS,
+                verbose=False
+            )
+        logging.info("Model loaded successfully")
+        return llm
+    except Exception as e:
+        logging.error(f"Error loading model: {e}")
+        raise


 ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()

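For context on how the new `get_model()` is consumed: the node's event loop is not part of this hunk, so the sketch below is illustrative only, assuming the usual dora node pattern. The chat-completion call and output wiring are my assumptions, not code from this commit.

```python
# Illustrative only: how get_model() and ACTIVATION_WORDS are typically used
# inside the dora event loop of main.py. The real loop may differ.
import pyarrow as pa
from dora import Node

def run_node():
    llm = get_model()  # local file if MODEL_LOCAL_PATH is set, else HF download
    node = Node()
    for event in node:
        if event["type"] == "INPUT" and event["id"] == "text":
            text = event["value"][0].as_py()
            # Respond only when the prompt contains an activation word
            if any(word in text.lower().split() for word in ACTIVATION_WORDS):
                out = llm.create_chat_completion(
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": text},
                    ],
                    max_tokens=MAX_TOKENS,
                )
                reply = out["choices"][0]["message"]["content"]
                node.send_output("text", pa.array([reply]), event["metadata"])
```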

node-hub/dora-llama-cpp-python/pyproject.toml (+1, -1)

@@ -1,6 +1,6 @@
 [project]
 name = "dora-llama-cpp-python"
-version = "0.0.1"
+version = "1.0.0"
 authors = [{ name = "Shashwat Patil", email = "email@email.com" }]
 description = "dora-llama-cpp-python"
 license = { text = "MIT" }


node-hub/dora-llama-cpp-python/test.yml (+2, -1)

@@ -34,7 +34,8 @@ nodes:
     outputs:
       - text
     env:
-      MODEL_PATH: "./models/llama-2-7b-chat.Q4_K_M.gguf"
+      MODEL_NAME: "TheBloke/Llama-2-7B-Chat-GGUF" # Llama-2-7B model pulled from Hugging Face
+      MODEL_FILE_PATTERN: "*Q4_K_M.gguf"
       SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
       ACTIVATION_WORDS: "what how who where you"
       MAX_TOKENS: "512"

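To sanity-check the Hugging Face loading path outside of a dataflow, a minimal standalone sketch is below. It assumes main.py only starts its event loop when invoked as the node entry point (so importing it is safe), and it sets the same variables as test.yml before the module-level os.getenv calls run.

```python
# Standalone smoke test for the remote-loading path (assumptions noted above).
import os

# Mirror the env block in test.yml; must be set before importing the module,
# because the configuration is read at import time.
os.environ["MODEL_NAME"] = "TheBloke/Llama-2-7B-Chat-GGUF"
os.environ["MODEL_FILE_PATTERN"] = "*Q4_K_M.gguf"

from dora_llama_cpp_python.main import get_model  # noqa: E402

llm = get_model()  # first call downloads the GGUF and caches it under ~/.cache/huggingface
out = llm.create_completion("Say hi in five words.", max_tokens=16)
print(out["choices"][0]["text"])
```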
