@@ -2,16 +2,21 @@ import os
 
 import pyarrow as pa
 from dora import Node
 from pathlib import Path
 import logging
 
+# Configure logging
 logging.basicConfig(level=logging.INFO)
+
+# Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
     "You're a very succinct AI assistant with short answers.",
 )
-MODEL_PATH = os.getenv("MODEL_PATH", "./models/llama-2-7b-chat.Q4_K_M.gguf")
+MODEL_LOCAL_PATH = os.getenv("MODEL_LOCAL_PATH", "")  # Local model path takes precedence
+MODEL_NAME = os.getenv("MODEL_NAME", "TheBloke/Llama-2-7B-Chat-GGUF")  # HF repo as fallback
+MODEL_FILE_PATTERN = os.getenv("MODEL_FILE_PATTERN", "*Q4_K_M.gguf")
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
-N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))  # Number of layers to offload to GPU
-N_THREADS = int(os.getenv("N_THREADS", "4"))  # Number of CPU threads
+N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "0"))
+N_THREADS = int(os.getenv("N_THREADS", "4"))
 CONTEXT_SIZE = int(os.getenv("CONTEXT_SIZE", "4096"))
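
For context (editorial note, not part of the patch): get_model() below is driven entirely by these environment variables, and a non-empty MODEL_LOCAL_PATH takes precedence over the Hugging Face fallback. A minimal sketch of how a launcher might configure the node before the dataflow starts — the path and values are illustrative assumptions, not taken from this patch:

    import os

    # Illustrative values only (assumptions, not from the patch):
    # prefer a GGUF file that is already on disk...
    os.environ["MODEL_LOCAL_PATH"] = "/models/llama-2-7b-chat.Q4_K_M.gguf"
    # ...and offload layers to the GPU; the default of 0 keeps inference CPU-only.
    os.environ["N_GPU_LAYERS"] = "35"
    os.environ["N_THREADS"] = "8"
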
@@ -19,22 +24,39 @@ def get_model():
     """Load a GGUF model using llama-cpp-python with optional GPU acceleration."""
     from llama_cpp import Llama
 
-    model_path = Path(MODEL_PATH)
-    if not model_path.exists():
-        raise FileNotFoundError(
-            f"Model file not found at {MODEL_PATH}. "
-            "Download it using: wget -O models/llama-2-7b-chat.Q4_K_M.gguf "
-            "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
-        )
-
-    llm = Llama(
-        model_path=str(model_path),
-        n_gpu_layers=N_GPU_LAYERS,  # Enable GPU acceleration if > 0
-        n_ctx=CONTEXT_SIZE,  # Maximum context size
-        n_threads=N_THREADS,  # Control CPU threading
-        verbose=False
-    )
-    return llm
+    try:
+        # Check if local path is provided
+        if MODEL_LOCAL_PATH:
+            model_path = Path(MODEL_LOCAL_PATH)
+            if not model_path.exists():
+                raise FileNotFoundError(f"Local model not found at {MODEL_LOCAL_PATH}")
+
+            logging.info(f"Loading local model from {MODEL_LOCAL_PATH}")
+            llm = Llama(
+                model_path=str(model_path),
+                n_gpu_layers=N_GPU_LAYERS,
+                n_ctx=CONTEXT_SIZE,
+                n_threads=N_THREADS,
+                verbose=False
+            )
+        else:
+            # Load from HuggingFace if no local path
+            logging.info(f"Downloading model {MODEL_NAME} with pattern {MODEL_FILE_PATTERN}")
+            llm = Llama.from_pretrained(
+                repo_id=MODEL_NAME,
+                filename=MODEL_FILE_PATTERN,
+                n_gpu_layers=N_GPU_LAYERS,
+                n_ctx=CONTEXT_SIZE,
+                n_threads=N_THREADS,
+                verbose=False
+            )
+
+        logging.info("Model loaded successfully")
+        return llm
+
+    except Exception as e:
+        logging.error(f"Error loading model: {e}")
+        raise
 
 
 ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
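
For context (editorial note, not part of the patch): ACTIVATION_WORDS gates when the node actually queries the model. A minimal sketch of the event loop that typically surrounds this configuration in a dora node — the input/output id "text" and the loop itself are assumptions, not shown in this diff:

    def main():
        llm = get_model()
        node = Node()
        for event in node:
            if event["type"] == "INPUT":
                text = event["value"][0].as_py()
                # Respond only when an activation word appears in the input.
                if any(word in ACTIVATION_WORDS for word in text.lower().split()):
                    result = llm.create_chat_completion(
                        messages=[
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": text},
                        ],
                        max_tokens=MAX_TOKENS,
                    )
                    reply = result["choices"][0]["message"]["content"]
                    node.send_output("text", pa.array([reply]))  # "text" id is an assumption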