
added model change functionality from yml

tags/v0.3.11-rc1
ShashwatPatil 10 months ago
commit 49eb2d6503
3 changed files with 33 additions and 18 deletions
  1. node-hub/dora-llama-cpp-python/README.md (+8, -1)
  2. node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py (+18, -16)
  3. node-hub/dora-llama-cpp-python/test.yml (+7, -1)

node-hub/dora-llama-cpp-python/README.md (+8, -1)

@@ -9,6 +9,7 @@ A Dora node that provides access to LLaMA-based models using either llama.cpp or
 - Configurable system prompts and activation words
 - Chat history support with Hugging Face models
 - Lightweight CPU inference with GGUF models
+- Multiple model support (Qwen, LLaMA, etc.)
 
 ## Getting started

@@ -33,8 +34,12 @@ The node can be configured in your dataflow YAML file:
       - text # Generated response text
     env:
       MODEL_BACKEND: "llama-cpp" # or "huggingface"
+      MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF" # For llama-cpp backend
+      MODEL_FILENAME: "*fp16.gguf" # For llama-cpp backend
+      HF_MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # For huggingface backend
       SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
-      ACTIVATION_WORDS: "what how who where you" # Space-separated activation words
+      ACTIVATION_WORDS: "what how who where you"
+      MAX_TOKENS: "512"
 ```
 
 ### Configuration Options
@@ -92,6 +97,8 @@ nodes:
       - text
     env:
       MODEL_BACKEND: llama-cpp
+      MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
+      MODEL_FILENAME: "*fp16.gguf"
       SYSTEM_PROMPT: "You're a helpful assistant."
       ACTIVATION_WORDS: "hey help what how"
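
Every value in the `env:` blocks above reaches the node as a plain string via `os.getenv`, so anything non-textual has to be parsed in `main.py`. A minimal sketch of that parsing: the `int()` conversion for `MAX_TOKENS` is visible in the `main.py` diff below, while splitting `ACTIVATION_WORDS` on whitespace is an assumption based on the "space-separated" wording in the README.

```python
import os

# All configuration arrives as environment-variable strings set in the dataflow YAML.
MODEL_BACKEND = os.getenv("MODEL_BACKEND", "llama-cpp")  # "llama-cpp" or "huggingface"
SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You're a very succinct AI assistant with short answers.",
)

# Numeric setting: convert before handing it to a model.
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))

# "Space-separated activation words" -> split into a list for membership tests
# (assumption: this mirrors how main.py consumes ACTIVATION_WORDS).
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()

print(MODEL_BACKEND, MAX_TOKENS, ACTIVATION_WORDS)
```

Parsing at import time means a malformed value (for example a non-numeric `MAX_TOKENS`) fails when the node starts rather than in the middle of a running dataflow.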



node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py (+18, -16)

@@ -3,14 +3,16 @@ import pyarrow as pa
 from dora import Node
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# System prompt
+# Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
     "You're a very succinct AI assistant with short answers.",
 )
 
-# Model selection based on ENV variable
-MODEL_BACKEND = os.getenv("MODEL_BACKEND", "llama-cpp") # Default to CPU-based Llama
+MODEL_BACKEND = os.getenv("MODEL_BACKEND", "llama-cpp")
+MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
+MODEL_FILENAME = os.getenv("MODEL_FILENAME", "*fp16.gguf")
+HF_MODEL_NAME = os.getenv("HF_MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")
+MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
 
 
 def get_model_llama_cpp():
@@ -18,19 +20,21 @@ def get_model_llama_cpp():
     from llama_cpp import Llama
 
     llm = Llama.from_pretrained(
-        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False
+        repo_id=MODEL_REPO_ID,
+        filename=MODEL_FILENAME,
+        verbose=False
     )
     return llm
 
 
 def get_model_huggingface():
     """Load a Hugging Face transformers model."""
-    model_name = "Qwen/Qwen2.5-0.5B-Instruct"
-
     model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype="auto", device_map="cpu"
+        HF_MODEL_NAME,
+        torch_dtype="auto",
+        device_map="cpu"
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
     return model, tokenizer
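
The two loaders above are now fully driven by the new module-level variables, but the code that chooses between them lives in `main()` and is not part of this hunk. A hedged sketch of what that dispatch presumably looks like, using only names that appear in this diff; `load_model` itself is a hypothetical helper, not a function from the repository.

```python
def load_model():
    """Pick a backend from MODEL_BACKEND (sketch only; the real selection in
    main() is not shown in this diff)."""
    if MODEL_BACKEND == "llama-cpp":
        # GGUF model fetched from MODEL_REPO_ID / MODEL_FILENAME; llama-cpp
        # carries its own tokenizer, so none is returned here.
        return get_model_llama_cpp(), None
    if MODEL_BACKEND == "huggingface":
        # Transformers model plus tokenizer, both resolved from HF_MODEL_NAME.
        return get_model_huggingface()
    raise ValueError(f"Unsupported MODEL_BACKEND: {MODEL_BACKEND!r}")
```

Because the choice happens once at startup, switching backends becomes purely a YAML change, which is what this commit's test.yml update exercises.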


@@ -44,9 +48,9 @@ def generate_hf(model, tokenizer, prompt: str, history) -> str:
         history, tokenize=False, add_generation_prompt=True
     )
     model_inputs = tokenizer([text], return_tensors="pt").to("cpu")
-    generated_ids = model.generate(**model_inputs, max_new_tokens=512)
+    generated_ids = model.generate(**model_inputs, max_new_tokens=MAX_TOKENS)
     generated_ids = [
-        output_ids[len(input_ids) :]
+        output_ids[len(input_ids):]
         for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
     ]
     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
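
For reference, the Hugging Face branch above is the standard `transformers` chat-template flow; only fragments of `generate_hf` are visible in this hunk, so the history bookkeeping in the sketch below (a plain list of role/content messages) is an assumption.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assumed shape of `history`: system prompt first, then alternating user/assistant turns.
history = [
    {"role": "system", "content": "You're a very succinct AI assistant with short answers."},
    {"role": "user", "content": "What is a dataflow node?"},
]

# Render the conversation with the model's chat template and append the assistant prefix.
text = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to("cpu")

# Generate, then drop the prompt tokens so only the new completion gets decoded.
generated_ids = model.generate(**model_inputs, max_new_tokens=512)
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```
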
@@ -69,21 +73,19 @@ def main():
         if event["type"] == "INPUT":
             text = event["value"][0].as_py()
             words = text.lower().split()
-            print(words)
+            # print(f"Input text: {text}")
 
             if any(word in ACTIVATION_WORDS for word in words):
-                print("")
                 if MODEL_BACKEND == "llama-cpp":
                     response = model(
                         f"Q: {text} A: ",
-                        max_tokens=24,
+                        max_tokens=MAX_TOKENS,
                         stop=["Q:", "\n"],
                     )["choices"][0]["text"]
                 else:
                     response, history = generate_hf(model, tokenizer, text, history)
-                # log output
-                print(response)
+                # print(f"Generated response: {response}")
                 node.send_output(
                     output_id="text", data=pa.array([response]), metadata={}
                 )
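
The llama.cpp branch of this loop can also be exercised outside the dataflow. Below is a standalone sketch combining the `Llama.from_pretrained` loader from the earlier hunk with the same completion-style call (`Q:`/`A:` prompt, `stop` sequences, `MAX_TOKENS` limit); the example question is made up.

```python
import os
from llama_cpp import Llama

MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
MODEL_FILENAME = os.getenv("MODEL_FILENAME", "*fp16.gguf")
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))

# Download (or reuse from the local Hugging Face cache) the first GGUF file in the
# repo matching the filename glob, then load it for CPU inference.
llm = Llama.from_pretrained(
    repo_id=MODEL_REPO_ID,
    filename=MODEL_FILENAME,
    verbose=False,
)

# Same call shape as in main(): plain-text completion with explicit stop sequences.
response = llm(
    "Q: What does this node do? A: ",
    max_tokens=MAX_TOKENS,
    stop=["Q:", "\n"],
)["choices"][0]["text"]
print(response)
```

Raising `max_tokens` from the hard-coded 24 to the configurable `MAX_TOKENS` is what lets answers run longer than a short phrase.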


node-hub/dora-llama-cpp-python/test.yml (+7, -1)

@@ -34,7 +34,13 @@ nodes:
     outputs:
       - text
     env:
-      MODEL_BACKEND: llama-cpp # Can be changed to "huggingface" if needed
+      MODEL_BACKEND: "llama-cpp" # or "huggingface"
+      MODEL_REPO_ID: "Qwen/Qwen2.5-0.5B-Instruct-GGUF" # For llama-cpp backend
+      MODEL_FILENAME: "*fp16.gguf" # For llama-cpp backend
+      HF_MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # For huggingface backend
+      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
+      ACTIVATION_WORDS: "what how who where you"
+      MAX_TOKENS: "512"
 
   - id: plot
     build: pip install -e ../../node-hub/dora-rerun

