
Minor fix within llama-cpp-python, transformers, and qwen

tags/v0.3.11-rc1
haixuanTao 10 months ago
commit 4c2194bcc7
4 changed files with 53 additions and 70 deletions
1. node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py  +18 -11
2. node-hub/dora-qwen/dora_qwen/main.py  +7 -9
3. node-hub/dora-transformers/dora_transformers/main.py  +18 -40
4. node-hub/dora-transformers/pyproject.toml  +10 -10

+18 -11  node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py

@@ -12,7 +12,7 @@ logging.basicConfig(level=logging.INFO)
 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
-    "You're a very succinct AI assistant with short answers.",
+    "",
 )
 MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", "TheBloke/Llama-2-7B-Chat-GGUF")
 MODEL_FILE_PATTERN = os.getenv("MODEL_FILE_PATTERN", "*Q4_K_M.gguf")
@@ -40,7 +40,9 @@ def get_model():
         )
     else:
         # Load from HuggingFace
-        logging.info(f"Downloading model {MODEL_NAME_OR_PATH} with pattern {MODEL_FILE_PATTERN}")
+        logging.info(
+            f"Downloading model {MODEL_NAME_OR_PATH} with pattern {MODEL_FILE_PATTERN}"
+        )
         llm = Llama.from_pretrained(
             repo_id=MODEL_NAME_OR_PATH,
             filename=MODEL_FILE_PATTERN,
@@ -58,7 +60,7 @@ def get_model():
         raise


-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()


 def main():
@@ -66,23 +68,28 @@ def main():
     # Initialize model
     model = get_model()
     node = Node()
+    history = [{"role": "system", "content": SYSTEM_PROMPT}] if SYSTEM_PROMPT else []
     for event in node:
         if event["type"] == "INPUT":
             text = event["value"][0].as_py()
             words = text.lower().split()

-            if any(word in ACTIVATION_WORDS for word in words):
-                # Generate response using system prompt
-                prompt = f"{SYSTEM_PROMPT}\nQ: {text}\nA:"
-                response = model(
-                    prompt,
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
+                response = model.create_chat_completion(
+                    messages=history
+                    + [
+                        {"role": "user", "content": text},
+                    ],  # Prompt
                     max_tokens=MAX_TOKENS,
-                    stop=["Q:", "\n"],
-                )["choices"][0]["text"]
+                )["choices"][0]["message"]["content"]

                 node.send_output(
-                    output_id="text", data=pa.array([response]), metadata={},
+                    output_id="text",
+                    data=pa.array([response]),
+                    metadata={},
                 )




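What this change does, in short: SYSTEM_PROMPT and ACTIVATION_WORDS now default to empty, so the node answers every input and injects no system message unless configured, and the hand-rolled "Q:/A:" prompt is replaced by llama-cpp-python's chat API, which applies the model's own chat template and carries the running history. A minimal sketch of the new call shape (standalone, outside the dora node; the question and max_tokens value are illustrative):

    # Sketch of the new chat-completion flow, using the node's default model settings.
    from llama_cpp import Llama

    llm = Llama.from_pretrained(
        repo_id="TheBloke/Llama-2-7B-Chat-GGUF",  # MODEL_NAME_OR_PATH default
        filename="*Q4_K_M.gguf",                  # MODEL_FILE_PATTERN default
    )

    history = []  # SYSTEM_PROMPT defaults to "", so no system message is prepended
    result = llm.create_chat_completion(
        messages=history + [{"role": "user", "content": "What is dora-rs?"}],
        max_tokens=512,
    )
    print(result["choices"][0]["message"]["content"])

create_chat_completion returns an OpenAI-style dict, which is why the result path changes from ["choices"][0]["text"] to ["choices"][0]["message"]["content"].
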
+7 -9  node-hub/dora-qwen/dora_qwen/main.py

@@ -45,7 +45,7 @@ def get_model_huggingface():
     return model, tokenizer


-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()


 def generate_hf(model, tokenizer, prompt: str, history) -> str:
@@ -86,17 +86,15 @@ def main():
             text = event["value"][0].as_py()
             words = text.lower().split()

-            if any(word in ACTIVATION_WORDS for word in words):
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
                 # On linux, Windows
                 if sys.platform == "darwin":
-                    response = model(
-                        f"Q: {text} A: ",  # Prompt
+                    response = model.create_chat_completion(
+                        messages=[{"role": "user", "content": text}],  # Prompt
                         max_tokens=24,
-                        stop=[
-                            "Q:",
-                            "\n",
-                        ],  # Stop generating just before the model would generate a new question
-                    )["choices"][0]["text"]
+                    )["choices"][0]["message"]["content"]
                 elif sys.platform == "linux":
                     response, history = generate_hf(model, tokenizer, text, history)
                 else:


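The qwen node gets the same pair of fixes: the empty ACTIVATION_WORDS default disables the trigger-word filter, and the macOS (llama.cpp) branch switches to the chat API. The gate relies on "".split() returning [] rather than [""], which is what makes the empty default mean "no filter". Pulled out as a standalone sketch (should_respond is a hypothetical name, not in the node):

    import os

    # "".split() == [], so an unset ACTIVATION_WORDS yields an empty trigger list.
    ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()

    def should_respond(text: str) -> bool:
        """Respond to everything when no triggers are set; otherwise require one."""
        words = text.lower().split()
        return len(ACTIVATION_WORDS) == 0 or any(w in ACTIVATION_WORDS for w in words)

Under the old default ("what how who where you"), an input like "tell me a joke" was silently dropped; with the new default every input gets a response.
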
+18 -40  node-hub/dora-transformers/dora_transformers/main.py

@@ -14,23 +14,17 @@ logging.basicConfig(level=logging.INFO)
 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
-    "You're a very succinct AI assistant with short answers.",
+    "",
 )
 MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
 DEVICE = os.getenv("DEVICE", "auto")
 TORCH_DTYPE = os.getenv("TORCH_DTYPE", "auto")
-ENABLE_MEMORY_EFFICIENT = os.getenv("ENABLE_MEMORY_EFFICIENT", "true").lower() == "true"
-
-# Configure PyTorch memory management
-if DEVICE == "cuda":
-    # Set memory efficient settings
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-    if ENABLE_MEMORY_EFFICIENT:
-        torch.cuda.empty_cache()

 # Words that trigger the model to respond
-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()


 def load_model():
     """Load the transformer model and tokenizer."""
@@ -42,65 +36,50 @@ def load_model():
         "device_map": DEVICE,
     }

-    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-        model_kwargs.update({
-            "low_cpu_mem_usage": True,
-            "offload_folder": "offload",
-            "load_in_8bit": True,
-        })
-
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        **model_kwargs,
-    )
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     logging.info("Model loaded successfully")
     return model, tokenizer


 def generate_response(model, tokenizer, text: str, history) -> tuple[str, list]:
     """Generate text using the transformer model."""
     history += [{"role": "user", "content": text}]

     prompt = tokenizer.apply_chat_template(
-        history, tokenize=False, add_generation_prompt=True,
+        history,
+        tokenize=False,
+        add_generation_prompt=True,
     )

-    model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
+    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

     with torch.inference_mode():
         generated_ids = model.generate(
             **model_inputs,
             max_new_tokens=MAX_TOKENS,
             pad_token_id=tokenizer.pad_token_id,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9,
-            repetition_penalty=1.2,
-            length_penalty=0.5,
         )

     generated_ids = [
-        output_ids[len(input_ids):]
+        output_ids[len(input_ids) :]
         for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
     ]

     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
     history += [{"role": "assistant", "content": response}]

-    # Clear CUDA cache after successful generation if enabled
-    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-        torch.cuda.empty_cache()
-
     return response, history


 def main():
     """TODO: Add docstring."""
     # Initialize model and conversation history
     model, tokenizer = load_model()
-    # Initialize history with system prompt
-    history = [{"role": "system", "content": SYSTEM_PROMPT}]
-
+    history = [{"role": "system", "content": SYSTEM_PROMPT}] if SYSTEM_PROMPT else []
     node = Node()

     for event in node:
@@ -108,16 +87,15 @@ def main():
             text = event["value"][0].as_py()
             words = text.lower().split()

-            if any(word in ACTIVATION_WORDS for word in words):
-                logging.info(f"Processing input: {text}")
-                response, history = generate_response(model, tokenizer, text, history)
-                logging.info(f"Generated response: {response}")
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
+                response, _history = generate_response(model, tokenizer, text, history)

                 node.send_output(
-                    output_id="text",
-                    data=pa.array([response]),
-                    metadata={},
+                    output_id="text", data=pa.array([response]), metadata={}
                 )


 if __name__ == "__main__":
     main()

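Besides formatting, this file drops the 8-bit/offload path (ENABLE_MEMORY_EFFICIENT, load_in_8bit, the manual CUDA cache clearing) and the hand-tuned sampling parameters, so generate() now uses its greedy-decoding defaults. The subtler fix is .to(model.device) replacing .to(DEVICE): DEVICE defaults to "auto", which device_map understands but torch's .to() does not, so moving inputs "to auto" would raise. A sketch of the corrected placement (model name matches the node's default; the prompt is illustrative):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-0.5B-Instruct", torch_dtype="auto", device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": "Say hi"}],
        tokenize=False,
        add_generation_prompt=True,
    )
    # model.device reports where the weights actually landed (e.g. cuda:0 or cpu);
    # .to("auto") is not a valid torch device and would fail.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    with torch.inference_mode():
        out = model.generate(**inputs, max_new_tokens=32)  # greedy by default
    print(tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
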
+10 -10  node-hub/dora-transformers/pyproject.toml

@@ -8,22 +8,22 @@ readme = "README.md"
 requires-python = ">=3.9"

 dependencies = [
-    "dora-rs >= 0.3.9",
-    "torch == 2.4.0",
-    "torchvision >= 0.19",
-    "torchaudio >= 2.1.0",
-    "opencv-python >= 4.1.1",
-    "modelscope >= 1.18.1",
-    "accelerate >= 1.3.0",
-    "transformers",
-    "bitsandbytes>=0.41.1",
+    "dora-rs >= 0.3.9",
+    "torch == 2.4.0",
+    "torchvision >= 0.19",
+    "torchaudio >= 2.1.0",
+    "opencv-python >= 4.1.1",
+    "modelscope >= 1.18.1",
+    "accelerate >= 1.3.0",
+    "transformers",
+    "bitsandbytes>=0.41.1",
 ]

 [dependency-groups]
 dev = ["pytest >=8.1.1", "ruff >=0.9.1"]

 [project.scripts]
-dora-transformer = "dora_transformer.main:main"
+dora-transformers = "dora_transformers.main:main"


 [tool.ruff.lint]

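In the hunk above, the dependency list is removed and re-added with only whitespace differing (leading indentation is stripped in this view, so both sides render identically); the substantive change is the [project.scripts] line. The old entry point targeted dora_transformer.main, a module that does not exist (the package directory is dora_transformers/), so the installed command failed with ModuleNotFoundError at launch. A quick way to check the resolution after installing the package (a sketch; requires Python 3.10+ for the group= keyword):

    from importlib.metadata import entry_points

    # Find the console script declared in [project.scripts] and load its target.
    (script,) = [
        ep for ep in entry_points(group="console_scripts")
        if ep.name == "dora-transformers"
    ]
    main = script.load()  # imports dora_transformers.main; the old
                          # "dora_transformer.main:main" target raised
                          # ModuleNotFoundError at this step
    main()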