From 4c2194bcc797ba5b7558e0619e81d28c1191c467 Mon Sep 17 00:00:00 2001
From: haixuanTao
Date: Mon, 17 Mar 2025 16:27:45 +0100
Subject: [PATCH] Minor fix within llama-cpp-python transformers and qwen

---
 .../dora_llama_cpp_python/main.py         | 29 ++++++----
 node-hub/dora-qwen/dora_qwen/main.py      | 16 +++--
 .../dora_transformers/main.py             | 58 ++++++-------
 node-hub/dora-transformers/pyproject.toml | 20 +++----
 4 files changed, 53 insertions(+), 70 deletions(-)

diff --git a/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py b/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
index 1acff6c0..b4d4c195 100644
--- a/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
+++ b/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
@@ -12,7 +12,7 @@ logging.basicConfig(level=logging.INFO)
 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
-    "You're a very succinct AI assistant with short answers.",
+    "",
 )
 MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", "TheBloke/Llama-2-7B-Chat-GGUF")
 MODEL_FILE_PATTERN = os.getenv("MODEL_FILE_PATTERN", "*Q4_K_M.gguf")
@@ -40,7 +40,9 @@ def get_model():
             )
         else:
             # Load from HuggingFace
-            logging.info(f"Downloading model {MODEL_NAME_OR_PATH} with pattern {MODEL_FILE_PATTERN}")
+            logging.info(
+                f"Downloading model {MODEL_NAME_OR_PATH} with pattern {MODEL_FILE_PATTERN}"
+            )
             llm = Llama.from_pretrained(
                 repo_id=MODEL_NAME_OR_PATH,
                 filename=MODEL_FILE_PATTERN,
@@ -58,7 +60,7 @@ def get_model():
         raise
 
 
-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()
 
 
 def main():
@@ -66,23 +68,28 @@ def main():
     # Initialize model
     model = get_model()
     node = Node()
-
+    history = [{"role": "system", "content": SYSTEM_PROMPT}] if SYSTEM_PROMPT else []
     for event in node:
         if event["type"] == "INPUT":
             text = event["value"][0].as_py()
             words = text.lower().split()
 
-            if any(word in ACTIVATION_WORDS for word in words):
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
                 # Generate response using system prompt
-                prompt = f"{SYSTEM_PROMPT}\nQ: {text}\nA:"
-                response = model(
-                    prompt,
+                response = model.create_chat_completion(
+                    messages=history
+                    + [
+                        {"role": "user", "content": text},
+                    ],  # Prompt
                     max_tokens=MAX_TOKENS,
-                    stop=["Q:", "\n"],
-                )["choices"][0]["text"]
+                )["choices"][0]["message"]["content"]
 
                 node.send_output(
-                    output_id="text", data=pa.array([response]), metadata={},
+                    output_id="text",
+                    data=pa.array([response]),
+                    metadata={},
                 )
 
 
diff --git a/node-hub/dora-qwen/dora_qwen/main.py b/node-hub/dora-qwen/dora_qwen/main.py
index 352e566a..957abf42 100644
--- a/node-hub/dora-qwen/dora_qwen/main.py
+++ b/node-hub/dora-qwen/dora_qwen/main.py
@@ -45,7 +45,7 @@ def get_model_huggingface():
     return model, tokenizer
 
 
-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()
 
 
 def generate_hf(model, tokenizer, prompt: str, history) -> str:
@@ -86,17 +86,15 @@ def main():
             text = event["value"][0].as_py()
             words = text.lower().split()
 
-            if any(word in ACTIVATION_WORDS for word in words):
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
                 # On linux, Windows
                 if sys.platform == "darwin":
-                    response = model(
-                        f"Q: {text} A: ",  # Prompt
+                    response = model.create_chat_completion(
+                        messages=[{"role": "user", "content": text}],  # Prompt
                         max_tokens=24,
-                        stop=[
-                            "Q:",
-                            "\n",
-                        ],  # Stop generating just before the model would generate a new question
-                    )["choices"][0]["text"]
+                    )["choices"][0]["message"]["content"]
                 elif sys.platform == "linux":
                     response, history = generate_hf(model, tokenizer, text, history)
                 else:
diff --git a/node-hub/dora-transformers/dora_transformers/main.py b/node-hub/dora-transformers/dora_transformers/main.py
index 18323e78..de386d07 100644
--- a/node-hub/dora-transformers/dora_transformers/main.py
+++ b/node-hub/dora-transformers/dora_transformers/main.py
@@ -14,23 +14,17 @@ logging.basicConfig(level=logging.INFO)
 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
-    "You're a very succinct AI assistant with short answers.",
+    "",
 )
 MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
 DEVICE = os.getenv("DEVICE", "auto")
 TORCH_DTYPE = os.getenv("TORCH_DTYPE", "auto")
-ENABLE_MEMORY_EFFICIENT = os.getenv("ENABLE_MEMORY_EFFICIENT", "true").lower() == "true"
 
-# Configure PyTorch memory management
-if DEVICE == "cuda":
-    # Set memory efficient settings
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-    if ENABLE_MEMORY_EFFICIENT:
-        torch.cuda.empty_cache()
 
 # Words that trigger the model to respond
-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()
+
 
 def load_model():
     """Load the transformer model and tokenizer."""
@@ -42,65 +36,50 @@ def load_model():
         "device_map": DEVICE,
     }
 
-    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-        model_kwargs.update({
-            "low_cpu_mem_usage": True,
-            "offload_folder": "offload",
-            "load_in_8bit": True,
-        })
-
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        **model_kwargs,
-    )
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
     logging.info("Model loaded successfully")
     return model, tokenizer
 
-
 def generate_response(model, tokenizer, text: str, history) -> tuple[str, list]:
     """Generate text using the transformer model."""
     history += [{"role": "user", "content": text}]
     prompt = tokenizer.apply_chat_template(
-        history, tokenize=False, add_generation_prompt=True,
+        history,
+        tokenize=False,
+        add_generation_prompt=True,
     )
 
-    model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
+    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
 
     with torch.inference_mode():
         generated_ids = model.generate(
             **model_inputs,
             max_new_tokens=MAX_TOKENS,
             pad_token_id=tokenizer.pad_token_id,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9,
             repetition_penalty=1.2,
-            length_penalty=0.5,
         )
 
     generated_ids = [
-        output_ids[len(input_ids):]
+        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
     ]
 
     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
     history += [{"role": "assistant", "content": response}]
 
-    # Clear CUDA cache after successful generation if enabled
-    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-        torch.cuda.empty_cache()
-
     return response, history
 
+
 def main():
     """TODO: Add docstring."""
     # Initialize model and conversation history
     model, tokenizer = load_model()
 
     # Initialize history with system prompt
-    history = [{"role": "system", "content": SYSTEM_PROMPT}]
+
+    history = [{"role": "system", "content": SYSTEM_PROMPT}] if SYSTEM_PROMPT else []
 
     node = Node()
     for event in node:
@@ -108,16 +87,15 @@ def main():
             text = event["value"][0].as_py()
             words = text.lower().split()
 
-            if any(word in ACTIVATION_WORDS for word in words):
-                logging.info(f"Processing input: {text}")
-                response, history = generate_response(model, tokenizer, text, history)
-                logging.info(f"Generated response: {response}")
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
+                response, _history = generate_response(model, tokenizer, text, history)
 
                 node.send_output(
-                    output_id="text",
-                    data=pa.array([response]),
-                    metadata={},
+                    output_id="text", data=pa.array([response]), metadata={}
                 )
 
+
 if __name__ == "__main__":
     main()
diff --git a/node-hub/dora-transformers/pyproject.toml b/node-hub/dora-transformers/pyproject.toml
index 1101f471..a6c77034 100644
--- a/node-hub/dora-transformers/pyproject.toml
+++ b/node-hub/dora-transformers/pyproject.toml
@@ -8,22 +8,22 @@ readme = "README.md"
 requires-python = ">=3.9"
 
 dependencies = [
-  "dora-rs >= 0.3.9",
-  "torch == 2.4.0",
-  "torchvision >= 0.19",
-  "torchaudio >= 2.1.0",
-  "opencv-python >= 4.1.1",
-  "modelscope >= 1.18.1",
-  "accelerate >= 1.3.0",
-  "transformers",
-  "bitsandbytes>=0.41.1",
+    "dora-rs >= 0.3.9",
+    "torch == 2.4.0",
+    "torchvision >= 0.19",
+    "torchaudio >= 2.1.0",
+    "opencv-python >= 4.1.1",
+    "modelscope >= 1.18.1",
+    "accelerate >= 1.3.0",
+    "transformers",
+    "bitsandbytes>=0.41.1",
 ]
 
 [dependency-groups]
 dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
 
 [project.scripts]
-dora-transformer = "dora_transformer.main:main"
+dora-transformers = "dora_transformers.main:main"
 
 [tool.ruff.lint]
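Note for reviewers (not part of the patch): all three nodes now default ACTIVATION_WORDS to an empty string, and the new guard `len(ACTIVATION_WORDS) == 0 or any(word in ACTIVATION_WORDS for word in words)` makes an unconfigured node answer every incoming text instead of staying silent. A minimal sketch of that gate in isolation (the `should_respond` helper is illustrative, not something this patch adds):

```python
import os

# Same default as the patched nodes: an empty ACTIVATION_WORDS disables filtering.
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()


def should_respond(text: str) -> bool:
    """Return True when the node should answer the incoming text."""
    words = text.lower().split()
    return len(ACTIVATION_WORDS) == 0 or any(word in ACTIVATION_WORDS for word in words)


if __name__ == "__main__":
    # With the new empty default both prints are True; with e.g.
    # ACTIVATION_WORDS="what how", only the first one is.
    print(should_respond("What is dora?"))
    print(should_respond("Tell me a story."))
```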
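The llama-cpp-python nodes also switch from raw text completion to `Llama.create_chat_completion`, which takes an OpenAI-style `messages` list and returns the reply under `choices[0]["message"]["content"]`; the model's own chat template is applied, so the manual `Q:`/`A:` framing and stop strings are no longer needed. A standalone sketch of that call shape, reusing the node's default repo and file pattern (the `n_ctx` value and prompts are placeholders, not set by this patch):

```python
from llama_cpp import Llama

# Mirrors the node defaults for MODEL_NAME_OR_PATH and MODEL_FILE_PATTERN.
llm = Llama.from_pretrained(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="*Q4_K_M.gguf",
    n_ctx=2048,
    verbose=False,
)

system_prompt = "You are a concise assistant."  # optional, like SYSTEM_PROMPT
history = [{"role": "system", "content": system_prompt}] if system_prompt else []

reply = llm.create_chat_completion(
    messages=history + [{"role": "user", "content": "What does dora-rs do?"}],
    max_tokens=128,
)["choices"][0]["message"]["content"]
print(reply)
```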
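Finally, the `.to(model.device)` change in dora-transformers appears to matter when DEVICE is left at its "auto" default: "auto" is a valid value for transformers' `device_map` but not a valid `torch.device`, so moving the tokenized inputs to `model.device` follows wherever accelerate actually placed the weights. A hedged sketch of that pattern, using the node's default model name:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # same default as the node
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype="auto", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,
)
# "auto" is not a torch device; model.device is wherever accelerate put the weights.
inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
```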