From 4c2194bcc797ba5b7558e0619e81d28c1191c467 Mon Sep 17 00:00:00 2001
From: haixuanTao
Date: Mon, 17 Mar 2025 16:27:45 +0100
Subject: [PATCH] Minor fix within llama-cpp-python transformers and qwen

---
 .../dora_llama_cpp_python/main.py         | 29 ++++++----
 node-hub/dora-qwen/dora_qwen/main.py      | 16 +++--
 .../dora_transformers/main.py             | 58 ++++++-------
 node-hub/dora-transformers/pyproject.toml | 20 +++----
 4 files changed, 53 insertions(+), 70 deletions(-)

diff --git a/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py b/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
index 1acff6c0..b4d4c195 100644
--- a/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
+++ b/node-hub/dora-llama-cpp-python/dora_llama_cpp_python/main.py
@@ -12,7 +12,7 @@ logging.basicConfig(level=logging.INFO)
 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
-    "You're a very succinct AI assistant with short answers.",
+    "",
 )
 MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", "TheBloke/Llama-2-7B-Chat-GGUF")
 MODEL_FILE_PATTERN = os.getenv("MODEL_FILE_PATTERN", "*Q4_K_M.gguf")
@@ -40,7 +40,9 @@ def get_model():
             )
         else:
             # Load from HuggingFace
-            logging.info(f"Downloading model {MODEL_NAME_OR_PATH} with pattern {MODEL_FILE_PATTERN}")
+            logging.info(
+                f"Downloading model {MODEL_NAME_OR_PATH} with pattern {MODEL_FILE_PATTERN}"
+            )
             llm = Llama.from_pretrained(
                 repo_id=MODEL_NAME_OR_PATH,
                 filename=MODEL_FILE_PATTERN,
@@ -58,7 +60,7 @@ def get_model():
         raise
 
 
-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()
 
 
 def main():
@@ -66,23 +68,28 @@ def main():
     # Initialize model
     model = get_model()
     node = Node()
-
+    history = [{"role": "system", "content": SYSTEM_PROMPT}] if SYSTEM_PROMPT else []
     for event in node:
         if event["type"] == "INPUT":
             text = event["value"][0].as_py()
             words = text.lower().split()
 
-            if any(word in ACTIVATION_WORDS for word in words):
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
                 # Generate response using system prompt
-                prompt = f"{SYSTEM_PROMPT}\nQ: {text}\nA:"
-                response = model(
-                    prompt,
+                response = model.create_chat_completion(
+                    messages=history
+                    + [
+                        {"role": "user", "content": text},
+                    ],  # Prompt
                     max_tokens=MAX_TOKENS,
-                    stop=["Q:", "\n"],
-                )["choices"][0]["text"]
+                )["choices"][0]["message"]["content"]
 
                 node.send_output(
-                    output_id="text", data=pa.array([response]), metadata={},
+                    output_id="text",
+                    data=pa.array([response]),
+                    metadata={},
                 )
 
 
diff --git a/node-hub/dora-qwen/dora_qwen/main.py b/node-hub/dora-qwen/dora_qwen/main.py
index 352e566a..957abf42 100644
--- a/node-hub/dora-qwen/dora_qwen/main.py
+++ b/node-hub/dora-qwen/dora_qwen/main.py
@@ -45,7 +45,7 @@ def get_model_huggingface():
     return model, tokenizer
 
 
-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()
 
 
 def generate_hf(model, tokenizer, prompt: str, history) -> str:
@@ -86,17 +86,15 @@ def main():
             text = event["value"][0].as_py()
             words = text.lower().split()
 
-            if any(word in ACTIVATION_WORDS for word in words):
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
                 # On linux, Windows
                 if sys.platform == "darwin":
-                    response = model(
-                        f"Q: {text} A: ",  # Prompt
+                    response = model.create_chat_completion(
+                        messages=[{"role": "user", "content": text}],  # Prompt
                         max_tokens=24,
-                        stop=[
-                            "Q:",
-                            "\n",
-                        ],  # Stop generating just before the model would generate a new question
-                    )["choices"][0]["text"]
+                    )["choices"][0]["message"]["content"]
                 elif sys.platform == "linux":
                     response, history = generate_hf(model, tokenizer, text, history)
                 else:
diff --git a/node-hub/dora-transformers/dora_transformers/main.py b/node-hub/dora-transformers/dora_transformers/main.py
index 18323e78..de386d07 100644
--- a/node-hub/dora-transformers/dora_transformers/main.py
+++ b/node-hub/dora-transformers/dora_transformers/main.py
@@ -14,23 +14,17 @@ logging.basicConfig(level=logging.INFO)
 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
-    "You're a very succinct AI assistant with short answers.",
+    "",
 )
 MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
 DEVICE = os.getenv("DEVICE", "auto")
 TORCH_DTYPE = os.getenv("TORCH_DTYPE", "auto")
-ENABLE_MEMORY_EFFICIENT = os.getenv("ENABLE_MEMORY_EFFICIENT", "true").lower() == "true"
 
-# Configure PyTorch memory management
-if DEVICE == "cuda":
-    # Set memory efficient settings
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-    if ENABLE_MEMORY_EFFICIENT:
-        torch.cuda.empty_cache()
 
 # Words that trigger the model to respond
-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()
+
 
 def load_model():
     """Load the transformer model and tokenizer."""
@@ -42,65 +36,50 @@ def load_model():
         "device_map": DEVICE,
     }
 
-    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-        model_kwargs.update({
-            "low_cpu_mem_usage": True,
-            "offload_folder": "offload",
-            "load_in_8bit": True,
-        })
-
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        **model_kwargs,
-    )
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
     logging.info("Model loaded successfully")
     return model, tokenizer
 
-
 def generate_response(model, tokenizer, text: str, history) -> tuple[str, list]:
     """Generate text using the transformer model."""
     history += [{"role": "user", "content": text}]
     prompt = tokenizer.apply_chat_template(
-        history, tokenize=False, add_generation_prompt=True,
+        history,
+        tokenize=False,
+        add_generation_prompt=True,
     )
 
-    model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
+    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
 
     with torch.inference_mode():
         generated_ids = model.generate(
             **model_inputs,
             max_new_tokens=MAX_TOKENS,
             pad_token_id=tokenizer.pad_token_id,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9,
             repetition_penalty=1.2,
-            length_penalty=0.5,
         )
 
     generated_ids = [
-        output_ids[len(input_ids):]
+        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
     ]
 
     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
     history += [{"role": "assistant", "content": response}]
 
-    # Clear CUDA cache after successful generation if enabled
-    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-        torch.cuda.empty_cache()
-
     return response, history
 
+
 def main():
     """TODO: Add docstring."""
     # Initialize model and conversation history
     model, tokenizer = load_model()
 
     # Initialize history with system prompt
-    history = [{"role": "system", "content": SYSTEM_PROMPT}]
+
+    history = [{"role": "system", "content": SYSTEM_PROMPT}] if SYSTEM_PROMPT else []
 
     node = Node()
     for event in node:
@@ -108,16 +87,15 @@ def main():
             text = event["value"][0].as_py()
             words = text.lower().split()
 
-            if any(word in ACTIVATION_WORDS for word in words):
-                logging.info(f"Processing input: {text}")
-                response, history = generate_response(model, tokenizer, text, history)
-                logging.info(f"Generated response: {response}")
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
+                response, _history = generate_response(model, tokenizer, text, history)
 
                 node.send_output(
-                    output_id="text",
-                    data=pa.array([response]),
-                    metadata={},
+                    output_id="text", data=pa.array([response]), metadata={}
                 )
 
+
 if __name__ == "__main__":
     main()
diff --git a/node-hub/dora-transformers/pyproject.toml b/node-hub/dora-transformers/pyproject.toml
index 1101f471..a6c77034 100644
--- a/node-hub/dora-transformers/pyproject.toml
+++ b/node-hub/dora-transformers/pyproject.toml
@@ -8,22 +8,22 @@ readme = "README.md"
 requires-python = ">=3.9"
 
 dependencies = [
-  "dora-rs >= 0.3.9",
-  "torch == 2.4.0",
-  "torchvision >= 0.19",
-  "torchaudio >= 2.1.0",
-  "opencv-python >= 4.1.1",
-  "modelscope >= 1.18.1",
-  "accelerate >= 1.3.0",
-  "transformers",
-  "bitsandbytes>=0.41.1",
+    "dora-rs >= 0.3.9",
+    "torch == 2.4.0",
+    "torchvision >= 0.19",
+    "torchaudio >= 2.1.0",
+    "opencv-python >= 4.1.1",
+    "modelscope >= 1.18.1",
+    "accelerate >= 1.3.0",
+    "transformers",
+    "bitsandbytes>=0.41.1",
 ]
 
 [dependency-groups]
 dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
 
 [project.scripts]
-dora-transformer = "dora_transformer.main:main"
+dora-transformers = "dora_transformers.main:main"
 
 [tool.ruff.lint]
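Note for reviewers (not part of the patch): all three nodes now default ACTIVATION_WORDS to an empty string, and the new guard `len(ACTIVATION_WORDS) == 0 or any(word in ACTIVATION_WORDS for word in words)` makes an unconfigured node answer every incoming text instead of staying silent. A minimal sketch of that gate in isolation (the `should_respond` helper is illustrative, not something this patch adds):

```python
import os

# Same default as the patched nodes: an empty ACTIVATION_WORDS disables filtering.
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()


def should_respond(text: str) -> bool:
    """Return True when the node should answer the incoming text."""
    words = text.lower().split()
    return len(ACTIVATION_WORDS) == 0 or any(word in ACTIVATION_WORDS for word in words)


if __name__ == "__main__":
    # With the new empty default both prints are True; with e.g.
    # ACTIVATION_WORDS="what how", only the first one is.
    print(should_respond("What is dora?"))
    print(should_respond("Tell me a story."))
```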
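The llama-cpp-python nodes also switch from raw text completion to `Llama.create_chat_completion`, which takes an OpenAI-style `messages` list and returns the reply under `choices[0]["message"]["content"]`; the model's own chat template is applied, so the manual `Q:`/`A:` framing and stop strings are no longer needed. A standalone sketch of that call shape, reusing the node's default repo and file pattern (the `n_ctx` value and prompts are placeholders, not set by this patch):

```python
from llama_cpp import Llama

# Mirrors the node defaults for MODEL_NAME_OR_PATH and MODEL_FILE_PATTERN.
llm = Llama.from_pretrained(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="*Q4_K_M.gguf",
    n_ctx=2048,
    verbose=False,
)

system_prompt = "You are a concise assistant."  # optional, like SYSTEM_PROMPT
history = [{"role": "system", "content": system_prompt}] if system_prompt else []

reply = llm.create_chat_completion(
    messages=history + [{"role": "user", "content": "What does dora-rs do?"}],
    max_tokens=128,
)["choices"][0]["message"]["content"]
print(reply)
```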
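Finally, the `.to(model.device)` change in dora-transformers appears to matter when DEVICE is left at its "auto" default: "auto" is a valid value for transformers' `device_map` but not a valid `torch.device`, so moving the tokenized inputs to `model.device` follows wherever accelerate actually placed the weights. A hedged sketch of that pattern, using the node's default model name:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # same default as the node
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype="auto", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,
)
# "auto" is not a torch device; model.device is wherever accelerate put the weights.
inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
```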