@@ -14,23 +14,17 @@ logging.basicConfig(level=logging.INFO)
 # Environment variables for model configuration
 SYSTEM_PROMPT = os.getenv(
     "SYSTEM_PROMPT",
-    "You're a very succinct AI assistant with short answers.",
+    "",
 )
 MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
 DEVICE = os.getenv("DEVICE", "auto")
 TORCH_DTYPE = os.getenv("TORCH_DTYPE", "auto")
-ENABLE_MEMORY_EFFICIENT = os.getenv("ENABLE_MEMORY_EFFICIENT", "true").lower() == "true"
-
-# Configure PyTorch memory management
-if DEVICE == "cuda":
-    # Set memory efficient settings
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-    if ENABLE_MEMORY_EFFICIENT:
-        torch.cuda.empty_cache()
 
 # Words that trigger the model to respond
-ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
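+# Empty default = no filter: with no activation words set, every message gets a reply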
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()
 
 
 def load_model():
     """Load the transformer model and tokenizer."""
@@ -42,65 +36,50 @@ def load_model():
         "device_map": DEVICE,
     }
 
-    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-        model_kwargs.update({
-            "low_cpu_mem_usage": True,
-            "offload_folder": "offload",
-            "load_in_8bit": True,
-        })
-
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        **model_kwargs,
-    )
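+    # DEVICE defaults to "auto", letting transformers/accelerate pick the placement via device_map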
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     logging.info("Model loaded successfully")
     return model, tokenizer
 
 
 def generate_response(model, tokenizer, text: str, history) -> tuple[str, list]:
     """Generate text using the transformer model."""
     history += [{"role": "user", "content": text}]
 
     prompt = tokenizer.apply_chat_template(
-        history, tokenize=False, add_generation_prompt=True,
+        history,
+        tokenize=False,
+        add_generation_prompt=True,
     )
 
-    model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
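+    # "auto" is not a valid torch device string; model.device is the device the weights were actually loaded onto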
+    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
 
     with torch.inference_mode():
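+        # temperature/top_p control sampling variety; length_penalty is only used by beam search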
         generated_ids = model.generate(
             **model_inputs,
             max_new_tokens=MAX_TOKENS,
             pad_token_id=tokenizer.pad_token_id,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
             repetition_penalty=1.2,
             length_penalty=0.5,
         )
 
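+    # Keep only the newly generated tokens, dropping the echoed prompt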
     generated_ids = [
-        output_ids[len(input_ids):]
+        output_ids[len(input_ids) :]
         for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
     ]
 
     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
     history += [{"role": "assistant", "content": response}]
 
-    # Clear CUDA cache after successful generation if enabled
-    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-        torch.cuda.empty_cache()
-
     return response, history
 
 
 def main():
     """TODO: Add docstring."""
     # Initialize model and conversation history
     model, tokenizer = load_model()
-    # Initialize history with system prompt
-    history = [{"role": "system", "content": SYSTEM_PROMPT}]
-
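+    # Seed the conversation only when a system prompt is actually configured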
+    history = [{"role": "system", "content": SYSTEM_PROMPT}] if SYSTEM_PROMPT else []
     node = Node()
 
     for event in node:
@@ -108,16 +87,15 @@ def main():
             text = event["value"][0].as_py()
             words = text.lower().split()
 
-            if any(word in ACTIVATION_WORDS for word in words):
-                logging.info(f"Processing input: {text}")
-                response, history = generate_response(model, tokenizer, text, history)
-                logging.info(f"Generated response: {response}")
+            if len(ACTIVATION_WORDS) == 0 or any(
+                word in ACTIVATION_WORDS for word in words
+            ):
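+                # generate_response mutates history in place, so the returned value can be ignored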
+                response, _history = generate_response(model, tokenizer, text, history)
 
                 node.send_output(
-                    output_id="text",
-                    data=pa.array([response]),
-                    metadata={},
+                    output_id="text", data=pa.array([response]), metadata={}
                 )
 
 
 if __name__ == "__main__":
     main()