"""TODO: Add docstring.""" import os import sys import pyarrow as pa from dora import Node from transformers import AutoModelForCausalLM, AutoTokenizer SYSTEM_PROMPT = os.getenv( "SYSTEM_PROMPT", "You're a very succinct AI assistant with short answers.", ) def get_model_gguf(): """TODO: Add docstring.""" from llama_cpp import Llama llm = Llama.from_pretrained( repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False, ) return llm def get_model_darwin(): """TODO: Add docstring.""" from mlx_lm import load model, tokenizer = load("mlx-community/Qwen2.5-0.5B-Instruct-8bit") return model, tokenizer def get_model_huggingface(): """TODO: Add docstring.""" model_name = "Qwen/Qwen2.5-0.5B-Instruct" model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", device_map="auto", ) tokenizer = AutoTokenizer.from_pretrained(model_name) return model, tokenizer ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split() def generate_hf(model, tokenizer, prompt: str, history) -> str: """TODO: Add docstring.""" history += [{"role": "user", "content": prompt}] text = tokenizer.apply_chat_template( history, tokenize=False, add_generation_prompt=True, ) model_inputs = tokenizer([text], return_tensors="pt").to(model.device) generated_ids = model.generate(**model_inputs, max_new_tokens=512) generated_ids = [ output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) ] response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] history += [{"role": "assistant", "content": response}] return response, history def main(): """TODO: Add docstring.""" history = [] # If OS is not Darwin, use Huggingface model if sys.platform != "": model = get_model_gguf() elif sys.platform == "huggingface": model, tokenizer = get_model_huggingface() else: model, tokenizer = get_model_darwin() node = Node() for event in node: if event["type"] == "INPUT": # Warning: Make sure to add my_output_id and my_input_id within the dataflow. text = event["value"][0].as_py() words = text.lower().split() if any(word in ACTIVATION_WORDS for word in words): # On linux, Windows if sys.platform != "": response = model( f"Q: {text} A: ", # Prompt max_tokens=24, stop=[ "Q:", "\n", ], # Stop generating just before the model would generate a new question )["choices"][0]["text"] elif sys.platform == "huggingface": response, history = generate_hf(model, tokenizer, text, history) else: from mlx_lm import generate response = generate( model, tokenizer, prompt=text, verbose=False, max_tokens=50, ) node.send_output( output_id="text", data=pa.array([response]), metadata={}, ) if __name__ == "__main__": main()