"""TODO: Add docstring."""

import os
import sys

import pyarrow as pa
from dora import Node
from transformers import AutoModelForCausalLM, AutoTokenizer

SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You're a very succinct AI assistant with short answers.",
)


def get_model_gguf():
    """TODO: Add docstring."""
    from llama_cpp import Llama

    llm = Llama.from_pretrained(
        repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF", filename="*fp16.gguf", verbose=False,
    )
    return llm


def get_model_darwin():
    """TODO: Add docstring."""
    from mlx_lm import load

    model, tokenizer = load("mlx-community/Qwen2.5-0.5B-Instruct-8bit")
    return model, tokenizer


def get_model_huggingface():
    """TODO: Add docstring."""
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"

    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()


def generate_hf(model, tokenizer, prompt: str, history) -> str:
    """TODO: Add docstring."""
    history += [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        history, tokenize=False, add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=512)
    generated_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    history += [{"role": "assistant", "content": response}]
    return response, history


def main():
    """TODO: Add docstring."""
    history = []
    # If OS is not Darwin, use Huggingface model
    if sys.platform != "":
        model = get_model_gguf()
    elif sys.platform == "huggingface":
        model, tokenizer = get_model_huggingface()
    else:
        model, tokenizer = get_model_darwin()

    node = Node()

    for event in node:
        if event["type"] == "INPUT":
            # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
            text = event["value"][0].as_py()
            words = text.lower().split()

            if any(word in ACTIVATION_WORDS for word in words):
                # On linux, Windows
                if sys.platform != "":
                    response = model(
                        f"Q: {text} A: ",  # Prompt
                        max_tokens=24,
                        stop=[
                            "Q:",
                            "\n",
                        ],  # Stop generating just before the model would generate a new question
                    )["choices"][0]["text"]
                elif sys.platform == "huggingface":
                    response, history = generate_hf(model, tokenizer, text, history)
                else:
                    from mlx_lm import generate

                    response = generate(
                        model, tokenizer, prompt=text, verbose=False, max_tokens=50,
                    )

                node.send_output(
                    output_id="text", data=pa.array([response]), metadata={},
                )


if __name__ == "__main__":
    main()