nodes:
  # Drives the benchmark: sends the prompt in TEXT and scores the llm
  # node's reply against TEXT_TRUTH.
  - id: benchmark_script
    build: |
      pip install ../mllm
    path: ../mllm/benchmark_script.py
    inputs:
      text: llm/text
    outputs:
      - text
    env:
      TEXT: "Please only generate the following output: This is a test"
      TEXT_TRUTH: "This is a test"

  # Runs the model under test via llama.cpp and returns a completion for
  # every prompt it receives.
  - id: llm
    build: pip install -e ../../node-hub/dora-llama-cpp-python
    path: dora-llama-cpp-python
    inputs:
      text:
        source: benchmark_script/text
        queue_size: 10
    outputs:
      - text
    env:
      MODEL_NAME_OR_PATH: "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
      MODEL_FILE_PATTERN: "*fp16.gguf"
      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
      MAX_TOKENS: "512"
      N_GPU_LAYERS: "35" # Enable GPU acceleration
      N_THREADS: "16" # CPU threads
      CONTEXT_SIZE: "4096" # Maximum context window
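
# A minimal sketch (an assumption, not shipped with this dataflow) of what
# ../mllm/benchmark_script.py might look like using the dora Python API: it
# publishes TEXT once, then checks the llm node's reply against TEXT_TRUTH.
# Kept as comments so this file stays valid YAML.
#
#   import os
#   import pyarrow as pa
#   from dora import Node
#
#   node = Node()
#   # Kick off the loop by sending the benchmark prompt to the llm node.
#   node.send_output("text", pa.array([os.environ["TEXT"]]))
#   for event in node:
#       if event["type"] == "INPUT" and event["id"] == "text":
#           reply = event["value"][0].as_py()
#           print("match:", reply.strip() == os.environ["TEXT_TRUTH"])
#           break
#
# To run the dataflow (assuming this file is saved as benchmark.yml):
#   dora build benchmark.yml
#   dora start benchmark.yml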