"""Benchmark a dora LLM node: send a prompt repeatedly, measure latency and
token throughput, print a summary, and append the results to `benchmark.csv`.

Originally added as `benchs/llms/benchs.py` (patch: "Adding benchmarking tool
for llms").
"""

import argparse
import ast
import csv
import os
import time

import numpy as np


def write_to_csv(filename, header, row):
    """Append ``row`` to ``filename``, creating the file with ``header`` first.

    If the file does not exist it is created and the header is written once;
    subsequent calls only append rows, so repeated benchmark runs accumulate.

    :param filename: Path of the CSV file.
    :param header: List of column names used as the header on creation.
    :param row: List of values written as one CSV row.
    """
    file_exists = os.path.exists(filename)

    with open(
        filename, mode="a" if file_exists else "w", newline="", encoding="utf8"
    ) as file:
        writer = csv.writer(file)

        # Write the header only when the file is first created.
        if not file_exists:
            writer.writerow(header)
            print(f"File '{filename}' created with header: {header}")

        # Write the row.
        writer.writerow(row)
        print(f"Row written to '{filename}': {row}")


def main():
    """Run 50 send/receive round-trips against the LLM node and record stats."""
    # Imported lazily so write_to_csv stays importable without the dora
    # runtime or pyarrow installed (e.g. for unit tests of the CSV helper).
    import pyarrow as pa
    from dora import Node

    # Handle dynamic nodes: ask for the name of the node in the dataflow, and
    # the same values as the ENV variables.
    parser = argparse.ArgumentParser(description="Simple arrow sender")

    parser.add_argument(
        "--name",
        type=str,
        required=False,
        help="The name of the node in the dataflow.",
        default="pyarrow-sender",
    )
    parser.add_argument(
        "--data",
        type=str,
        required=False,
        help="Arrow Data as string.",
        default=None,
    )

    args = parser.parse_args()

    # The DATA environment variable takes precedence over --data.
    data = os.getenv("DATA", args.data)

    node = Node(
        args.name,
    )  # provide the name to connect to the dataflow if dynamic node
    # NOTE(review): assumes the benchmarked LLM is always the second entry of
    # the dataflow descriptor — verify against the benchmark YAML files.
    name = node.dataflow_descriptor()["nodes"][1]["path"]

    if data is None:
        raise ValueError(
            "No data provided. Please specify `DATA` environment argument or as `--data` argument",
        )
    try:
        # Accept Python literals (lists, numbers, ...); fall back to raw string.
        data = ast.literal_eval(data)
    except Exception:  # noqa
        print("Passing input as string")

    if isinstance(data, (str, int, float)):
        data = pa.array([data])
    else:
        data = pa.array(data)  # initialize pyarrow array

    durations = []
    speed = []
    tokens = 0  # last-seen token count; reported as "Total tokens" below
    for _ in range(50):
        start_time = time.time()
        node.send_output("data", data)
        event = node.next()
        duration = time.time() - start_time
        if event is not None and event["type"] == "INPUT":
            text = event["value"][0].as_py()
            tokens = event["metadata"].get("tokens", 6)
            # Sanity-check the model echoed the prompt. Raise explicitly
            # instead of `assert` so the check survives `python -O`.
            if "this is a test" not in text.lower():
                raise AssertionError(f"Expected 'This is a test', got {text}")
            durations.append(duration)
            speed.append(tokens / duration)
        time.sleep(0.1)

    # Fail loudly instead of dividing by zero when the node never answered.
    if not durations:
        raise RuntimeError("No INPUT events received; nothing to benchmark.")

    durations = np.array(durations)
    speed = np.array(speed)
    print(
        f"\nAverage duration: {sum(durations) / len(durations)}"
        + f"\nMax duration: {max(durations)}"
        + f"\nMin duration: {min(durations)}"
        + f"\nMedian duration: {np.median(durations)}"
        + f"\nMedian frequency: {1/np.median(durations)}"
        + f"\nAverage speed: {sum(speed) / len(speed)}"
        + f"\nMax speed: {max(speed)}"
        + f"\nMin speed: {min(speed)}"
        + f"\nMedian speed: {np.median(speed)}"
        + f"\nTotal tokens: {tokens}"
    )
    write_to_csv(
        "benchmark.csv",
        [
            "path",
            "date",
            "average_duration",
            "max_duration",
            "min_duration",
            "median_duration",
            "median_frequency",
            "average_speed",
            "max_speed",
            "min_speed",
            "median_speed",
            "total_tokens",
        ],
        [
            name,
            time.strftime("%Y-%m-%d %H:%M:%S"),
            sum(durations) / len(durations),
            max(durations),
            min(durations),
            np.median(durations),
            1 / np.median(durations),
            sum(speed) / len(speed),
            max(speed),
            min(speed),
            np.median(speed),
            tokens,
        ],
    )


if __name__ == "__main__":
    main()
b/benchs/llms/llama_cpp_python.yaml
@@ -0,0 +1,27 @@
+nodes:
+  - id: pyarrow-sender
+    path: benchs.py # the benchmarking script; sends DATA and times replies
+    inputs:
+      text: llm/text # the LLM's reply is routed back to the sender
+    outputs:
+      - data
+    env:
+      DATA: "Please only generate the following output: This is a test"
+
+  - id: llm
+    build: pip install -e ../../node-hub/dora-llama-cpp-python
+    path: dora-llama-cpp-python
+    inputs:
+      text:
+        source: pyarrow-sender/data
+        queue-size: 10 # NOTE(review): presumably the dora input buffer depth — confirm
+    outputs:
+      - text
+    env:
+      MODEL_NAME_OR_PATH: "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
+      MODEL_FILE_PATTERN: "*fp16.gguf"
+      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
+      MAX_TOKENS: "512"
+      N_GPU_LAYERS: "35" # Enable GPU acceleration
+      N_THREADS: "16" # CPU threads
+      CONTEXT_SIZE: "4096" # Maximum context window
diff --git a/benchs/llms/phi4.yaml b/benchs/llms/phi4.yaml
new file mode 100644
index 00000000..5a5cb45a
--- /dev/null
+++ b/benchs/llms/phi4.yaml
@@ -0,0 +1,19 @@
+nodes:
+  - id: pyarrow-sender
+    path: benchs.py # same benchmark driver as the other dataflows
+    inputs:
+      text: llm/text
+    outputs:
+      - data
+    env:
+      DATA: "Please only generate the following output: This is a test"
+
+  - id: llm
+    build: |
+      pip install flash-attn --no-build-isolation
+      pip install -e ../../node-hub/dora-phi4
+    path: dora-phi4
+    inputs:
+      text: pyarrow-sender/data
+    outputs:
+      - text
diff --git a/benchs/llms/qwen2.5.yaml b/benchs/llms/qwen2.5.yaml
new file mode 100644
index 00000000..1c1c09be
--- /dev/null
+++ b/benchs/llms/qwen2.5.yaml
@@ -0,0 +1,21 @@
+nodes:
+  - id: pyarrow-sender
+    path: benchs.py # same benchmark driver as the other dataflows
+    inputs:
+      text: llm/text
+    outputs:
+      - data
+    env:
+      DATA: "Please only generate the following output: This is a test"
+
+  - id: llm
+    build: |
+      pip install flash-attn --no-build-isolation
+      pip install -e ../../node-hub/dora-qwen
+    path: dora-qwen
+    inputs:
+      text:
+        source: pyarrow-sender/data
+        queue-size: 10
+    outputs:
+      - text
diff --git a/benchs/llms/transformers.yaml b/benchs/llms/transformers.yaml
new file mode 100644
index 00000000..f50b6a06
--- /dev/null
+++ b/benchs/llms/transformers.yaml
@@ -0,0 +1,26 @@
+nodes:
+  - id: pyarrow-sender
+    path: benchs.py # same benchmark driver as the other dataflows
+    inputs:
+      text: llm/text
+    outputs:
+      - data
+    env:
+      DATA: "Please only generate the following output: This is a test"
+
+  - id: llm
+    build: pip install -e ../../node-hub/dora-transformers
+    path: dora-transformers
+    inputs:
+      text:
+        source: pyarrow-sender/data
+        queue-size: 10
+    outputs:
+      - text
+    env:
+      MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # Model from Hugging Face
+      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
+      MAX_TOKENS: "128" # Reduced for concise responses
+      DEVICE: "cuda" # Use "cpu" for CPU, "cuda" for NVIDIA GPU, "mps" for Apple Silicon
+      ENABLE_MEMORY_EFFICIENT: "true" # Enable 8-bit quantization and memory optimizations
+      TORCH_DTYPE: "float16" # Use half precision for better memory efficiency