Browse Source

Add benchmarking tool for LLMs

tags/v0.3.11-rc1
haixuanTao 10 months ago
parent
commit
2d0bb81c68
6 changed files with 243 additions and 0 deletions
  1. +1
    -0
      benchs/llms/.gitignore
  2. +149
    -0
      benchs/llms/benchs.py
  3. +27
    -0
      benchs/llms/llama_cpp_python.yaml
  4. +19
    -0
      benchs/llms/phi4.yaml
  5. +21
    -0
      benchs/llms/qwen2.5.yaml
  6. +26
    -0
      benchs/llms/transformers.yaml

+ 1
- 0
benchs/llms/.gitignore View File

@@ -0,0 +1 @@
*.csv

+ 149
- 0
benchs/llms/benchs.py View File

@@ -0,0 +1,149 @@
"""TODO: Add docstring."""

import argparse
import ast

# Create an empty csv file with header in the current directory if file does not exist
import csv
import os
import time

import numpy as np
import pyarrow as pa
from dora import Node


def write_to_csv(filename, header, row):
    """Append a row to a CSV file, creating it with a header if needed.

    If the file does not exist it is created and the header is written
    first; otherwise the row is appended to the existing file.

    :param filename: Name of the CSV file.
    :param header: List of column names to use as the header.
    :param row: List of data to write as a row in the CSV file.
    """
    file_exists = os.path.exists(filename)

    with open(
        filename, mode="a" if file_exists else "w", newline="", encoding="utf8"
    ) as file:
        writer = csv.writer(file)

        # Write the header only when the file is being created.
        if not file_exists:
            writer.writerow(header)
            # Fixed: these messages previously printed the literal text
            # '(unknown)' instead of interpolating the filename.
            print(f"File '{filename}' created with header: {header}")

        # Write the row
        writer.writerow(row)
        print(f"Row written to '{filename}': {row}")


def main():
    """Benchmark an LLM node end-to-end.

    Sends the prompt given via the ``DATA`` env var (or ``--data``) into the
    dataflow 50 times, measures the round-trip latency of each reply, then
    prints summary statistics and appends them as one row to benchmark.csv.

    :raises ValueError: if no prompt data was provided.
    :raises RuntimeError: if no LLM reply was ever received.
    """
    # Handle dynamic nodes: ask for the name of the node in the dataflow,
    # and the same values as the ENV variables.
    parser = argparse.ArgumentParser(description="Simple arrow sender")

    parser.add_argument(
        "--name",
        type=str,
        required=False,
        help="The name of the node in the dataflow.",
        default="pyarrow-sender",
    )
    parser.add_argument(
        "--data",
        type=str,
        required=False,
        help="Arrow Data as string.",
        default=None,
    )

    args = parser.parse_args()

    # The DATA environment variable takes precedence over the CLI flag.
    data = os.getenv("DATA", args.data)

    node = Node(
        args.name,
    )  # provide the name to connect to the dataflow if dynamic node
    # Path of the node under test; assumes it is the second entry in the
    # dataflow descriptor -- TODO confirm the descriptor ordering holds.
    name = node.dataflow_descriptor()["nodes"][1]["path"]

    if data is None:
        raise ValueError(
            "No data provided. Please specify `DATA` environment argument or as `--data` argument",
        )
    try:
        # Parse Python literals (lists, numbers, ...); anything else is
        # passed through as a plain string.
        data = ast.literal_eval(data)
    except Exception:  # noqa
        print("Passing input as string")

    if isinstance(data, (str, int, float)):
        data = pa.array([data])
    else:
        data = pa.array(data)  # initialize pyarrow array

    durations = []
    speed = []
    # Default token count; previously `tokens` was only bound inside the
    # reply branch, so the summary below raised NameError when no reply
    # with type INPUT ever arrived.
    tokens = 6
    for _ in range(50):
        start_time = time.time()
        node.send_output("data", data)
        event = node.next()
        duration = time.time() - start_time
        if event is not None and event["type"] == "INPUT":
            text = event["value"][0].as_py()
            tokens = event["metadata"].get("tokens", 6)
            assert (
                "this is a test" in text.lower()
            ), f"Expected 'This is a test', got {text}"
            durations.append(duration)
            speed.append(tokens / duration)
        time.sleep(0.1)

    # Fail loudly instead of dividing by zero when nothing came back.
    if not durations:
        raise RuntimeError("No LLM replies received; nothing to benchmark.")

    durations = np.array(durations)
    speed = np.array(speed)
    print(
        f"\nAverage duration: {sum(durations) / len(durations)}"
        + f"\nMax duration: {max(durations)}"
        + f"\nMin duration: {min(durations)}"
        + f"\nMedian duration: {np.median(durations)}"
        + f"\nMedian frequency: {1/np.median(durations)}"
        + f"\nAverage speed: {sum(speed) / len(speed)}"
        + f"\nMax speed: {max(speed)}"
        + f"\nMin speed: {min(speed)}"
        + f"\nMedian speed: {np.median(speed)}"
        + f"\nTotal tokens: {tokens}"
    )
    write_to_csv(
        "benchmark.csv",
        [
            "path",
            "date",
            "average_duration",
            "max_duration",
            "min_duration",
            "median_duration",
            "median_frequency",
            "average_speed",
            "max_speed",
            "min_speed",
            "median_speed",
            "total_tokens",
        ],
        [
            name,
            time.strftime("%Y-%m-%d %H:%M:%S"),
            sum(durations) / len(durations),
            max(durations),
            min(durations),
            np.median(durations),
            1 / np.median(durations),
            sum(speed) / len(speed),
            max(speed),
            min(speed),
            np.median(speed),
            tokens,
        ],
    )


if __name__ == "__main__":
main()

+ 27
- 0
benchs/llms/llama_cpp_python.yaml View File

@@ -0,0 +1,27 @@
# Benchmark dataflow: pyarrow-sender (benchs.py) round-trips a fixed prompt
# through the llama-cpp-python LLM node and records latency/throughput.
# NOTE(review): indentation restored -- the scraped source was flattened to
# column 0, which is not valid YAML.
nodes:
  - id: pyarrow-sender
    path: benchs.py
    inputs:
      text: llm/text
    outputs:
      - data
    env:
      DATA: "Please only generate the following output: This is a test"

  - id: llm
    build: pip install -e ../../node-hub/dora-llama-cpp-python
    path: dora-llama-cpp-python
    inputs:
      text:
        source: pyarrow-sender/data
        queue-size: 10
    outputs:
      - text
    env:
      MODEL_NAME_OR_PATH: "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
      MODEL_FILE_PATTERN: "*fp16.gguf"
      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
      MAX_TOKENS: "512"
      N_GPU_LAYERS: "35" # Enable GPU acceleration
      N_THREADS: "16" # CPU threads
      CONTEXT_SIZE: "4096" # Maximum context window

+ 19
- 0
benchs/llms/phi4.yaml View File

@@ -0,0 +1,19 @@
# Benchmark dataflow: pyarrow-sender (benchs.py) round-trips a fixed prompt
# through the dora-phi4 LLM node.
# NOTE(review): indentation restored -- the scraped source was flattened to
# column 0, which is not valid YAML.
nodes:
  - id: pyarrow-sender
    path: benchs.py
    inputs:
      text: llm/text
    outputs:
      - data
    env:
      DATA: "Please only generate the following output: This is a test"

  - id: llm
    build: |
      pip install flash-attn --no-build-isolation
      pip install -e ../../node-hub/dora-phi4
    path: dora-phi4
    inputs:
      # NOTE(review): unlike the sibling benchmark dataflows this input has
      # no explicit queue-size -- confirm the default queue size is intended.
      text: pyarrow-sender/data
    outputs:
      - text

+ 21
- 0
benchs/llms/qwen2.5.yaml View File

@@ -0,0 +1,21 @@
# Benchmark dataflow: pyarrow-sender (benchs.py) round-trips a fixed prompt
# through the dora-qwen LLM node.
# NOTE(review): indentation restored -- the scraped source was flattened to
# column 0, which is not valid YAML.
nodes:
  - id: pyarrow-sender
    path: benchs.py
    inputs:
      text: llm/text
    outputs:
      - data
    env:
      DATA: "Please only generate the following output: This is a test"

  - id: llm
    build: |
      pip install flash-attn --no-build-isolation
      pip install -e ../../node-hub/dora-qwen
    path: dora-qwen
    inputs:
      text:
        source: pyarrow-sender/data
        queue-size: 10
    outputs:
      - text

+ 26
- 0
benchs/llms/transformers.yaml View File

@@ -0,0 +1,26 @@
# Benchmark dataflow: pyarrow-sender (benchs.py) round-trips a fixed prompt
# through the dora-transformers LLM node.
# NOTE(review): indentation restored -- the scraped source was flattened to
# column 0, which is not valid YAML.
nodes:
  - id: pyarrow-sender
    path: benchs.py
    inputs:
      text: llm/text
    outputs:
      - data
    env:
      DATA: "Please only generate the following output: This is a test"

  - id: llm
    build: pip install -e ../../node-hub/dora-transformers
    path: dora-transformers
    inputs:
      text:
        source: pyarrow-sender/data
        queue-size: 10
    outputs:
      - text
    env:
      MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # Model from Hugging Face
      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
      MAX_TOKENS: "128" # Reduced for concise responses
      DEVICE: "cuda" # Use "cpu" for CPU, "cuda" for NVIDIA GPU, "mps" for Apple Silicon
      ENABLE_MEMORY_EFFICIENT: "true" # Enable 8-bit quantization and memory optimizations
      TORCH_DTYPE: "float16" # Use half precision for better memory efficiency

Loading…
Cancel
Save