"""Benchmark a dora LLM node: send a prompt repeatedly, measure latency and
token throughput, print a summary, and append the results to `benchmark.csv`.

Originally added as `benchs/llms/benchs.py` (patch: "Adding benchmarking tool
for llms").
"""

import argparse
import ast
import csv
import os
import time

import numpy as np


def write_to_csv(filename, header, row):
    """Append ``row`` to ``filename``, creating the file with ``header`` first.

    If the file does not exist it is created and the header is written once;
    subsequent calls only append rows, so repeated benchmark runs accumulate.

    :param filename: Path of the CSV file.
    :param header: List of column names used as the header on creation.
    :param row: List of values written as one CSV row.
    """
    file_exists = os.path.exists(filename)

    with open(
        filename, mode="a" if file_exists else "w", newline="", encoding="utf8"
    ) as file:
        writer = csv.writer(file)

        # Write the header only when the file is first created.
        if not file_exists:
            writer.writerow(header)
            print(f"File '{filename}' created with header: {header}")

        # Write the row.
        writer.writerow(row)
        print(f"Row written to '{filename}': {row}")


def main():
    """Run 50 send/receive round-trips against the LLM node and record stats."""
    # Imported lazily so write_to_csv stays importable without the dora
    # runtime or pyarrow installed (e.g. for unit tests of the CSV helper).
    import pyarrow as pa
    from dora import Node

    # Handle dynamic nodes: ask for the name of the node in the dataflow, and
    # the same values as the ENV variables.
    parser = argparse.ArgumentParser(description="Simple arrow sender")

    parser.add_argument(
        "--name",
        type=str,
        required=False,
        help="The name of the node in the dataflow.",
        default="pyarrow-sender",
    )
    parser.add_argument(
        "--data",
        type=str,
        required=False,
        help="Arrow Data as string.",
        default=None,
    )

    args = parser.parse_args()

    # The DATA environment variable takes precedence over --data.
    data = os.getenv("DATA", args.data)

    node = Node(
        args.name,
    )  # provide the name to connect to the dataflow if dynamic node
    # NOTE(review): assumes the benchmarked LLM is always the second entry of
    # the dataflow descriptor — verify against the benchmark YAML files.
    name = node.dataflow_descriptor()["nodes"][1]["path"]

    if data is None:
        raise ValueError(
            "No data provided. Please specify `DATA` environment argument or as `--data` argument",
        )
    try:
        # Accept Python literals (lists, numbers, ...); fall back to raw string.
        data = ast.literal_eval(data)
    except Exception:  # noqa
        print("Passing input as string")

    if isinstance(data, (str, int, float)):
        data = pa.array([data])
    else:
        data = pa.array(data)  # initialize pyarrow array

    durations = []
    speed = []
    tokens = 0  # last-seen token count; reported as "Total tokens" below
    for _ in range(50):
        start_time = time.time()
        node.send_output("data", data)
        event = node.next()
        duration = time.time() - start_time
        if event is not None and event["type"] == "INPUT":
            text = event["value"][0].as_py()
            tokens = event["metadata"].get("tokens", 6)
            # Sanity-check the model echoed the prompt. Raise explicitly
            # instead of `assert` so the check survives `python -O`.
            if "this is a test" not in text.lower():
                raise AssertionError(f"Expected 'This is a test', got {text}")
            durations.append(duration)
            speed.append(tokens / duration)
        time.sleep(0.1)

    # Fail loudly instead of dividing by zero when the node never answered.
    if not durations:
        raise RuntimeError("No INPUT events received; nothing to benchmark.")

    durations = np.array(durations)
    speed = np.array(speed)
    print(
        f"\nAverage duration: {sum(durations) / len(durations)}"
        + f"\nMax duration: {max(durations)}"
        + f"\nMin duration: {min(durations)}"
        + f"\nMedian duration: {np.median(durations)}"
        + f"\nMedian frequency: {1/np.median(durations)}"
        + f"\nAverage speed: {sum(speed) / len(speed)}"
        + f"\nMax speed: {max(speed)}"
        + f"\nMin speed: {min(speed)}"
        + f"\nMedian speed: {np.median(speed)}"
        + f"\nTotal tokens: {tokens}"
    )
    write_to_csv(
        "benchmark.csv",
        [
            "path",
            "date",
            "average_duration",
            "max_duration",
            "min_duration",
            "median_duration",
            "median_frequency",
            "average_speed",
            "max_speed",
            "min_speed",
            "median_speed",
            "total_tokens",
        ],
        [
            name,
            time.strftime("%Y-%m-%d %H:%M:%S"),
            sum(durations) / len(durations),
            max(durations),
            min(durations),
            np.median(durations),
            1 / np.median(durations),
            sum(speed) / len(speed),
            max(speed),
            min(speed),
            np.median(speed),
            tokens,
        ],
    )


if __name__ == "__main__":
    main()
b/benchs/llms/llama_cpp_python.yaml
@@ -0,0 +1,27 @@
+nodes:
+  - id: pyarrow-sender
+    path: benchs.py # the benchmarking script; sends DATA and times replies
+    inputs:
+      text: llm/text # the LLM's reply is routed back to the sender
+    outputs:
+      - data
+    env:
+      DATA: "Please only generate the following output: This is a test"
+
+  - id: llm
+    build: pip install -e ../../node-hub/dora-llama-cpp-python
+    path: dora-llama-cpp-python
+    inputs:
+      text:
+        source: pyarrow-sender/data
+        queue-size: 10 # NOTE(review): presumably the dora input buffer depth — confirm
+    outputs:
+      - text
+    env:
+      MODEL_NAME_OR_PATH: "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
+      MODEL_FILE_PATTERN: "*fp16.gguf"
+      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
+      MAX_TOKENS: "512"
+      N_GPU_LAYERS: "35" # Enable GPU acceleration
+      N_THREADS: "16" # CPU threads
+      CONTEXT_SIZE: "4096" # Maximum context window
diff --git a/benchs/llms/phi4.yaml b/benchs/llms/phi4.yaml
new file mode 100644
index 00000000..5a5cb45a
--- /dev/null
+++ b/benchs/llms/phi4.yaml
@@ -0,0 +1,19 @@
+nodes:
+  - id: pyarrow-sender
+    path: benchs.py # same benchmark driver as the other dataflows
+    inputs:
+      text: llm/text
+    outputs:
+      - data
+    env:
+      DATA: "Please only generate the following output: This is a test"
+
+  - id: llm
+    build: |
+      pip install flash-attn --no-build-isolation
+      pip install -e ../../node-hub/dora-phi4
+    path: dora-phi4
+    inputs:
+      text: pyarrow-sender/data
+    outputs:
+      - text
diff --git a/benchs/llms/qwen2.5.yaml b/benchs/llms/qwen2.5.yaml
new file mode 100644
index 00000000..1c1c09be
--- /dev/null
+++ b/benchs/llms/qwen2.5.yaml
@@ -0,0 +1,21 @@
+nodes:
+  - id: pyarrow-sender
+    path: benchs.py # same benchmark driver as the other dataflows
+    inputs:
+      text: llm/text
+    outputs:
+      - data
+    env:
+      DATA: "Please only generate the following output: This is a test"
+
+  - id: llm
+    build: |
+      pip install flash-attn --no-build-isolation
+      pip install -e ../../node-hub/dora-qwen
+    path: dora-qwen
+    inputs:
+      text:
+        source: pyarrow-sender/data
+        queue-size: 10
+    outputs:
+      - text
diff --git a/benchs/llms/transformers.yaml b/benchs/llms/transformers.yaml
new file mode 100644
index 00000000..f50b6a06
--- /dev/null
+++ b/benchs/llms/transformers.yaml
@@ -0,0 +1,26 @@
+nodes:
+  - id: pyarrow-sender
+    path: benchs.py # same benchmark driver as the other dataflows
+    inputs:
+      text: llm/text
+    outputs:
+      - data
+    env:
+      DATA: "Please only generate the following output: This is a test"
+
+  - id: llm
+    build: pip install -e ../../node-hub/dora-transformers
+    path: dora-transformers
+    inputs:
+      text:
+        source: pyarrow-sender/data
+        queue-size: 10
+    outputs:
+      - text
+    env:
+      MODEL_NAME: "Qwen/Qwen2.5-0.5B-Instruct" # Model from Hugging Face
+      SYSTEM_PROMPT: "You're a very succinct AI assistant with short answers."
+      MAX_TOKENS: "128" # Reduced for concise responses
+      DEVICE: "cuda" # Use "cpu" for CPU, "cuda" for NVIDIA GPU, "mps" for Apple Silicon
+      ENABLE_MEMORY_EFFICIENT: "true" # Enable 8-bit quantization and memory optimizations
+      TORCH_DTYPE: "float16" # Use half precision for better memory efficiency