Rename benchs -> benches to be conformant with cargo

11 months ago · 5ebd723c52
--- a/benches/llms/.gitignore
+++ b/benches/llms/.gitignore
--- a/benches/llms/README.md
+++ b/benches/llms/README.md
--- a/benches/llms/llama_cpp_python.yaml
+++ b/benches/llms/llama_cpp_python.yaml
@@ -1,10 +1,10 @@
 nodes:
  - id: benchmark_script
    path: benchmark_script.py
    path: ../mllm/benchmark_script.py
    inputs:
      text: llm/text
    outputs:
      - data
      - text
    env:
      DATA: "Please only generate the following output: This is a test"

--- a/benches/llms/phi4.yaml
+++ b/benches/llms/phi4.yaml
@@ -1,10 +1,10 @@
 nodes:
  - id: benchmark_script
    path: benchmark_script.py
    path: ../mllm/benchmark_script.py
    inputs:
      text: llm/text
    outputs:
      - data
      - text
    env:
      DATA: "Please only generate the following output: This is a test"

--- a/benches/llms/qwen2.5.yaml
+++ b/benches/llms/qwen2.5.yaml
@@ -1,10 +1,10 @@
 nodes:
  - id: benchmark_script
    path: benchmark_script.py
    path: ../mllm/benchmark_script.py
    inputs:
      text: llm/text
    outputs:
      - data
      - text
    env:
      DATA: "Please only generate the following output: This is a test"

--- a/benches/llms/transformers.yaml
+++ b/benches/llms/transformers.yaml
@@ -1,10 +1,10 @@
 nodes:
  - id: benchmark_script
    path: benchmark_script.py
    path: ../mllm/benchmark_script.py
    inputs:
      text: llm/text
    outputs:
      - data
      - text
    env:
      DATA: "Please only generate the following output: This is a test"

--- a/benches/mllm/.gitignore
+++ b/benches/mllm/.gitignore
--- a/benches/mllm/README.md
+++ b/benches/mllm/README.md
--- a/benches/mllm/benchmark_script.py
+++ b/benches/mllm/benchmark_script.py
--- a/benches/mllm/phi4.yaml
+++ b/benches/mllm/phi4.yaml
--- a/benches/vlm/.gitignore
+++ b/benches/vlm/.gitignore
--- a/benches/vlm/README.md
+++ b/benches/vlm/README.md
--- a/benches/vlm/magma.yaml
+++ b/benches/vlm/magma.yaml
@@ -1,6 +1,6 @@
 nodes:
  - id: benchmark_script
    path: benchmark_script.py
    path: ../mllm/benchmark_script.py
    inputs:
      text: llm/text
    outputs:
--- a/benches/vlm/phi4.yaml
+++ b/benches/vlm/phi4.yaml
@@ -1,6 +1,6 @@
 nodes:
  - id: benchmark_script
    path: benchmark_script.py
    path: ../mllm/benchmark_script.py
    inputs:
      text: llm/text
    outputs:
--- a/benches/vlm/qwen2.5vl.yaml
+++ b/benches/vlm/qwen2.5vl.yaml
@@ -1,6 +1,6 @@
 nodes:
  - id: benchmark_script
    path: benchmark_script.py
    path: ../mllm/benchmark_script.py
    inputs:
      text: vlm/text
    outputs:
--- a/benchs/llms/benchmark_script.py
+++ b/benchs/llms/benchmark_script.py
@@ -1,149 +0,0 @@
 """TODO: Add docstring."""

 import argparse
 import ast

 # Create an empty csv file with header in the current directory if file does not exist
 import csv
 import os
 import time

 import numpy as np
 import pyarrow as pa
 from dora import Node


 def write_to_csv(filename, header, row):
    """
    Append a row to a CSV file, writing the header first when the file is new.

    The header is written when the file does not exist yet or exists but is
    empty, so the resulting file always starts with the column names.

    :param filename: Name of the CSV file.
    :param header: List of column names to use as the header.
    :param row: List of data to write as a row in the CSV file.
    """
    # An existing zero-byte file (e.g. left by an interrupted run) still needs a header.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    # Append mode creates the file when it is missing, so a single mode covers both cases.
    with open(filename, mode="a", newline="", encoding="utf8") as file:
        writer = csv.writer(file)

        # Write the header if the file is being created (or was empty)
        if needs_header:
            writer.writerow(header)
            print(f"File '{filename}' created with header: {header}")

        # Write the row
        writer.writerow(row)
        print(f"Row written to '{filename}': {row}")


 def main():
    # Handle dynamic nodes, ask for the name of the node in the dataflow, and the same values as the ENV variables.
    """
    Benchmark a text-generation node and record timing statistics.

    Sends the prompt taken from the `DATA` environment variable (or `--data`)
    on the `data` output ten times, times each reply, prints summary
    statistics and appends them as one row to `benchmark.csv`.

    :raises ValueError: If no prompt is given through `DATA` or `--data`.
    :raises AssertionError: If a reply does not contain "this is a test".
    :raises RuntimeError: If none of the iterations produced a reply.
    """
    parser = argparse.ArgumentParser(description="Simple arrow sender")

    parser.add_argument(
        "--name",
        type=str,
        required=False,
        help="The name of the node in the dataflow.",
        default="pyarrow-sender",
    )
    parser.add_argument(
        "--data",
        type=str,
        required=False,
        help="Arrow Data as string.",
        default=None,
    )

    args = parser.parse_args()

    # The environment variable takes precedence over the command-line argument.
    data = os.getenv("DATA", args.data)

    node = Node(
        args.name,
    )  # provide the name to connect to the dataflow if dynamic node
    # NOTE(review): assumes the benchmarked node is the second entry of the
    # dataflow descriptor; its path labels the CSV row - confirm against the YAML.
    name = node.dataflow_descriptor()["nodes"][1]["path"]

    if data is None:
        raise ValueError(
            "No data provided. Please specify `DATA` environment argument or as `--data` argument",
        )
    try:
        data = ast.literal_eval(data)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal: keep the raw string as the payload.
        print("Passing input as string")

    if isinstance(data, (str, int, float)):
        data = pa.array([data])
    else:
        data = pa.array(data)  # initialize pyarrow array

    durations = []
    speed = []
    tokens = None
    for _ in range(10):
        # perf_counter is monotonic, so the measurement is immune to clock adjustments.
        start_time = time.perf_counter()
        node.send_output("data", data)
        event = node.next()
        duration = time.perf_counter() - start_time
        if event is None or event["type"] != "INPUT":
            continue
        text = event["value"][0].as_py()
        # NOTE(review): 6 is the fallback when the reply metadata has no token
        # count - presumably the size of the expected sentence; confirm.
        tokens = event["metadata"].get("tokens", 6)
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if "this is a test" not in text.lower():
            raise AssertionError(f"Expected 'This is a test', got {text}")
        durations.append(duration)
        speed.append(tokens / duration)
        time.sleep(0.1)

    if not durations:
        raise RuntimeError(
            "No reply was received from the benchmarked node; no statistics to report",
        )

    durations = np.array(durations)
    speed = np.array(speed)
    median_duration = np.median(durations)
    # (printed label, CSV column, value); `tokens` is the count reported by the last reply.
    summary = [
        ("Average duration", "average_duration(s)", durations.mean()),
        ("Max duration", "max_duration(s)", durations.max()),
        ("Min duration", "min_duration(s)", durations.min()),
        ("Median duration", "median_duration(s)", median_duration),
        ("Median frequency", "median_frequency(Hz)", 1 / median_duration),
        ("Average speed", "average_speed(tok/s)", speed.mean()),
        ("Max speed", "max_speed(tok/s)", speed.max()),
        ("Min speed", "min_speed(tok/s)", speed.min()),
        ("Median speed", "median_speed(tok/s)", np.median(speed)),
        ("Total tokens", "total_tokens", tokens),
    ]
    print("".join(f"\n{label}: {value}" for label, _, value in summary))
    write_to_csv(
        "benchmark.csv",
        ["path", "date", *(column for _, column, _ in summary)],
        [
            name,
            time.strftime("%Y-%m-%d %H:%M:%S"),
            *(value for _, _, value in summary),
        ],
    )


 if __name__ == "__main__":
    main()
--- a/benchs/vlm/benchmark_script.py
+++ b/benchs/vlm/benchmark_script.py
@@ -1,184 +0,0 @@
 """TODO: Add docstring."""

 import argparse
 import ast

 # Create an empty csv file with header in the current directory if file does not exist
 import csv
 import os
 import time
 from io import BytesIO

 import cv2
 import numpy as np
 import pyarrow as pa
 import requests
 from dora import Node
 from PIL import Image

 CAT_URL = "https://i.ytimg.com/vi/fzzjgBAaWZw/hqdefault.jpg"


 def get_cat_image():
    """
    Get a cat image as a numpy array.

    Downloads the picture at `CAT_URL`, decodes it as RGB and resizes it to
    640x480 (width x height).

    :return: Cat image as a numpy array of shape (480, 640, 3) in RGB order.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    # Fetch the image from the URL; the timeout keeps the benchmark from hanging forever.
    response = requests.get(CAT_URL, timeout=30)
    response.raise_for_status()

    # Open the image using PIL and force three RGB channels, since the caller
    # publishes the pixels with the "rgb8" encoding.
    image = Image.open(BytesIO(response.content)).convert("RGB")

    # Convert the image to a numpy array
    image_array = np.array(image)

    # cv2.resize returns a new array (it does not resize in place), so the
    # result must be kept; dsize is given as (width, height).
    image_array = cv2.resize(image_array, (640, 480))

    return image_array


 def write_to_csv(filename, header, row):
    """
    Append a row to a CSV file, writing the header first when the file is new.

    The header is written when the file does not exist yet or exists but is
    empty, so the resulting file always starts with the column names.

    :param filename: Name of the CSV file.
    :param header: List of column names to use as the header.
    :param row: List of data to write as a row in the CSV file.
    """
    # An existing zero-byte file (e.g. left by an interrupted run) still needs a header.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    # Append mode creates the file when it is missing, so a single mode covers both cases.
    with open(filename, mode="a", newline="", encoding="utf8") as file:
        writer = csv.writer(file)

        # Write the header if the file is being created (or was empty)
        if needs_header:
            writer.writerow(header)
            print(f"File '{filename}' created with header: {header}")

        # Write the row
        writer.writerow(row)
        print(f"Row written to '{filename}': {row}")


 def main():
    # Handle dynamic nodes, ask for the name of the node in the dataflow, and the same values as the ENV variables.
    """
    Benchmark a vision-language node and record timing statistics.

    For each of ten iterations the cat image is published on the `image`
    output, then the prompt taken from the `DATA` environment variable (or
    `--data`) is published on the `text` output and the reply is timed.
    Summary statistics are printed and appended as one row to `benchmark.csv`.

    :raises ValueError: If no prompt is given through `DATA` or `--data`.
    :raises AssertionError: If a reply does not contain "this is a cat".
    :raises RuntimeError: If none of the iterations produced a reply.
    """
    parser = argparse.ArgumentParser(description="Simple arrow sender")

    parser.add_argument(
        "--name",
        type=str,
        required=False,
        help="The name of the node in the dataflow.",
        default="pyarrow-sender",
    )
    parser.add_argument(
        "--data",
        type=str,
        required=False,
        help="Arrow Data as string.",
        default=None,
    )

    args = parser.parse_args()

    # The environment variable takes precedence over the command-line argument.
    data = os.getenv("DATA", args.data)

    node = Node(
        args.name,
    )  # provide the name to connect to the dataflow if dynamic node
    # NOTE(review): assumes the benchmarked node is the second entry of the
    # dataflow descriptor; its path labels the CSV row - confirm against the YAML.
    name = node.dataflow_descriptor()["nodes"][1]["path"]

    if data is None:
        raise ValueError(
            "No data provided. Please specify `DATA` environment argument or as `--data` argument",
        )
    try:
        data = ast.literal_eval(data)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal: keep the raw string as the payload.
        print("Passing input as string")

    if isinstance(data, (str, int, float)):
        data = pa.array([data])
    else:
        data = pa.array(data)  # initialize pyarrow array

    cat = get_cat_image()
    durations = []
    speed = []
    tokens = None
    for _ in range(10):
        # The image is sent (and given time to arrive) before the clock starts,
        # so only the text round-trip is measured.
        node.send_output(
            "image",
            pa.array(cat.ravel()),
            {"encoding": "rgb8", "width": cat.shape[1], "height": cat.shape[0]},
        )
        time.sleep(0.1)
        # perf_counter is monotonic, so the measurement is immune to clock adjustments.
        start_time = time.perf_counter()
        node.send_output("text", data)
        event = node.next()
        duration = time.perf_counter() - start_time
        if event is None or event["type"] != "INPUT":
            continue
        text = event["value"][0].as_py()
        # NOTE(review): 6 is the fallback when the reply metadata has no token
        # count - presumably the size of the expected sentence; confirm.
        tokens = event["metadata"].get("tokens", 6)
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if "this is a cat" not in text.lower():
            raise AssertionError(f"Expected 'This is a cat', got {text}")
        durations.append(duration)
        speed.append(tokens / duration)
        time.sleep(0.1)

    if not durations:
        raise RuntimeError(
            "No reply was received from the benchmarked node; no statistics to report",
        )

    durations = np.array(durations)
    speed = np.array(speed)
    median_duration = np.median(durations)
    # (printed label, CSV column, value); `tokens` is the count reported by the last reply.
    summary = [
        ("Average duration", "average_duration(s)", durations.mean()),
        ("Max duration", "max_duration(s)", durations.max()),
        ("Min duration", "min_duration(s)", durations.min()),
        ("Median duration", "median_duration(s)", median_duration),
        ("Median frequency", "median_frequency(Hz)", 1 / median_duration),
        ("Average speed", "average_speed(tok/s)", speed.mean()),
        ("Max speed", "max_speed(tok/s)", speed.max()),
        ("Min speed", "min_speed(tok/s)", speed.min()),
        ("Median speed", "median_speed(tok/s)", np.median(speed)),
        ("Total tokens", "total_tokens", tokens),
    ]
    print("".join(f"\n{label}: {value}" for label, _, value in summary))
    write_to_csv(
        "benchmark.csv",
        ["path", "date", *(column for _, column, _ in summary)],
        [
            name,
            time.strftime("%Y-%m-%d %H:%M:%S"),
            *(value for _, _, value in summary),
        ],
    )


 if __name__ == "__main__":
    main()