@@ -0,0 +1,180 @@
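# Distributed Reachy2 "follow" demo: the robot camera and AV1 encoding run on the
# "encoder" machine, VLM grounding and point tracking run on the "gpu" machine, and
# the audio pipeline plus visualization run on the "macbook" machine.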
nodes:
  - id: camera
    path: dora-reachy2-camera
    _unstable_deploy:
      machine: encoder
    inputs:
      tick: dora/timer/millis/10
    outputs:
      - image_left
      - image_depth
      - depth
    env:
      CAPTURE_PATH: 0
      IMAGE_WIDTH: 640
      IMAGE_HEIGHT: 480
      ROBOT_IP: 127.0.0.1
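  # Encode both camera streams to AV1 with rav1e before they cross machines;
  # RAV1E_SPEED: 10 favors encoding speed over compression density.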
  - id: rav1e-local-image
    path: dora-rav1e
    build: cargo build -p dora-rav1e --release
    _unstable_deploy:
      machine: encoder
    inputs:
      image_depth: camera/image_depth
      image_left: camera/image_left
    outputs:
      - image_left
      - image_depth
      - depth
    env:
      RAV1E_SPEED: 10
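  # Decode the AV1 streams with dav1d once they arrive on the GPU machine.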
  - id: dav1d-remote
    path: dora-dav1d
    build: cargo build -p dora-dav1d --release
    _unstable_deploy:
      machine: gpu
    inputs:
      image_depth: rav1e-local-image/image_depth
      image_left: rav1e-local-image/image_left
      # depth: rav1e-local-image/depth
    outputs:
      - image_left
      - image_depth
      - depth
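  # Speech pipeline: microphone -> voice activity detection -> distil-whisper transcription.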
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    _unstable_deploy:
      machine: macbook
    inputs:
      tick: dora/timer/millis/2000
    outputs:
      - audio

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    _unstable_deploy:
      machine: macbook
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio

  - id: dora-distil-whisper
    build: pip install -e ../../node-hub/dora-distil-whisper
    path: dora-distil-whisper
    _unstable_deploy:
      machine: macbook
    inputs:
      input: dora-vad/audio
    outputs:
      - text
    env:
      TARGET_LANGUAGE: english
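  # Route transcribed commands: "follow ..." becomes a VLM prompt; "left", "right",
  # and "stop" become direct base commands (see parse_whisper.py below).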
  - id: parse_whisper
    path: parse_whisper.py
    _unstable_deploy:
      machine: gpu
    inputs:
      text: dora-distil-whisper/text
    outputs:
      - bbox
      - action
      - points
      - text
    env:
      IMAGE_RESIZE_RATIO: "1.0"

  - id: dora-qwenvl
    build: pip install -e ../../node-hub/dora-qwen2-5-vl
    path: dora-qwen2-5-vl
    _unstable_deploy:
      machine: gpu
    inputs:
      image_left: dav1d-remote/image_left
      text: parse_whisper/text
    outputs:
      - text
    env:
      DEFAULT_QUESTION: Output the bounding box of the suitcase.
      IMAGE_RESIZE_RATIO: "1.0"
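  # Parse the VLM's JSON answer into flat xyxy bounding boxes (see parse_bbox.py below).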
  - id: parse_bbox
    path: parse_bbox.py
    _unstable_deploy:
      machine: gpu
    inputs:
      text: dora-qwenvl/text
      points: parse_whisper/points
    outputs:
      - bbox
    env:
      IMAGE_RESIZE_RATIO: "1.0"

  - id: tracker
    build: pip install -e ../../node-hub/dora-cotracker
    path: dora-cotracker
    _unstable_deploy:
      machine: gpu
    inputs:
      image: dav1d-remote/image_left
      boxes2d: parse_bbox/bbox
    outputs:
      - tracked_image
      - points
    env:
      INTERACTIVE_MODE: false
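  # Optional SAM2 segmentation stage, currently disabled.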
  # - id: sam2
  #   build: pip install -e ../../node-hub/dora-sam2
  #   path: dora-sam2
  #   _unstable_deploy:
  #     machine: gpu
  #   inputs:
  #     image_left: dav1d-remote/image_left
  #     boxes2d: parse_bbox/bbox
  #   outputs:
  #     - masks

  - id: parse_point
    path: parse_point.py
    _unstable_deploy:
      machine: gpu
    inputs:
      points: tracker/points
    outputs:
      - action
    env:
      IMAGE_RESIZE_RATIO: "1.0"
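  # The mobile base accepts velocity commands from both the tracker path (parse_point)
  # and direct voice commands (parse_whisper).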
  - id: reachy-mobile-base
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-mobile-base
    _unstable_deploy:
      machine: encoder
    inputs:
      action_base: parse_point/action
      action_whisper: parse_whisper/action
    outputs:
      - response_base
    env:
      ROBOT_IP: 127.0.0.1
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    _unstable_deploy:
      machine: macbook
    inputs:
      image: dav1d-remote/image_left
      image_depth: dav1d-remote/image_depth
      boxes2d: parse_bbox/bbox
      original_text: dora-distil-whisper/text
      parsed_text: parse_whisper/text
      qwenvl_text: dora-qwenvl/text
      tracked_image: tracker/tracked_image
@@ -0,0 +1,66 @@
"""Parse bounding boxes out of the VLM's JSON answer and forward them as flat xyxy arrays."""

import json
import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
    """Extract bounding boxes from a JSON string wrapped in markdown code fences.

    Parameters
    ----------
    json_text : str
        JSON string containing bounding box data, including ```json markers.

    Returns
    -------
    tuple
        A pair of NumPy arrays (bounding boxes, labels), or (None, None) if
        the text cannot be parsed.

    """
    # Strip whitespace and drop markdown code-fence markers
    lines = json_text.strip().splitlines()
    clean_lines = [line for line in lines if not line.strip().startswith("```")]
    clean_text = "\n".join(clean_lines)

    # Parse the cleaned JSON text
    try:
        data = json.loads(clean_text)
        bboxes = [item["bbox_2d"] for item in data]
        labels = [item["label"] for item in data]
        return np.array(bboxes), np.array(labels)
    except Exception as _e:  # noqa
        pass
    return None, None
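
# Example of the text this node parses (assumed shape of the Qwen2.5-VL grounding
# answer; only the "bbox_2d" and "label" keys are required by the code above):
#
# ```json
# [{"bbox_2d": [57, 112, 204, 301], "label": "suitcase"}]
# ```
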
for event in node:
    if event["type"] == "INPUT":
        # An empty input (e.g. the cleared "points" from parse_whisper) resets the boxes
        if len(event["value"]) == 0:
            node.send_output("bbox", pa.array([]))
            continue
        text = event["value"][0].as_py()
        image_id = event["metadata"]["image_id"]

        bboxes, labels = extract_bboxes(text)
        if bboxes is not None and len(bboxes) > 0:
            # Scale the boxes back up to the original image resolution
            bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO)
            node.send_output(
                "bbox",
                pa.array(bboxes.ravel()),
                metadata={"encoding": "xyxy", "image_id": image_id},
            )
@@ -0,0 +1,47 @@
"""Turn the tracked target point into velocity commands for the mobile base."""

import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
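# Action layout (assumed from the dora-reachy2 mobile-base node): a 6-vector
# [x, y, z, rx, ry, rz], with forward speed x in m/s and yaw rate rz in rad/s.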
for event in node:
    if event["type"] == "INPUT":
        width = event["metadata"]["width"]
        height = event["metadata"]["height"]

        # Points arrive as a flat [x0, y0, x1, y1, ...] array
        values = event["value"].to_numpy().reshape((-1, 2))
        values = values * int(1 / IMAGE_RESIZE_RATIO)

        if len(values) == 0:
            print("No points detected")
            continue
        elif len(values) > 1:
            print("Multiple points detected, taking the first one")
        point = values[0]

        # Normalized horizontal offset of the target from the image center, in [-1, 1]
        rz = int((width / 2) - point[0]) / (width / 2)
        # Vertical distance to the bottom of the frame, a proxy for distance to the target
        x_distance = min(height / 2, height - point[1])

        # Saturate the turn rate at 30 deg/s once the target is more than 10% off-center
        if abs(rz) > 0.1:
            rz = np.deg2rad(30) * np.sign(rz)

        # Drive forward only while the target is still far away
        x = 0.5 if x_distance > (height * 0.15) else 0.0

        action = pa.array([x, 0, 0, 0, 0, rz])
        node.send_output("action", action)
@@ -0,0 +1,75 @@
"""Route transcribed voice commands to the VLM, the tracker, and the mobile base."""

import json
import os
import time

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
    """Extract bounding boxes from a JSON string wrapped in markdown code fences.

    Parameters
    ----------
    json_text : str
        JSON string containing bounding box data, including ```json markers.

    Returns
    -------
    tuple
        A pair of NumPy arrays (bounding boxes, labels), or (None, None) if
        the text cannot be parsed.

    """
    # Strip whitespace and drop markdown code-fence markers
    lines = json_text.strip().splitlines()
    clean_lines = [line for line in lines if not line.strip().startswith("```")]
    clean_text = "\n".join(clean_lines)

    # Parse the cleaned JSON text
    try:
        data = json.loads(clean_text)
        bboxes = [item["bbox_2d"] for item in data]
        labels = [item["label"] for item in data]
        return np.array(bboxes), np.array(labels)
    except Exception as _e:  # noqa
        pass
    return None, None
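
# Recognized commands: "stop" clears the tracked points, "follow <object>" asks the
# VLM for a bounding box to track, and "left"/"right" rotate the base in place.
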
for event in node:
    if event["type"] == "INPUT":
        text = event["value"][0].as_py().lower()
        if "stop" in text:
            # Clear the tracked points so the base stops following
            node.send_output("points", pa.array([], type=pa.float64()))
        elif "follow" in text:
            text = f"Given the prompt: {text}. Output the bounding boxes for the given followed object"
            node.send_output("text", pa.array([text]), {"image_id": "image_left"})
        elif "left" in text:
            # Turn the base counter-clockwise and clear any tracked points
            action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)])
            time.sleep(0.5)
            node.send_output("points", pa.array([]))
            node.send_output("action", action)
        elif "right" in text:
            # Turn the base clockwise and clear any tracked points
            action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)])
            time.sleep(0.5)
            node.send_output("points", pa.array([]))
            node.send_output("action", action)
@@ -0,0 +1,42 @@
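# Minimal speech-to-text dataflow for testing the audio chain on its own:
# microphone -> VAD -> distil-whisper -> rerun visualization.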
nodes:
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    _unstable_deploy:
      machine: macbook
    inputs:
      tick: dora/timer/millis/2000
    outputs:
      - audio

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    _unstable_deploy:
      machine: macbook
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio

  - id: dora-distil-whisper
    build: pip install -e ../../node-hub/dora-distil-whisper
    path: dora-distil-whisper
    _unstable_deploy:
      machine: macbook
    inputs:
      input: dora-vad/audio
    outputs:
      - text
    env:
      TARGET_LANGUAGE: english
      # For China
      # USE_MODELSCOPE_HUB: true

  - id: dora-rerun
    build: cargo build -p dora-rerun --release
    path: dora-rerun
    _unstable_deploy:
      machine: macbook
    inputs:
      original_text: dora-distil-whisper/text
@@ -25,6 +25,7 @@ class VideoTrackingNode:
         self.accept_new_points = True
         self.clicked_points = []
         self.input_points = []
+        self.input_masks = []

     def mouse_callback(self, event, x, y, flags, param):
         if event == cv2.EVENT_LBUTTONDOWN:
@@ -52,9 +53,9 @@ class VideoTrackingNode:
             # Track points
             pred_tracks, pred_visibility = self.model(
                 video_chunk,
-                queries=queries,
                 is_first_step=self.is_first_step,
                 grid_size=0,
+                queries=queries,
                 add_support_grid=False,
             )
             self.is_first_step = False
@@ -118,6 +119,8 @@ class VideoTrackingNode:
                     "num_points": len(visible_tracks),
                     "dtype": "float32",
                     "shape": (len(visible_tracks), 2),
+                    "width": frame.shape[1],
+                    "height": frame.shape[0],
                 },
             )
@@ -153,7 +156,7 @@ class VideoTrackingNode:
                 cv2.imshow("Interactive Feed to track point", display_frame)
                 cv2.waitKey(1)

-            if event["id"] == "points":
+            elif event["id"] == "points":
                 if not self.accept_new_points:
                     continue
                 # Handle points from input_stream node
@@ -162,9 +165,13 @@ class VideoTrackingNode:
                 self.input_points = points_array.reshape((-1, 2)).tolist()
                 self.accept_new_points = False
                 self.is_first_step = True

-            if event["id"] == "boxes2d":
+            elif event["id"] == "boxes2d":
                 if not self.accept_new_points:
                     continue
+                if len(event["value"]) == 0:
+                    self.input_points = []
+                    self.is_first_step = True
+                    continue
                 # Handle points from input_stream node
                 metadata = event["metadata"]
@@ -185,7 +192,11 @@ class VideoTrackingNode:
                 _labels = None

                 self.input_points = [
-                    [int((x_min + x_max) / 2), int((y_min + y_max) / 2)]
+                    [
+                        int(x_min + (x_max - x_min) * 2 / 4),
+                        int(y_min + (y_max - y_min) * i / 10),
+                    ]
+                    for i in range(4, 7)
                     for x_min, y_min, x_max, y_max in boxes2d
                 ]