From 87c7df5838f8e3152f26111f835dfd4ce1b6ea59 Mon Sep 17 00:00:00 2001
From: haixuanTao
Date: Wed, 9 Apr 2025 15:14:32 +0200
Subject: [PATCH] Adding example dataflow

---
 examples/reachy2-remote/dataflow_reachy.yml | 180 ++++++++++++++++++
 examples/reachy2-remote/parse_bbox.py       |  66 +++++++
 examples/reachy2-remote/parse_point.py      |  47 +++++
 examples/reachy2-remote/parse_whisper.py    |  75 ++++++++
 examples/reachy2-remote/whisper-dev.yml     |  42 ++++
 .../dora-cotracker/dora_cotracker/main.py   |  19 +-
 6 files changed, 425 insertions(+), 4 deletions(-)
 create mode 100644 examples/reachy2-remote/dataflow_reachy.yml
 create mode 100644 examples/reachy2-remote/parse_bbox.py
 create mode 100644 examples/reachy2-remote/parse_point.py
 create mode 100644 examples/reachy2-remote/parse_whisper.py
 create mode 100644 examples/reachy2-remote/whisper-dev.yml

diff --git a/examples/reachy2-remote/dataflow_reachy.yml b/examples/reachy2-remote/dataflow_reachy.yml
new file mode 100644
index 00000000..aa5574bb
--- /dev/null
+++ b/examples/reachy2-remote/dataflow_reachy.yml
@@ -0,0 +1,180 @@
+nodes:
+  - id: camera
+    path: dora-reachy2-camera
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      tick: dora/timer/millis/10
+    outputs:
+      - image_left
+      - image_depth
+      - depth
+    env:
+      CAPTURE_PATH: 0
+      IMAGE_WIDTH: 640
+      IMAGE_HEIGHT: 480
+      ROBOT_IP: 127.0.0.1
+
+  - id: rav1e-local-image
+    path: dora-rav1e
+    build: cargo build -p dora-rav1e --release
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      image_depth: camera/image_depth
+      image_left: camera/image_left
+    outputs:
+      - image_left
+      - image_depth
+      - depth
+    env:
+      RAV1E_SPEED: 10
+
+  - id: dav1d-remote
+    path: dora-dav1d
+    build: cargo build -p dora-dav1d --release
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      image_depth: rav1e-local-image/image_depth
+      image_left: rav1e-local-image/image_left
+      # depth: rav1e-local-image/depth
+    outputs:
+      - image_left
+      - image_depth
+      - depth
+
+  - id: dora-microphone
+    build: pip install -e ../../node-hub/dora-microphone
+    path: dora-microphone
+    _unstable_deploy:
+      machine: macbook
+    inputs:
+      tick: dora/timer/millis/2000
+    outputs:
+      - audio
+
+  - id: dora-vad
+    build: pip install -e ../../node-hub/dora-vad
+    _unstable_deploy:
+      machine: macbook
+    path: dora-vad
+    inputs:
+      audio: dora-microphone/audio
+    outputs:
+      - audio
+
+  - id: dora-distil-whisper
+    build: pip install -e ../../node-hub/dora-distil-whisper
+    _unstable_deploy:
+      machine: macbook
+    path: dora-distil-whisper
+    inputs:
+      input: dora-vad/audio
+    outputs:
+      - text
+    env:
+      TARGET_LANGUAGE: english
+
+  - id: parse_whisper
+    path: parse_whisper.py
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      text: dora-distil-whisper/text
+    outputs:
+      - bbox
+      - action
+      - points
+      - text
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: dora-qwenvl
+    build: pip install -e ../../node-hub/dora-qwen2-5-vl
+    path: dora-qwen2-5-vl
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      image_left: dav1d-remote/image_left
+      text: parse_whisper/text
+    outputs:
+      - text
+    env:
+      DEFAULT_QUESTION: Output the bounding box of the suitcase.
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: parse_bbox
+    path: parse_bbox.py
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      text: dora-qwenvl/text
+      points: parse_whisper/points
+    outputs:
+      - bbox
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: tracker
+    build: pip install -e ../../node-hub/dora-cotracker
+    path: dora-cotracker
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      image: dav1d-remote/image_left
+      boxes2d: parse_bbox/bbox
+    outputs:
+      - tracked_image
+      - points
+    env:
+      INTERACTIVE_MODE: false
+
+  # - id: sam2
+  #   build: pip install -e ../../node-hub/dora-sam2
+  #   path: dora-sam2
+  #   _unstable_deploy:
+  #     machine: gpu
+  #   inputs:
+  #     image_left: dav1d-remote/image_left
+  #     boxes2d: parse_bbox/bbox
+  #   outputs:
+  #     - masks
+
+  - id: parse_point
+    path: parse_point.py
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      points: tracker/points
+    outputs:
+      - action
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: reachy-mobile-base
+    build: pip install -e ../../node-hub/dora-reachy2
+    path: dora-reachy2-mobile-base
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      action_base: parse_point/action
+      action_whisper: parse_whisper/action
+    outputs:
+      - response_base
+    env:
+      ROBOT_IP: 127.0.0.1
+
+  - id: plot
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    _unstable_deploy:
+      machine: macbook
+    inputs:
+      image: dav1d-remote/image_left
+      image_depth: dav1d-remote/image_depth
+      boxes2d: parse_bbox/bbox
+      original_text: dora-distil-whisper/text
+      parsed_text: parse_whisper/text
+      qwenvl_text: dora-qwenvl/text
+      tracked_image: tracker/tracked_image
diff --git a/examples/reachy2-remote/parse_bbox.py b/examples/reachy2-remote/parse_bbox.py
new file mode 100644
index 00000000..09bca4e7
--- /dev/null
+++ b/examples/reachy2-remote/parse_bbox.py
@@ -0,0 +1,66 @@
+"""Parse bounding boxes out of Qwen2.5-VL text replies and forward them as flat xyxy arrays."""
+
+import json
+import os
+
+import numpy as np
+import pyarrow as pa
+from dora import Node
+
+node = Node()
+
+IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
+
+
+def extract_bboxes(json_text):
+    """Extract bounding boxes and labels from a JSON string with markdown markers.
+
+    Parameters
+    ----------
+    json_text : str
+        JSON string containing bounding box data, including ```json markers.
+
+    Returns
+    -------
+    tuple of np.ndarray
+        Bounding boxes and their labels, or (None, None) if parsing fails.
+ + """ + # Ensure all lines are stripped of whitespace and markers + lines = json_text.strip().splitlines() + + # Filter out lines that are markdown markers + clean_lines = [line for line in lines if not line.strip().startswith("```")] + + # Join the lines back into a single string + clean_text = "\n".join(clean_lines) + # Parse the cleaned JSON text + try: + data = json.loads(clean_text) + + # Extract bounding boxes + bboxes = [item["bbox_2d"] for item in data] + labels = [item["label"] for item in data] + + return np.array(bboxes), np.array(labels) + except Exception as _e: # noqa + pass + return None, None + + +for event in node: + if event["type"] == "INPUT": + if len(event["value"]) == 0: + node.send_output("bbox", pa.array([])) + continue + + text = event["value"][0].as_py() + image_id = event["metadata"]["image_id"] + + bboxes, labels = extract_bboxes(text) + if bboxes is not None and len(bboxes) > 0: + bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO) + node.send_output( + "bbox", + pa.array(bboxes.ravel()), + metadata={"encoding": "xyxy", "image_id": image_id}, + ) diff --git a/examples/reachy2-remote/parse_point.py b/examples/reachy2-remote/parse_point.py new file mode 100644 index 00000000..0617f3d2 --- /dev/null +++ b/examples/reachy2-remote/parse_point.py @@ -0,0 +1,47 @@ +"""TODO: Add docstring.""" + +import json +import os + +import numpy as np +import pyarrow as pa +from dora import Node + +node = Node() + +IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0")) + + +for event in node: + if event["type"] == "INPUT": + text = event["value"][0].as_py() + width = event["metadata"]["width"] + height = event["metadata"]["height"] + values = event["value"].to_numpy().reshape((-1, 2)) + values = values * int(1 / IMAGE_RESIZE_RATIO) + + # Do point 0 first + if len(values) == 0: + print("No points detected") + continue + elif len(values) > 1: + print("Multiple points detected, taking the first one") + point = values[0] + + rz = int((width / 2) - point[0]) / (width / 2) + x_distance = min(height / 2, height - point[1]) + + if abs(rz) > 0.3: + rz = np.deg2rad(30) * np.sign(rz) + elif abs(rz) > 0.1: + rz = np.deg2rad(30) * np.sign(rz) + else: + x = 0 + + if x_distance > (height * 0.15): + x = 0.5 + else: + x = 0 + # Action + action = pa.array([x, 0, 0, 0, 0, rz]) + node.send_output("action", action) diff --git a/examples/reachy2-remote/parse_whisper.py b/examples/reachy2-remote/parse_whisper.py new file mode 100644 index 00000000..e91f4a45 --- /dev/null +++ b/examples/reachy2-remote/parse_whisper.py @@ -0,0 +1,75 @@ +"""TODO: Add docstring.""" + +import json +import os +import time + +import numpy as np +import pyarrow as pa +from dora import Node + +node = Node() + +IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0")) + + +def extract_bboxes(json_text): + """Extract bounding boxes from a JSON string with markdown markers and return them as a NumPy array. + + Parameters + ---------- + json_text : str + JSON string containing bounding box data, including ```json markers. + + Returns + ------- + np.ndarray: NumPy array of bounding boxes. 
+ + """ + # Ensure all lines are stripped of whitespace and markers + lines = json_text.strip().splitlines() + + # Filter out lines that are markdown markers + clean_lines = [line for line in lines if not line.strip().startswith("```")] + + # Join the lines back into a single string + clean_text = "\n".join(clean_lines) + # Parse the cleaned JSON text + try: + data = json.loads(clean_text) + + # Extract bounding boxes + bboxes = [item["bbox_2d"] for item in data] + labels = [item["label"] for item in data] + + return np.array(bboxes), np.array(labels) + except Exception as _e: # noqa + pass + return None, None + + +for event in node: + if event["type"] == "INPUT": + text = event["value"][0].as_py().lower() + + if "stop" in text: + node.send_output("points", pa.array([], type=pa.float64())) + elif "follow" in text: + text = f"Given the prompt: {text}. Output the bounding boxes for the given followed object" + node.send_output("text", pa.array([text]), {"image_id": "image_left"}) + elif "left" in text: + action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)]) + time.sleep(0.25) + action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)]) + time.sleep(0.25) + action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)]) + node.send_output("points", pa.array([])) + node.send_output("action", action) + elif "right" in text: + action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)]) + time.sleep(0.25) + action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)]) + time.sleep(0.25) + action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)]) + node.send_output("points", pa.array([])) + node.send_output("action", action) diff --git a/examples/reachy2-remote/whisper-dev.yml b/examples/reachy2-remote/whisper-dev.yml new file mode 100644 index 00000000..c52e52f4 --- /dev/null +++ b/examples/reachy2-remote/whisper-dev.yml @@ -0,0 +1,42 @@ +nodes: + - id: dora-microphone + build: pip install -e ../../node-hub/dora-microphone + path: dora-microphone + _unstable_deploy: + machine: macbook + inputs: + tick: dora/timer/millis/2000 + outputs: + - audio + + - id: dora-vad + build: pip install -e ../../node-hub/dora-vad + _unstable_deploy: + machine: macbook + path: dora-vad + inputs: + audio: dora-microphone/audio + outputs: + - audio + + - id: dora-distil-whisper + build: pip install -e ../../node-hub/dora-distil-whisper + _unstable_deploy: + machine: macbook + path: dora-distil-whisper + inputs: + input: dora-vad/audio + outputs: + - text + env: + TARGET_LANGUAGE: english + # For China + # USE_MODELSCOPE_HUB: true + + - id: dora-rerun + build: cargo build -p dora-rerun --release + _unstable_deploy: + machine: macbook + path: dora-rerun + inputs: + original_text: dora-distil-whisper/text diff --git a/node-hub/dora-cotracker/dora_cotracker/main.py b/node-hub/dora-cotracker/dora_cotracker/main.py index dcfaeb54..27bcbc74 100644 --- a/node-hub/dora-cotracker/dora_cotracker/main.py +++ b/node-hub/dora-cotracker/dora_cotracker/main.py @@ -25,6 +25,7 @@ class VideoTrackingNode: self.accept_new_points = True self.clicked_points = [] self.input_points = [] + self.input_masks = [] def mouse_callback(self, event, x, y, flags, param): if event == cv2.EVENT_LBUTTONDOWN: @@ -52,9 +53,9 @@ class VideoTrackingNode: # Track points pred_tracks, pred_visibility = self.model( video_chunk, + queries=queries, is_first_step=self.is_first_step, grid_size=0, - queries=queries, add_support_grid=False, ) self.is_first_step = False @@ -118,6 +119,8 @@ class VideoTrackingNode: "num_points": len(visible_tracks), "dtype": "float32", "shape": 
+                "width": frame.shape[1],
+                "height": frame.shape[0],
             },
         )
@@ -153,7 +156,7 @@
                 cv2.imshow("Interactive Feed to track point", display_frame)
                 cv2.waitKey(1)
 
-            if event["id"] == "points":
+            elif event["id"] == "points":
                 if not self.accept_new_points:
                     continue
                 # Handle points from input_stream node
@@ -162,9 +165,13 @@
                 self.input_points = points_array.reshape((-1, 2)).tolist()
                 self.accept_new_points = False
                 self.is_first_step = True
-            if event["id"] == "boxes2d":
+            elif event["id"] == "boxes2d":
                 if not self.accept_new_points:
                     continue
+                if len(event["value"]) == 0:
+                    self.input_points = []
+                    self.is_first_step = True
+                    continue
 
                 # Handle points from input_stream node
                 metadata = event["metadata"]
@@ -185,7 +192,11 @@
                 _labels = None
 
                 self.input_points = [
-                    [int((x_min + x_max) / 2), int((y_min + y_max) / 2)]
+                    [
+                        int(x_min + (x_max - x_min) * 2 / 4),
+                        int(y_min + (y_max - y_min) * i / 10),
+                    ]
+                    for i in range(4, 7)
                     for x_min, y_min, x_max, y_max in boxes2d
                 ]
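
Usage note: a minimal launch sketch for the distributed dataflow above. It
assumes dora's unstable deploy mode with one coordinator plus one daemon per
machine ID referenced under _unstable_deploy (encoder, gpu, macbook); the
coordinator address is a placeholder, and flag spellings may differ between
dora versions (check dora daemon --help).

    # On one host, start the coordinator
    dora coordinator

    # On each machine, start a daemon with the matching machine ID
    dora daemon --machine-id encoder --coordinator-addr <coordinator-ip>
    dora daemon --machine-id gpu --coordinator-addr <coordinator-ip>
    dora daemon --machine-id macbook --coordinator-addr <coordinator-ip>

    # From examples/reachy2-remote, build and start the dataflow
    dora build dataflow_reachy.yml
    dora start dataflow_reachy.yml --coordinator-addr <coordinator-ip>

whisper-dev.yml can be started the same way to test the microphone ->
VAD -> Whisper chain on the macbook machine alone.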