From 87c7df5838f8e3152f26111f835dfd4ce1b6ea59 Mon Sep 17 00:00:00 2001
From: haixuanTao
Date: Wed, 9 Apr 2025 15:14:32 +0200
Subject: [PATCH] Adding example dataflow

---
 examples/reachy2-remote/dataflow_reachy.yml | 180 ++++++++++++++++++
 examples/reachy2-remote/parse_bbox.py       |  66 +++++++
 examples/reachy2-remote/parse_point.py      |  47 +++++
 examples/reachy2-remote/parse_whisper.py    |  75 ++++++++
 examples/reachy2-remote/whisper-dev.yml     |  42 ++++
 .../dora-cotracker/dora_cotracker/main.py   |  19 +-
 6 files changed, 425 insertions(+), 4 deletions(-)
 create mode 100644 examples/reachy2-remote/dataflow_reachy.yml
 create mode 100644 examples/reachy2-remote/parse_bbox.py
 create mode 100644 examples/reachy2-remote/parse_point.py
 create mode 100644 examples/reachy2-remote/parse_whisper.py
 create mode 100644 examples/reachy2-remote/whisper-dev.yml

diff --git a/examples/reachy2-remote/dataflow_reachy.yml b/examples/reachy2-remote/dataflow_reachy.yml
new file mode 100644
index 00000000..aa5574bb
--- /dev/null
+++ b/examples/reachy2-remote/dataflow_reachy.yml
@@ -0,0 +1,180 @@
+nodes:
+  - id: camera
+    path: dora-reachy2-camera
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      tick: dora/timer/millis/10
+    outputs:
+      - image_left
+      - image_depth
+      - depth
+    env:
+      CAPTURE_PATH: 0
+      IMAGE_WIDTH: 640
+      IMAGE_HEIGHT: 480
+      ROBOT_IP: 127.0.0.1
+
+  - id: rav1e-local-image
+    path: dora-rav1e
+    build: cargo build -p dora-rav1e --release
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      image_depth: camera/image_depth
+      image_left: camera/image_left
+    outputs:
+      - image_left
+      - image_depth
+      - depth
+    env:
+      RAV1E_SPEED: 10
+
+  - id: dav1d-remote
+    path: dora-dav1d
+    build: cargo build -p dora-dav1d --release
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      image_depth: rav1e-local-image/image_depth
+      image_left: rav1e-local-image/image_left
+      # depth: rav1e-local-image/depth
+    outputs:
+      - image_left
+      - image_depth
+      - depth
+
+  - id: dora-microphone
+    build: pip install -e ../../node-hub/dora-microphone
+    path: dora-microphone
+    _unstable_deploy:
+      machine: macbook
+    inputs:
+      tick: dora/timer/millis/2000
+    outputs:
+      - audio
+
+  - id: dora-vad
+    build: pip install -e ../../node-hub/dora-vad
+    _unstable_deploy:
+      machine: macbook
+    path: dora-vad
+    inputs:
+      audio: dora-microphone/audio
+    outputs:
+      - audio
+
+  - id: dora-distil-whisper
+    build: pip install -e ../../node-hub/dora-distil-whisper
+    _unstable_deploy:
+      machine: macbook
+    path: dora-distil-whisper
+    inputs:
+      input: dora-vad/audio
+    outputs:
+      - text
+    env:
+      TARGET_LANGUAGE: english
+
+  - id: parse_whisper
+    path: parse_whisper.py
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      text: dora-distil-whisper/text
+    outputs:
+      - bbox
+      - action
+      - points
+      - text
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: dora-qwenvl
+    build: pip install -e ../../node-hub/dora-qwen2-5-vl
+    path: dora-qwen2-5-vl
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      image_left: dav1d-remote/image_left
+      text: parse_whisper/text
+    outputs:
+      - text
+    env:
+      DEFAULT_QUESTION: Output the bounding box of the suitcase.
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: parse_bbox
+    path: parse_bbox.py
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      text: dora-qwenvl/text
+      points: parse_whisper/points
+    outputs:
+      - bbox
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: tracker
+    build: pip install -e ../../node-hub/dora-cotracker
+    path: dora-cotracker
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      image: dav1d-remote/image_left
+      boxes2d: parse_bbox/bbox
+    outputs:
+      - tracked_image
+      - points
+    env:
+      INTERACTIVE_MODE: false
+
+  # - id: sam2
+  #   build: pip install -e ../../node-hub/dora-sam2
+  #   path: dora-sam2
+  #   _unstable_deploy:
+  #     machine: gpu
+  #   inputs:
+  #     image_left: dav1d-remote/image_left
+  #     boxes2d: parse_bbox/bbox
+  #   outputs:
+  #     - masks
+
+  - id: parse_point
+    path: parse_point.py
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      points: tracker/points
+    outputs:
+      - action
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: reachy-mobile-base
+    build: pip install -e ../../node-hub/dora-reachy2
+    path: dora-reachy2-mobile-base
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      action_base: parse_point/action
+      action_whisper: parse_whisper/action
+    outputs:
+      - response_base
+    env:
+      ROBOT_IP: 127.0.0.1
+
+  - id: plot
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    _unstable_deploy:
+      machine: macbook
+    inputs:
+      image: dav1d-remote/image_left
+      image_depth: dav1d-remote/image_depth
+      boxes2d: parse_bbox/bbox
+      original_text: dora-distil-whisper/text
+      parsed_text: parse_whisper/text
+      qwenvl_text: dora-qwenvl/text
+      tracked_image: tracker/tracked_image
diff --git a/examples/reachy2-remote/parse_bbox.py b/examples/reachy2-remote/parse_bbox.py
new file mode 100644
index 00000000..09bca4e7
--- /dev/null
+++ b/examples/reachy2-remote/parse_bbox.py
@@ -0,0 +1,66 @@
+"""Parse bounding boxes out of Qwen2.5-VL text replies and forward them as flat xyxy arrays."""
+
+import json
+import os
+
+import numpy as np
+import pyarrow as pa
+from dora import Node
+
+node = Node()
+
+IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
+
+
+def extract_bboxes(json_text):
+    """Extract bounding boxes and labels from a JSON string with markdown markers.
+
+    Parameters
+    ----------
+    json_text : str
+        JSON string containing bounding box data, including ```json markers.
+
+    Returns
+    -------
+    tuple of np.ndarray
+        Bounding boxes and their labels, or (None, None) if parsing fails.
+ + """ + # Ensure all lines are stripped of whitespace and markers + lines = json_text.strip().splitlines() + + # Filter out lines that are markdown markers + clean_lines = [line for line in lines if not line.strip().startswith("```")] + + # Join the lines back into a single string + clean_text = "\n".join(clean_lines) + # Parse the cleaned JSON text + try: + data = json.loads(clean_text) + + # Extract bounding boxes + bboxes = [item["bbox_2d"] for item in data] + labels = [item["label"] for item in data] + + return np.array(bboxes), np.array(labels) + except Exception as _e: # noqa + pass + return None, None + + +for event in node: + if event["type"] == "INPUT": + if len(event["value"]) == 0: + node.send_output("bbox", pa.array([])) + continue + + text = event["value"][0].as_py() + image_id = event["metadata"]["image_id"] + + bboxes, labels = extract_bboxes(text) + if bboxes is not None and len(bboxes) > 0: + bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO) + node.send_output( + "bbox", + pa.array(bboxes.ravel()), + metadata={"encoding": "xyxy", "image_id": image_id}, + ) diff --git a/examples/reachy2-remote/parse_point.py b/examples/reachy2-remote/parse_point.py new file mode 100644 index 00000000..0617f3d2 --- /dev/null +++ b/examples/reachy2-remote/parse_point.py @@ -0,0 +1,47 @@ +"""TODO: Add docstring.""" + +import json +import os + +import numpy as np +import pyarrow as pa +from dora import Node + +node = Node() + +IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0")) + + +for event in node: + if event["type"] == "INPUT": + text = event["value"][0].as_py() + width = event["metadata"]["width"] + height = event["metadata"]["height"] + values = event["value"].to_numpy().reshape((-1, 2)) + values = values * int(1 / IMAGE_RESIZE_RATIO) + + # Do point 0 first + if len(values) == 0: + print("No points detected") + continue + elif len(values) > 1: + print("Multiple points detected, taking the first one") + point = values[0] + + rz = int((width / 2) - point[0]) / (width / 2) + x_distance = min(height / 2, height - point[1]) + + if abs(rz) > 0.3: + rz = np.deg2rad(30) * np.sign(rz) + elif abs(rz) > 0.1: + rz = np.deg2rad(30) * np.sign(rz) + else: + x = 0 + + if x_distance > (height * 0.15): + x = 0.5 + else: + x = 0 + # Action + action = pa.array([x, 0, 0, 0, 0, rz]) + node.send_output("action", action) diff --git a/examples/reachy2-remote/parse_whisper.py b/examples/reachy2-remote/parse_whisper.py new file mode 100644 index 00000000..e91f4a45 --- /dev/null +++ b/examples/reachy2-remote/parse_whisper.py @@ -0,0 +1,75 @@ +"""TODO: Add docstring.""" + +import json +import os +import time + +import numpy as np +import pyarrow as pa +from dora import Node + +node = Node() + +IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0")) + + +def extract_bboxes(json_text): + """Extract bounding boxes from a JSON string with markdown markers and return them as a NumPy array. + + Parameters + ---------- + json_text : str + JSON string containing bounding box data, including ```json markers. + + Returns + ------- + np.ndarray: NumPy array of bounding boxes. 
+ + """ + # Ensure all lines are stripped of whitespace and markers + lines = json_text.strip().splitlines() + + # Filter out lines that are markdown markers + clean_lines = [line for line in lines if not line.strip().startswith("```")] + + # Join the lines back into a single string + clean_text = "\n".join(clean_lines) + # Parse the cleaned JSON text + try: + data = json.loads(clean_text) + + # Extract bounding boxes + bboxes = [item["bbox_2d"] for item in data] + labels = [item["label"] for item in data] + + return np.array(bboxes), np.array(labels) + except Exception as _e: # noqa + pass + return None, None + + +for event in node: + if event["type"] == "INPUT": + text = event["value"][0].as_py().lower() + + if "stop" in text: + node.send_output("points", pa.array([], type=pa.float64())) + elif "follow" in text: + text = f"Given the prompt: {text}. Output the bounding boxes for the given followed object" + node.send_output("text", pa.array([text]), {"image_id": "image_left"}) + elif "left" in text: + action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)]) + time.sleep(0.25) + action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)]) + time.sleep(0.25) + action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)]) + node.send_output("points", pa.array([])) + node.send_output("action", action) + elif "right" in text: + action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)]) + time.sleep(0.25) + action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)]) + time.sleep(0.25) + action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)]) + node.send_output("points", pa.array([])) + node.send_output("action", action) diff --git a/examples/reachy2-remote/whisper-dev.yml b/examples/reachy2-remote/whisper-dev.yml new file mode 100644 index 00000000..c52e52f4 --- /dev/null +++ b/examples/reachy2-remote/whisper-dev.yml @@ -0,0 +1,42 @@ +nodes: + - id: dora-microphone + build: pip install -e ../../node-hub/dora-microphone + path: dora-microphone + _unstable_deploy: + machine: macbook + inputs: + tick: dora/timer/millis/2000 + outputs: + - audio + + - id: dora-vad + build: pip install -e ../../node-hub/dora-vad + _unstable_deploy: + machine: macbook + path: dora-vad + inputs: + audio: dora-microphone/audio + outputs: + - audio + + - id: dora-distil-whisper + build: pip install -e ../../node-hub/dora-distil-whisper + _unstable_deploy: + machine: macbook + path: dora-distil-whisper + inputs: + input: dora-vad/audio + outputs: + - text + env: + TARGET_LANGUAGE: english + # For China + # USE_MODELSCOPE_HUB: true + + - id: dora-rerun + build: cargo build -p dora-rerun --release + _unstable_deploy: + machine: macbook + path: dora-rerun + inputs: + original_text: dora-distil-whisper/text diff --git a/node-hub/dora-cotracker/dora_cotracker/main.py b/node-hub/dora-cotracker/dora_cotracker/main.py index dcfaeb54..27bcbc74 100644 --- a/node-hub/dora-cotracker/dora_cotracker/main.py +++ b/node-hub/dora-cotracker/dora_cotracker/main.py @@ -25,6 +25,7 @@ class VideoTrackingNode: self.accept_new_points = True self.clicked_points = [] self.input_points = [] + self.input_masks = [] def mouse_callback(self, event, x, y, flags, param): if event == cv2.EVENT_LBUTTONDOWN: @@ -52,9 +53,9 @@ class VideoTrackingNode: # Track points pred_tracks, pred_visibility = self.model( video_chunk, + queries=queries, is_first_step=self.is_first_step, grid_size=0, - queries=queries, add_support_grid=False, ) self.is_first_step = False @@ -118,6 +119,8 @@ class VideoTrackingNode: "num_points": len(visible_tracks), "dtype": "float32", "shape": 
+                "width": frame.shape[1],
+                "height": frame.shape[0],
             },
         )
@@ -153,7 +156,7 @@
                 cv2.imshow("Interactive Feed to track point", display_frame)
                 cv2.waitKey(1)
 
-            if event["id"] == "points":
+            elif event["id"] == "points":
                 if not self.accept_new_points:
                     continue
                 # Handle points from input_stream node
@@ -162,9 +165,13 @@
                 self.input_points = points_array.reshape((-1, 2)).tolist()
                 self.accept_new_points = False
                 self.is_first_step = True
-            if event["id"] == "boxes2d":
+            elif event["id"] == "boxes2d":
                 if not self.accept_new_points:
                     continue
+                if len(event["value"]) == 0:
+                    self.input_points = []
+                    self.is_first_step = True
+                    continue
 
                 # Handle points from input_stream node
                 metadata = event["metadata"]
@@ -185,7 +192,11 @@
                 _labels = None
 
                 self.input_points = [
-                    [int((x_min + x_max) / 2), int((y_min + y_max) / 2)]
+                    [
+                        int(x_min + (x_max - x_min) * 2 / 4),
+                        int(y_min + (y_max - y_min) * i / 10),
+                    ]
+                    for i in range(4, 7)
                     for x_min, y_min, x_max, y_max in boxes2d
                 ]
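
Usage note: a minimal launch sketch for the distributed dataflow above. It
assumes dora's unstable deploy mode with one coordinator plus one daemon per
machine ID referenced under _unstable_deploy (encoder, gpu, macbook); the
coordinator address is a placeholder, and flag spellings may differ between
dora versions (check dora daemon --help).

    # On one host, start the coordinator
    dora coordinator

    # On each machine, start a daemon with the matching machine ID
    dora daemon --machine-id encoder --coordinator-addr <coordinator-ip>
    dora daemon --machine-id gpu --coordinator-addr <coordinator-ip>
    dora daemon --machine-id macbook --coordinator-addr <coordinator-ip>

    # From examples/reachy2-remote, build and start the dataflow
    dora build dataflow_reachy.yml
    dora start dataflow_reachy.yml --coordinator-addr <coordinator-ip>

whisper-dev.yml can be started the same way to test the microphone ->
VAD -> Whisper chain on the macbook machine alone.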