@@ -0,0 +1,180 @@
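# Distributed Reachy2 "follow" demo: the robot camera and AV1 encoding run on the
# "encoder" machine, VLM grounding and point tracking run on the "gpu" machine, and
# the audio pipeline plus visualization run on the "macbook" machine.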
nodes:
  - id: camera
    path: dora-reachy2-camera
    _unstable_deploy:
      machine: encoder
    inputs:
      tick: dora/timer/millis/10
    outputs:
      - image_left
      - image_depth
      - depth
    env:
      CAPTURE_PATH: 0
      IMAGE_WIDTH: 640
      IMAGE_HEIGHT: 480
      ROBOT_IP: 127.0.0.1
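  # Encode both camera streams to AV1 with rav1e before they cross machines;
  # RAV1E_SPEED: 10 favors encoding speed over compression density.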
  - id: rav1e-local-image
    path: dora-rav1e
    build: cargo build -p dora-rav1e --release
    _unstable_deploy:
      machine: encoder
    inputs:
      image_depth: camera/image_depth
      image_left: camera/image_left
    outputs:
      - image_left
      - image_depth
      - depth
    env:
      RAV1E_SPEED: 10
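  # Decode the AV1 streams with dav1d once they arrive on the GPU machine.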
  - id: dav1d-remote
    path: dora-dav1d
    build: cargo build -p dora-dav1d --release
    _unstable_deploy:
      machine: gpu
    inputs:
      image_depth: rav1e-local-image/image_depth
      image_left: rav1e-local-image/image_left
      # depth: rav1e-local-image/depth
    outputs:
      - image_left
      - image_depth
      - depth
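  # Speech pipeline: microphone -> voice activity detection -> distil-whisper transcription.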
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    _unstable_deploy:
      machine: macbook
    inputs:
      tick: dora/timer/millis/2000
    outputs:
      - audio

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    _unstable_deploy:
      machine: macbook
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio

  - id: dora-distil-whisper
    build: pip install -e ../../node-hub/dora-distil-whisper
    path: dora-distil-whisper
    _unstable_deploy:
      machine: macbook
    inputs:
      input: dora-vad/audio
    outputs:
      - text
    env:
      TARGET_LANGUAGE: english
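  # Route transcribed commands: "follow ..." becomes a VLM prompt; "left", "right",
  # and "stop" become direct base commands (see parse_whisper.py below).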
  - id: parse_whisper
    path: parse_whisper.py
    _unstable_deploy:
      machine: gpu
    inputs:
      text: dora-distil-whisper/text
    outputs:
      - bbox
      - action
      - points
      - text
    env:
      IMAGE_RESIZE_RATIO: "1.0"

  - id: dora-qwenvl
    build: pip install -e ../../node-hub/dora-qwen2-5-vl
    path: dora-qwen2-5-vl
    _unstable_deploy:
      machine: gpu
    inputs:
      image_left: dav1d-remote/image_left
      text: parse_whisper/text
    outputs:
      - text
    env:
      DEFAULT_QUESTION: Output the bounding box of the suitcase.
      IMAGE_RESIZE_RATIO: "1.0"
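  # Parse the VLM's JSON answer into flat xyxy bounding boxes (see parse_bbox.py below).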
  - id: parse_bbox
    path: parse_bbox.py
    _unstable_deploy:
      machine: gpu
    inputs:
      text: dora-qwenvl/text
      points: parse_whisper/points
    outputs:
      - bbox
    env:
      IMAGE_RESIZE_RATIO: "1.0"

  - id: tracker
    build: pip install -e ../../node-hub/dora-cotracker
    path: dora-cotracker
    _unstable_deploy:
      machine: gpu
    inputs:
      image: dav1d-remote/image_left
      boxes2d: parse_bbox/bbox
    outputs:
      - tracked_image
      - points
    env:
      INTERACTIVE_MODE: false
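  # Optional SAM2 segmentation stage, currently disabled.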
  # - id: sam2
  #   build: pip install -e ../../node-hub/dora-sam2
  #   path: dora-sam2
  #   _unstable_deploy:
  #     machine: gpu
  #   inputs:
  #     image_left: dav1d-remote/image_left
  #     boxes2d: parse_bbox/bbox
  #   outputs:
  #     - masks

  - id: parse_point
    path: parse_point.py
    _unstable_deploy:
      machine: gpu
    inputs:
      points: tracker/points
    outputs:
      - action
    env:
      IMAGE_RESIZE_RATIO: "1.0"
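  # The mobile base accepts velocity commands from both the tracker path (parse_point)
  # and direct voice commands (parse_whisper).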
  - id: reachy-mobile-base
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-mobile-base
    _unstable_deploy:
      machine: encoder
    inputs:
      action_base: parse_point/action
      action_whisper: parse_whisper/action
    outputs:
      - response_base
    env:
      ROBOT_IP: 127.0.0.1
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    _unstable_deploy:
      machine: macbook
    inputs:
      image: dav1d-remote/image_left
      image_depth: dav1d-remote/image_depth
      boxes2d: parse_bbox/bbox
      original_text: dora-distil-whisper/text
      parsed_text: parse_whisper/text
      qwenvl_text: dora-qwenvl/text
      tracked_image: tracker/tracked_image
@@ -0,0 +1,66 @@
"""Parse bounding boxes out of the VLM's JSON answer and forward them as flat xyxy arrays."""

import json
import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
    """Extract bounding boxes from a JSON string wrapped in markdown code fences.

    Parameters
    ----------
    json_text : str
        JSON string containing bounding box data, including ```json markers.

    Returns
    -------
    tuple
        A pair of NumPy arrays (bounding boxes, labels), or (None, None) if
        the text cannot be parsed.

    """
    # Strip whitespace and drop markdown code-fence markers
    lines = json_text.strip().splitlines()
    clean_lines = [line for line in lines if not line.strip().startswith("```")]
    clean_text = "\n".join(clean_lines)

    # Parse the cleaned JSON text
    try:
        data = json.loads(clean_text)
        bboxes = [item["bbox_2d"] for item in data]
        labels = [item["label"] for item in data]
        return np.array(bboxes), np.array(labels)
    except Exception as _e:  # noqa
        pass
    return None, None
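
# Example of the text this node parses (assumed shape of the Qwen2.5-VL grounding
# answer; only the "bbox_2d" and "label" keys are required by the code above):
#
# ```json
# [{"bbox_2d": [57, 112, 204, 301], "label": "suitcase"}]
# ```
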
for event in node:
    if event["type"] == "INPUT":
        # An empty input (e.g. the cleared "points" from parse_whisper) resets the boxes
        if len(event["value"]) == 0:
            node.send_output("bbox", pa.array([]))
            continue
        text = event["value"][0].as_py()
        image_id = event["metadata"]["image_id"]

        bboxes, labels = extract_bboxes(text)
        if bboxes is not None and len(bboxes) > 0:
            # Scale the boxes back up to the original image resolution
            bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO)
            node.send_output(
                "bbox",
                pa.array(bboxes.ravel()),
                metadata={"encoding": "xyxy", "image_id": image_id},
            )
@@ -0,0 +1,47 @@
"""Turn the tracked target point into velocity commands for the mobile base."""

import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
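# Action layout (assumed from the dora-reachy2 mobile-base node): a 6-vector
# [x, y, z, rx, ry, rz], with forward speed x in m/s and yaw rate rz in rad/s.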
for event in node:
    if event["type"] == "INPUT":
        width = event["metadata"]["width"]
        height = event["metadata"]["height"]

        # Points arrive as a flat [x0, y0, x1, y1, ...] array
        values = event["value"].to_numpy().reshape((-1, 2))
        values = values * int(1 / IMAGE_RESIZE_RATIO)

        if len(values) == 0:
            print("No points detected")
            continue
        elif len(values) > 1:
            print("Multiple points detected, taking the first one")
        point = values[0]

        # Normalized horizontal offset of the target from the image center, in [-1, 1]
        rz = int((width / 2) - point[0]) / (width / 2)
        # Vertical distance to the bottom of the frame, a proxy for distance to the target
        x_distance = min(height / 2, height - point[1])

        # Saturate the turn rate at 30 deg/s once the target is more than 10% off-center
        if abs(rz) > 0.1:
            rz = np.deg2rad(30) * np.sign(rz)

        # Drive forward only while the target is still far away
        x = 0.5 if x_distance > (height * 0.15) else 0.0

        action = pa.array([x, 0, 0, 0, 0, rz])
        node.send_output("action", action)
@@ -0,0 +1,75 @@
"""Route transcribed voice commands to the VLM, the tracker, and the mobile base."""

import json
import os
import time

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
    """Extract bounding boxes from a JSON string wrapped in markdown code fences.

    Parameters
    ----------
    json_text : str
        JSON string containing bounding box data, including ```json markers.

    Returns
    -------
    tuple
        A pair of NumPy arrays (bounding boxes, labels), or (None, None) if
        the text cannot be parsed.

    """
    # Strip whitespace and drop markdown code-fence markers
    lines = json_text.strip().splitlines()
    clean_lines = [line for line in lines if not line.strip().startswith("```")]
    clean_text = "\n".join(clean_lines)

    # Parse the cleaned JSON text
    try:
        data = json.loads(clean_text)
        bboxes = [item["bbox_2d"] for item in data]
        labels = [item["label"] for item in data]
        return np.array(bboxes), np.array(labels)
    except Exception as _e:  # noqa
        pass
    return None, None
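
# Recognized commands: "stop" clears the tracked points, "follow <object>" asks the
# VLM for a bounding box to track, and "left"/"right" rotate the base in place.
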
for event in node:
    if event["type"] == "INPUT":
        text = event["value"][0].as_py().lower()
        if "stop" in text:
            # Clear the tracked points so the base stops following
            node.send_output("points", pa.array([], type=pa.float64()))
        elif "follow" in text:
            text = f"Given the prompt: {text}. Output the bounding boxes for the given followed object"
            node.send_output("text", pa.array([text]), {"image_id": "image_left"})
        elif "left" in text:
            # Turn the base counter-clockwise and clear any tracked points
            action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)])
            time.sleep(0.5)
            node.send_output("points", pa.array([]))
            node.send_output("action", action)
        elif "right" in text:
            # Turn the base clockwise and clear any tracked points
            action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)])
            time.sleep(0.5)
            node.send_output("points", pa.array([]))
            node.send_output("action", action)
@@ -0,0 +1,42 @@
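# Minimal speech-to-text dataflow for testing the audio chain on its own:
# microphone -> VAD -> distil-whisper -> rerun visualization.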
nodes:
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    _unstable_deploy:
      machine: macbook
    inputs:
      tick: dora/timer/millis/2000
    outputs:
      - audio

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    _unstable_deploy:
      machine: macbook
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio

  - id: dora-distil-whisper
    build: pip install -e ../../node-hub/dora-distil-whisper
    path: dora-distil-whisper
    _unstable_deploy:
      machine: macbook
    inputs:
      input: dora-vad/audio
    outputs:
      - text
    env:
      TARGET_LANGUAGE: english
      # For China
      # USE_MODELSCOPE_HUB: true

  - id: dora-rerun
    build: cargo build -p dora-rerun --release
    path: dora-rerun
    _unstable_deploy:
      machine: macbook
    inputs:
      original_text: dora-distil-whisper/text
@@ -25,6 +25,7 @@ class VideoTrackingNode:
         self.accept_new_points = True
         self.clicked_points = []
         self.input_points = []
+        self.input_masks = []

     def mouse_callback(self, event, x, y, flags, param):
         if event == cv2.EVENT_LBUTTONDOWN:
@@ -52,9 +53,9 @@ class VideoTrackingNode:
             # Track points
             pred_tracks, pred_visibility = self.model(
                 video_chunk,
-                queries=queries,
                 is_first_step=self.is_first_step,
                 grid_size=0,
+                queries=queries,
                 add_support_grid=False,
             )
             self.is_first_step = False
@@ -118,6 +119,8 @@ class VideoTrackingNode:
                     "num_points": len(visible_tracks),
                     "dtype": "float32",
                     "shape": (len(visible_tracks), 2),
+                    "width": frame.shape[1],
+                    "height": frame.shape[0],
                 },
             )
@@ -153,7 +156,7 @@ class VideoTrackingNode:
                 cv2.imshow("Interactive Feed to track point", display_frame)
                 cv2.waitKey(1)

-            if event["id"] == "points":
+            elif event["id"] == "points":
                 if not self.accept_new_points:
                     continue
                 # Handle points from input_stream node
@@ -162,9 +165,13 @@ class VideoTrackingNode:
                 self.input_points = points_array.reshape((-1, 2)).tolist()
                 self.accept_new_points = False
                 self.is_first_step = True

-            if event["id"] == "boxes2d":
+            elif event["id"] == "boxes2d":
                 if not self.accept_new_points:
                     continue
+                if len(event["value"]) == 0:
+                    self.input_points = []
+                    self.is_first_step = True
+                    continue
                 # Handle points from input_stream node
                 metadata = event["metadata"]
@@ -185,7 +192,11 @@ class VideoTrackingNode:
                 _labels = None

                 self.input_points = [
-                    [int((x_min + x_max) / 2), int((y_min + y_max) / 2)]
+                    [
+                        int(x_min + (x_max - x_min) * 2 / 4),
+                        int(y_min + (y_max - y_min) * i / 10),
+                    ]
+                    for i in range(4, 7)
                     for x_min, y_min, x_max, y_max in boxes2d
                 ]