@@ -0,0 +1,63 @@
| """TODO: Add docstring.""" | |||
import json
import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
    """Extract bounding boxes and labels from a JSON string wrapped in markdown markers.

    Parameters
    ----------
    json_text : str
        JSON string containing bounding box data, possibly including ```json markers.

    Returns
    -------
    tuple
        ``(bboxes, labels)`` as NumPy arrays, where each bounding box is an
        ``[x1, y1, x2, y2]`` row, or ``(None, None)`` if parsing fails.

    """
    # Strip surrounding whitespace and split into lines
    lines = json_text.strip().splitlines()
    # Drop markdown code-fence markers such as ```json
    clean_lines = [line for line in lines if not line.strip().startswith("```")]
    # Rejoin the remaining lines into a single JSON string
    clean_text = "\n".join(clean_lines)
    try:
        data = json.loads(clean_text)
        # Each item is expected to carry a "bbox_2d" box and a "label"
        bboxes = [item["bbox_2d"] for item in data]
        labels = [item["label"] for item in data]
        return np.array(bboxes), np.array(labels)
    except (json.JSONDecodeError, KeyError, TypeError):
        # Malformed or unexpected model output: signal failure to the caller
        return None, None
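
# Example (hypothetical model output; the fenced markers are exactly what
# extract_bboxes strips):
#
#     text = '```json\n[{"bbox_2d": [10, 20, 60, 40], "label": "eye"}]\n```'
#     bboxes, labels = extract_bboxes(text)
#     # bboxes -> array([[10, 20, 60, 40]]), labels -> array(['eye'])
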
for event in node:
    if event["type"] == "INPUT":
        text = event["value"][0].as_py()
        image_id = event["metadata"]["image_id"]
        bboxes, _labels = extract_bboxes(text)
        if bboxes is not None and len(bboxes) > 0:
            # Boxes were predicted on the resized image, so scale them back to
            # the original resolution. Dividing by the ratio (rather than
            # multiplying by int(1 / ratio)) avoids truncation for ratios such
            # as 0.75.
            bboxes = (bboxes / IMAGE_RESIZE_RATIO).astype(np.int32)
            node.send_output(
                "bbox",
                pa.array(bboxes.ravel()),
                metadata={"encoding": "xyxy", "image_id": image_id},
            )
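
# Since the boxes are flattened with ravel() before sending, a downstream
# consumer has to restore the (N, 4) shape. A minimal sketch, assuming a node
# (here called consumer_node) wired to receive this "bbox" output:
#
#     for event in consumer_node:
#         if event["type"] == "INPUT" and event["id"] == "bbox":
#             boxes = event["value"].to_numpy().reshape((-1, 4))  # xyxy rows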

@@ -0,0 +1,67 @@
nodes:
  - id: camera
    build: pip install -e ../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: "0"
      ENCODING: "rgb8"
      IMAGE_WIDTH: "640"
      IMAGE_HEIGHT: "480"
  - id: dora-qwenvl
    build: pip install -e ../../node-hub/dora-qwen2-5-vl
    path: dora-qwen2-5-vl
    inputs:
      image: camera/image
      text_1: dora/timer/millis/600
    outputs:
      - text
    env:
      DEFAULT_QUESTION: Output the bounding box of the eyes.
      IMAGE_RESIZE_RATIO: "0.5"
      # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have
      # SYSTEM_PROMPT: You're a robot.
  - id: parse_bbox
    path: parse_bbox.py
    inputs:
      text: dora-qwenvl/text
    outputs:
      - bbox
    env:
      # Must match IMAGE_RESIZE_RATIO of the dora-qwenvl node above so that
      # boxes are scaled back to the full-resolution image.
      IMAGE_RESIZE_RATIO: "0.5"
  - id: tracker
    build: pip install -e ../../node-hub/dora-cotracker
    path: dora-cotracker
    inputs:
      image: camera/image
      boxes2d: parse_bbox/bbox
      # points_to_track: input/points_to_track  # uncomment if seeding points from another node
    outputs:
      - tracked_image
      - points
    env:
      # Disable interactive point picking; tracks are seeded from boxes2d.
      INTERACTIVE_MODE: "false"
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      image: camera/image
      boxes2d: parse_bbox/bbox
      tracked_image: tracker/tracked_image
  # Optional: replace the boxes2d input with a node of your own that outputs
  # points to track (e.g., a YOLO detector or pose estimator). Uncomment the
  # block below and the points_to_track input on the tracker to use it.
  # - id: point_source
  #   build: pip install your-node  # replace with your node's package name
  #   path: your-point-source-node  # replace with your node's path
  #   inputs:
  #     image: camera/image  # if your node needs the camera image
  #   outputs:
  #     - points_to_track  # must output points in the required format
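
# A typical way to launch this dataflow (hypothetical file name; check your
# dora-rs version for the exact CLI commands):
#   dora build demo.yml
#   dora run demo.yml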

@@ -22,6 +22,7 @@ class VideoTrackingNode:
         self.buffer_size = self.model.step * 2
         self.window_frames = deque(maxlen=self.buffer_size)
         self.is_first_step = True
+        self.accept_new_points = True
         self.clicked_points = []
         self.input_points = []
@@ -59,6 +60,7 @@ class VideoTrackingNode:
             self.is_first_step = False
         if pred_tracks is not None and pred_visibility is not None:
+            self.accept_new_points = True
             tracks = pred_tracks[0, -1].cpu().numpy()
             visibility = pred_visibility[0, -1].cpu().numpy()
             visible_tracks = []
@@ -152,25 +154,29 @@ class VideoTrackingNode:
                     cv2.waitKey(1)
                 if event["id"] == "points":
+                    if not self.accept_new_points:
+                        continue
                     # Handle points from input_stream node
                     metadata = event["metadata"]
                     points_array = event["value"].to_numpy()
                     self.input_points = points_array.reshape((-1, 2)).tolist()
+                    self.accept_new_points = False
                     self.is_first_step = True
                 if event["id"] == "boxes2d":
-                    if not self.is_first_step:
+                    if not self.accept_new_points:
                         continue
                     # Handle points from input_stream node
                     metadata = event["metadata"]
                     if isinstance(event["value"], pa.StructArray):
                         boxes2d = (
-                            event["value"][0]
+                            event["value"]
                             .get("bbox")
                             .values.to_numpy()
                             .reshape((-1, 4))
                         )
                         _labels = (
-                            event["value"][0]
+                            event["value"]
                             .get("labels")
                             .values.to_numpy(zero_copy_only=False)
                         )
@@ -184,6 +190,7 @@ class VideoTrackingNode:
                     ]
                     self.is_first_step = True
+                    self.accept_new_points = False

 def main():
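
# Taken together, the hunks above implement a simple seeding handshake (a
# sketch of the intended states, inferred from this diff):
#
#   accept_new_points == True   -> a "points" or "boxes2d" input may (re)seed
#                                  the tracker; seeding sets it to False and
#                                  sets is_first_step = True
#   accept_new_points == False  -> further seed inputs are ignored until the
#                                  model returns pred_tracks/pred_visibility,
#                                  which flips it back to True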