Minor fix on cotracker as well as adding support for tracking bounding box center point
@@ -0,0 +1,51 @@
+nodes:
+  - id: camera
+    build: pip install -e ../../node-hub/opencv-video-capture
+    path: opencv-video-capture
+    inputs:
+      tick: dora/timer/millis/100
+    outputs:
+      - image
+    env:
+      CAPTURE_PATH: "0"
+      ENCODING: "rgb8"
+      IMAGE_WIDTH: "640"
+      IMAGE_HEIGHT: "480"
+
+  - id: object-detection
+    build: pip install -e ../../node-hub/dora-yolo
+    path: dora-yolo
+    inputs:
+      image: camera/image
+    outputs:
+      - bbox
+
+  - id: tracker
+    build: pip install -e ../../node-hub/dora-cotracker
+    path: dora-cotracker
+    inputs:
+      image: camera/image
+      boxes2d: object-detection/bbox
+      # points: point_source/points_to_track  # uncomment if feeding points from the point_source node below
+    outputs:
+      - tracked_image
+      - points
+    env:
+      INTERACTIVE_MODE: "false"
+
+  - id: plot
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    inputs:
+      image: camera/image
+      tracked_image: tracker/tracked_image
+
+  # Replace with your own node that outputs tracking points
+  # (e.g., a YOLO detector, pose estimator, etc.); see the sketch after this file.
+  # - id: point_source
+  #   build: pip install your-node  # Replace with your node's name
+  #   path: your-point-source-node  # Replace with your node's path
+  #   inputs:
+  #     image: camera/image  # If your node needs image input
+  #   outputs:
+  #     - points_to_track  # Must output points in the required format
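For the commented-out point_source node above, a minimal sketch of what such a node could look like is shown below. It is not part of this PR: the node, its fixed pixel coordinates, and its wiring are illustrative assumptions. What is grounded in the PR is the output format: a flat float array of (x, y) pairs plus num_points in the metadata, which the tracker's points handler in main.py reshapes with reshape((-1, 2)).

"""Hypothetical point_source node — a sketch, not part of this PR."""
import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

for event in node:
    if event["type"] == "INPUT" and event["id"] == "image":
        # Made-up example: ask the tracker to follow two fixed pixel
        # locations in the 640x480 frame configured above.
        points = np.array([[320.0, 240.0], [100.0, 100.0]], dtype=np.float32)
        node.send_output(
            "points_to_track",
            pa.array(points.ravel()),
            metadata={"num_points": len(points)},
        )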
@@ -0,0 +1,63 @@
+"""Parse bounding boxes from Qwen2.5-VL text output and forward them to the tracker."""
+
+import json
+import os
+
+import numpy as np
+import pyarrow as pa
+from dora import Node
+
+node = Node()
+
+IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
+
+
+def extract_bboxes(json_text):
+    """Extract bounding boxes and labels from a JSON string wrapped in markdown markers.
+
+    Parameters
+    ----------
+    json_text : str
+        JSON string containing bounding box data, possibly wrapped in ```json markers.
+
+    Returns
+    -------
+    tuple
+        (np.ndarray of bounding boxes, np.ndarray of labels), or (None, None)
+        if the text cannot be parsed.
+
+    """
+    # Strip whitespace and drop markdown fence lines
+    lines = json_text.strip().splitlines()
+    clean_lines = [line for line in lines if not line.strip().startswith("```")]
+    clean_text = "\n".join(clean_lines)
+    try:
+        data = json.loads(clean_text)
+        bboxes = [item["bbox_2d"] for item in data]
+        labels = [item["label"] for item in data]
+        return np.array(bboxes), np.array(labels)
+    except Exception as _e:  # noqa - the model may emit malformed JSON; skip this frame
+        pass
+    return None, None
+
+
+for event in node:
+    if event["type"] == "INPUT":
+        text = event["value"][0].as_py()
+        image_id = event["metadata"]["image_id"]
+        bboxes, _labels = extract_bboxes(text)
+        if bboxes is not None and len(bboxes) > 0:
+            # Map coordinates from the resized image back to the original
+            # resolution. (Dividing by the ratio avoids the precision loss of
+            # int(1 / IMAGE_RESIZE_RATIO) for ratios that don't divide 1 evenly.)
+            bboxes = (bboxes / IMAGE_RESIZE_RATIO).round().astype(np.int64)
+            node.send_output(
+                "bbox",
+                pa.array(bboxes.ravel()),
+                metadata={"encoding": "xyxy", "image_id": image_id},
+            )
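As a usage illustration (not part of the PR), extract_bboxes is meant to digest fenced JSON of the kind Qwen2.5-VL emits for grounding prompts. The sample payload here is made up:

sample = """```json
[
    {"bbox_2d": [100, 50, 180, 90], "label": "left eye"},
    {"bbox_2d": [220, 50, 300, 90], "label": "right eye"}
]
```"""

bboxes, labels = extract_bboxes(sample)
# bboxes -> [[100  50 180  90]
#            [220  50 300  90]]
# labels -> ['left eye' 'right eye']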
@@ -0,0 +1,67 @@
+nodes:
+  - id: camera
+    build: pip install -e ../../node-hub/opencv-video-capture
+    path: opencv-video-capture
+    inputs:
+      tick: dora/timer/millis/100
+    outputs:
+      - image
+    env:
+      CAPTURE_PATH: "0"
+      ENCODING: "rgb8"
+      IMAGE_WIDTH: "640"
+      IMAGE_HEIGHT: "480"
+
+  - id: dora-qwenvl
+    build: pip install -e ../../node-hub/dora-qwen2-5-vl
+    path: dora-qwen2-5-vl
+    inputs:
+      image: camera/image
+      text_1: dora/timer/millis/600
+    outputs:
+      - text
+    env:
+      DEFAULT_QUESTION: Output the bounding box of the eyes.
+      IMAGE_RESIZE_RATIO: "0.5"
+      # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have
+      # SYSTEM_PROMPT: You're a robot.
+
+  - id: parse_bbox
+    path: parse_bbox.py
+    inputs:
+      text: dora-qwenvl/text
+    outputs:
+      - bbox
+    env:
+      IMAGE_RESIZE_RATIO: "0.5"
+
+  - id: tracker
+    build: pip install -e ../../node-hub/dora-cotracker
+    path: dora-cotracker
+    inputs:
+      image: camera/image
+      boxes2d: parse_bbox/bbox
+      # points: point_source/points_to_track  # uncomment if feeding points from the point_source node below
+    outputs:
+      - tracked_image
+      - points
+    env:
+      INTERACTIVE_MODE: "false"
+
+  - id: plot
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    inputs:
+      image: camera/image
+      boxes2d: parse_bbox/bbox
+      tracked_image: tracker/tracked_image
+
+  # Replace with your own node that outputs tracking points
+  # (e.g., a YOLO detector, pose estimator, etc.)
+  # - id: point_source
+  #   build: pip install your-node  # Replace with your node's name
+  #   path: your-point-source-node  # Replace with your node's path
+  #   inputs:
+  #     image: camera/image  # If your node needs image input
+  #   outputs:
+  #     - points_to_track  # Must output points in the required format
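Both dora-qwenvl and parse_bbox are configured with IMAGE_RESIZE_RATIO: "0.5", so the model detects boxes on a 320x240 downscale and parse_bbox maps them back into the 640x480 camera frame before the tracker consumes them. A quick sanity check of that round trip (the box values are made up):

IMAGE_RESIZE_RATIO = 0.5
bbox_resized = [160, 120, 240, 180]  # xyxy box detected on the 320x240 image
bbox_original = [round(v / IMAGE_RESIZE_RATIO) for v in bbox_resized]
assert bbox_original == [320, 240, 480, 360]  # back in 640x480 camera coordinates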
@@ -13,14 +13,14 @@ nodes:
       IMAGE_HEIGHT: "480"
   - id: tracker
-    build: pip install dora-cotracker
+    build: pip install -e .
     path: dora-cotracker
     inputs:
       image: camera/image
       # points_to_track: input/points_to_track # uncomment this if using input node
     outputs:
       - tracked_image
-      - tracked_points
+      - points
   - id: plot
     build: pip install dora-rerun
@@ -29,8 +29,7 @@ nodes:
     image: camera/image
     tracked_image: tracker/tracked_image
-# replace with your own node that outputs tracking points # uncomment if input via node
 # replace with your own node that outputs tracking points # uncomment if input via node
 # (e.g., YOLO detector, pose estimator, etc.)
 # - id: point_source
 #   build: pip install your-node # Replace with your node's name
@@ -38,4 +37,4 @@ nodes:
 #   inputs:
 #     image: camera/image # If your node needs image input
 #   outputs:
-#     - points_to_track # Must output points in required format
+#     - points_to_track # Must output points in required format
@@ -1,9 +1,14 @@
 import os
+from collections import deque
+
+import cv2
 import numpy as np
 import pyarrow as pa
-from dora import Node
-import cv2
 import torch
-from collections import deque
 from dora import Node
+
+INTERACTIVE_MODE = os.getenv("INTERACTIVE_MODE", "false").lower() == "true"
+
+
 class VideoTrackingNode:
     def __init__(self):
@@ -12,10 +17,12 @@ class VideoTrackingNode:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = torch.hub.load("facebookresearch/co-tracker", "cotracker3_online")
         self.model = self.model.to(self.device)
+        self.model.eval()
         self.model.step = 8
-        self.buffer_size = self.model.step * 2
+        self.buffer_size = self.model.step * 2
         self.window_frames = deque(maxlen=self.buffer_size)
         self.is_first_step = True
+        self.accept_new_points = True
         self.clicked_points = []
         self.input_points = []
@@ -29,14 +36,12 @@ class VideoTrackingNode:
         """Process frame for tracking"""
         if len(self.window_frames) == self.buffer_size:
             all_points = self.input_points + self.clicked_points
             if not all_points:
-                print("No points to track")
                 return None, None
             video_chunk = torch.tensor(
-                np.stack(list(self.window_frames)),
-                device=self.device
+                np.stack(list(self.window_frames)), device=self.device
             ).float()
             video_chunk = video_chunk / 255.0
             # Reshape to [B,T,C,H,W]
@@ -50,11 +55,12 @@ class VideoTrackingNode:
                 is_first_step=self.is_first_step,
                 grid_size=0,
                 queries=queries,
-                add_support_grid=False
+                add_support_grid=False,
             )
             self.is_first_step = False
             if pred_tracks is not None and pred_visibility is not None:
+                self.accept_new_points = True
                 tracks = pred_tracks[0, -1].cpu().numpy()
                 visibility = pred_visibility[0, -1].cpu().numpy()
                 visible_tracks = []
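For context on the numbers above (an editorial note, not stated in the PR): the online CoTracker model consumes the frame buffer in overlapping chunks, and with model.step = 8 the deque holds step * 2 = 16 frames, so the first tracks can only appear once the buffer has filled. A rough sketch of that warm-up arithmetic, assuming the 100 ms camera tick used in the demo graphs:

step = 8                      # model.step set in __init__
buffer_size = step * 2        # 16 frames kept in the deque
camera_period_ms = 100        # dora/timer/millis/100 in the demo graphs
warmup_ms = buffer_size * camera_period_ms
assert warmup_ms == 1600      # ~1.6 s of frames before the first prediction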
@@ -66,84 +72,131 @@ class VideoTrackingNode:
             frame_viz = frame.copy()
             num_input_stream = len(self.input_points)
             # Draw input points in green
-            for i, (pt, vis) in enumerate(zip(tracks[:num_input_stream], visibility[:num_input_stream])):
+            for i, (pt, vis) in enumerate(
+                zip(tracks[:num_input_stream], visibility[:num_input_stream])
+            ):
                 if vis > 0.5:
                     x, y = int(pt[0]), int(pt[1])
-                    cv2.circle(frame_viz, (x, y), radius=3,
-                               color=(0, 255, 0), thickness=-1)
-                    cv2.putText(frame_viz, f"I{i}", (x + 5, y - 5),
-                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
+                    cv2.circle(
+                        frame_viz, (x, y), radius=3, color=(0, 255, 0), thickness=-1
+                    )
+                    cv2.putText(
+                        frame_viz,
+                        f"I{i}",
+                        (x + 5, y - 5),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        0.5,
+                        (0, 255, 0),
+                        1,
+                    )
             # Draw clicked points in red
-            for i, (pt, vis) in enumerate(zip(tracks[num_input_stream:], visibility[num_input_stream:])):
+            for i, (pt, vis) in enumerate(
+                zip(tracks[num_input_stream:], visibility[num_input_stream:])
+            ):
                 if vis > 0.5:
                     x, y = int(pt[0]), int(pt[1])
-                    cv2.circle(frame_viz, (x, y), radius=3,
-                               color=(0, 0, 255), thickness=-1)
-                    cv2.putText(frame_viz, f"C{i}", (x + 5, y - 5),
-                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
+                    cv2.circle(
+                        frame_viz, (x, y), radius=3, color=(0, 0, 255), thickness=-1
+                    )
+                    cv2.putText(
+                        frame_viz,
+                        f"C{i}",
+                        (x + 5, y - 5),
+                        cv2.FONT_HERSHEY_SIMPLEX,
+                        0.5,
+                        (0, 0, 255),
+                        1,
+                    )
             # Send tracked points
             if len(visible_tracks) > 0:
                 self.node.send_output(
-                    "tracked_points",
+                    "points",
                     pa.array(visible_tracks.ravel()),
                     {
                         "num_points": len(visible_tracks),
                         "dtype": "float32",
-                        "shape": (len(visible_tracks), 2)
-                    }
+                        "shape": (len(visible_tracks), 2),
+                    },
                 )
             return frame, frame_viz
         return None, None

     def run(self):
         """Main run loop"""
-        cv2.namedWindow("Raw Feed", cv2.WINDOW_NORMAL)
-        cv2.setMouseCallback("Raw Feed", self.mouse_callback)
+        if INTERACTIVE_MODE:
+            cv2.namedWindow("Interactive Feed to track point", cv2.WINDOW_NORMAL)
+            cv2.setMouseCallback("Interactive Feed to track point", self.mouse_callback)
         for event in self.node:
             if event["type"] == "INPUT":
                 if event["id"] == "image":
                     metadata = event["metadata"]
-                    frame = event["value"].to_numpy().reshape((
-                        metadata["height"],
-                        metadata["width"],
-                        3
-                    ))
+                    frame = (
+                        event["value"]
+                        .to_numpy()
+                        .reshape((metadata["height"], metadata["width"], 3))
+                    )
                     # Add frame to tracking window
                     self.window_frames.append(frame)
                     original_frame, tracked_frame = self.process_tracking(frame)
                     if original_frame is not None and tracked_frame is not None:
-                        self.node.send_output("image",
-                            pa.array(original_frame.ravel()),
-                            metadata
-                        )
-                        self.node.send_output("tracked_image",
-                            pa.array(tracked_frame.ravel()),
-                            metadata
+                        self.node.send_output(
+                            "tracked_image", pa.array(tracked_frame.ravel()), metadata
                         )
-                    display_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                    cv2.imshow("Raw Feed", display_frame)
-                    cv2.waitKey(1)
-                if event["id"] == "points_to_track":
+                    if INTERACTIVE_MODE:
+                        display_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                        cv2.imshow("Interactive Feed to track point", display_frame)
+                        cv2.waitKey(1)
+                if event["id"] == "points":
+                    if not self.accept_new_points:
+                        continue
                     # Handle points from input_stream node
                     metadata = event["metadata"]
                     points_array = event["value"].to_numpy()
                     num_points = metadata["num_points"]
-                    self.input_points = points_array.reshape((num_points, 2)).tolist()
+                    self.input_points = points_array.reshape((-1, 2)).tolist()
+                    self.accept_new_points = False
                     self.is_first_step = True
                     print(f"Received {num_points} points from input_stream")
+                if event["id"] == "boxes2d":
+                    if not self.accept_new_points:
+                        continue
+                    # Handle bounding boxes: track the center point of each box
+                    metadata = event["metadata"]
+                    if isinstance(event["value"], pa.StructArray):
+                        boxes2d = (
+                            event["value"]
+                            .get("bbox")
+                            .values.to_numpy()
+                            .reshape((-1, 4))
+                        )
+                        _labels = (
+                            event["value"]
+                            .get("labels")
+                            .values.to_numpy(zero_copy_only=False)
+                        )
+                    else:
+                        boxes2d = event["value"].to_numpy().reshape((-1, 4))
+                        _labels = None
+                    self.input_points = [
+                        [int((x_min + x_max) / 2), int((y_min + y_max) / 2)]
+                        for x_min, y_min, x_max, y_max in boxes2d
+                    ]
+                    self.is_first_step = True
+                    self.accept_new_points = False


 def main():
     tracker = VideoTrackingNode()
     tracker.run()


 if __name__ == "__main__":
-    main()
+    main()
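Downstream of the tracker, the renamed points output arrives as a flat float array with num_points, dtype, and shape in the metadata (see the send_output call above). A minimal sketch of a consumer is shown below; it is not part of the PR and assumes a node whose graph entry maps an input named points to tracker/points:

"""Hypothetical consumer of tracker/points — a sketch, not part of this PR."""
from dora import Node

node = Node()

for event in node:
    if event["type"] == "INPUT" and event["id"] == "points":
        meta = event["metadata"]
        pts = event["value"].to_numpy().reshape((meta["num_points"], 2))
        # Each row is the current (x, y) of a tracked point,
        # e.g. the center of a detected bounding box.
        print(pts)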
@@ -1,11 +1,9 @@
 [project]
 name = "dora-cotracker"
 version = "0.1.0"
-authors = [
-    { name = "Shashwat Patil", email = "shashwatpatil974@gmail.com" }
-]
+authors = [{ name = "Shashwat Patil", email = "shashwatpatil974@gmail.com" }]
 description = "A Dora node implementing real-time object tracking using Facebook's CoTracker model"
-license = { text = "MIT" }
+license = "CC-BY-1.0"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -26,9 +24,9 @@ dora-cotracker = "dora_cotracker.main:main"
 [tool.ruff.lint]
 extend-select = [
-  "PERF", # Performance
-  "RET",  # Return statements
-  "RSE",  # Runtime errors
-  "NPY",  # NumPy
-  "N",    # Naming
+  "PERF",  # Performance
+  "RET",   # Return statements
+  "RSE",   # Runtime errors
+  "NPY",   # NumPy
+  "N",     # Naming
 ]