diff --git a/examples/tracker/facebook_cotracker.yml b/examples/tracker/facebook_cotracker.yml
new file mode 100644
index 00000000..c81e40cb
--- /dev/null
+++ b/examples/tracker/facebook_cotracker.yml
@@ -0,0 +1,51 @@
+nodes:
+  - id: camera
+    build: pip install -e ../../node-hub/opencv-video-capture
+    path: opencv-video-capture
+    inputs:
+      tick: dora/timer/millis/100
+    outputs:
+      - image
+    env:
+      CAPTURE_PATH: "0"
+      ENCODING: "rgb8"
+      IMAGE_WIDTH: "640"
+      IMAGE_HEIGHT: "480"
+
+  - id: object-detection
+    build: pip install -e ../../node-hub/dora-yolo
+    path: dora-yolo
+    inputs:
+      image: camera/image
+    outputs:
+      - bbox
+
+  - id: tracker
+    build: pip install -e ../../node-hub/dora-cotracker
+    path: dora-cotracker
+    inputs:
+      image: camera/image
+      boxes2d: object-detection/bbox
+      # points_to_track: input/points_to_track # uncomment this if using input node
+    outputs:
+      - tracked_image
+      - points
+    env:
+      INTERACTIVE_MODE: false
+
+  - id: plot
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    inputs:
+      image: camera/image
+      tracked_image: tracker/tracked_image
+
+  # replace with your own node that outputs tracking points # uncomment if input via node
+  # (e.g., YOLO detector, pose estimator, etc.)
+  # - id: point_source
+  #   build: pip install your-node # Replace with your node's name
+  #   path: your-point-source-node # Replace with your node's path
+  #   inputs:
+  #     image: camera/image # If your node needs image input
+  #   outputs:
+  #     - points_to_track # Must output points in required format
diff --git a/examples/tracker/parse_bbox.py b/examples/tracker/parse_bbox.py
new file mode 100644
index 00000000..056b0af3
--- /dev/null
+++ b/examples/tracker/parse_bbox.py
@@ -0,0 +1,63 @@
+"""Parse bounding boxes from Qwen2.5-VL JSON output and forward them as a bbox array."""
+
+import json
+import os
+
+import numpy as np
+import pyarrow as pa
+from dora import Node
+
+node = Node()
+
+IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
+
+
+def extract_bboxes(json_text):
+    """Extract bounding boxes and labels from a JSON string wrapped in markdown markers.
+
+    Parameters
+    ----------
+    json_text : str
+        JSON string containing bounding box data, including ```json markers.
+
+    Returns
+    -------
+    tuple: NumPy arrays of bounding boxes and labels, or (None, None) if parsing fails.
+ + """ + # Ensure all lines are stripped of whitespace and markers + lines = json_text.strip().splitlines() + + # Filter out lines that are markdown markers + clean_lines = [line for line in lines if not line.strip().startswith("```")] + + # Join the lines back into a single string + clean_text = "\n".join(clean_lines) + # Parse the cleaned JSON text + try: + data = json.loads(clean_text) + + # Extract bounding boxes + bboxes = [item["bbox_2d"] for item in data] + labels = [item["label"] for item in data] + + return np.array(bboxes), np.array(labels) + except Exception as _e: # noqa + pass + return None, None + + +for event in node: + if event["type"] == "INPUT": + text = event["value"][0].as_py() + image_id = event["metadata"]["image_id"] + + bboxes, labels = extract_bboxes(text) + if bboxes is not None and len(bboxes) > 0: + bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO) + + node.send_output( + "bbox", + pa.array(bboxes.ravel()), + metadata={"encoding": "xyxy", "image_id": image_id}, + ) diff --git a/examples/tracker/qwenvl_cotracker.yml b/examples/tracker/qwenvl_cotracker.yml new file mode 100644 index 00000000..b620297d --- /dev/null +++ b/examples/tracker/qwenvl_cotracker.yml @@ -0,0 +1,67 @@ +nodes: + - id: camera + build: pip install -e ../../node-hub/opencv-video-capture + path: opencv-video-capture + inputs: + tick: dora/timer/millis/100 + outputs: + - image + env: + CAPTURE_PATH: "0" + ENCODING: "rgb8" + IMAGE_WIDTH: "640" + IMAGE_HEIGHT: "480" + + - id: dora-qwenvl + build: pip install -e ../../node-hub/dora-qwen2-5-vl + path: dora-qwen2-5-vl + inputs: + image: camera/image + text_1: dora/timer/millis/600 + outputs: + - text + env: + DEFAULT_QUESTION: Output the bounding box of the eyes. + IMAGE_RESIZE_RATIO: "0.5" + # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have + #SYSTEM_PROMPT: You're a robot. + + - id: parse_bbox + path: parse_bbox.py + inputs: + text: dora-qwenvl/text + outputs: + - bbox + env: + IMAGE_RESIZE_RATIO: "0.5" + + - id: tracker + build: pip install -e ../../node-hub/dora-cotracker + path: dora-cotracker + inputs: + image: camera/image + boxes2d: parse_bbox/bbox + # points_to_track: input/points_to_track # uncomment this if using input node + outputs: + - tracked_image + - points + env: + INTERACTIVE_MODE: false + + - id: plot + build: pip install -e ../../node-hub/dora-rerun + path: dora-rerun + inputs: + image: camera/image + boxes2d: parse_bbox/bbox + tracked_image: tracker/tracked_image + + # replace with your own node that outputs tracking points # uncomment if input via node + # (e.g., YOLO detector, pose estimator, etc.) + # - id: point_source + # build: pip install your-node # Replace with your node's name + # path: your-point-source-node # Replace with your node's path + # inputs: + # image: camera/image # If your node needs image input + # outputs: + # - points_to_track # Must output points in required format diff --git a/node-hub/dora-cotracker/demo.yml b/node-hub/dora-cotracker/demo.yml index 6bb36707..240237f2 100644 --- a/node-hub/dora-cotracker/demo.yml +++ b/node-hub/dora-cotracker/demo.yml @@ -13,14 +13,14 @@ nodes: IMAGE_HEIGHT: "480" - id: tracker - build: pip install dora-cotracker + build: pip install -e . 
     path: dora-cotracker
     inputs:
       image: camera/image
       # points_to_track: input/points_to_track # uncomment this if using input node
     outputs:
       - tracked_image
-      - tracked_points
+      - points
 
   - id: plot
     build: pip install dora-rerun
@@ -29,8 +29,7 @@ nodes:
       image: camera/image
       tracked_image: tracker/tracked_image
 
-
-  # replace with your own node that outputs tracking points # uncomment if input via node
+  # replace with your own node that outputs tracking points # uncomment if input via node
   # (e.g., YOLO detector, pose estimator, etc.)
   # - id: point_source
   #   build: pip install your-node # Replace with your node's name
@@ -38,4 +37,4 @@ nodes:
   #   inputs:
   #     image: camera/image # If your node needs image input
   #   outputs:
-  #     - points_to_track # Must output points in required format
\ No newline at end of file
+  #     - points_to_track # Must output points in required format
diff --git a/node-hub/dora-cotracker/dora_cotracker/main.py b/node-hub/dora-cotracker/dora_cotracker/main.py
index 5e4c08a4..dcfaeb54 100644
--- a/node-hub/dora-cotracker/dora_cotracker/main.py
+++ b/node-hub/dora-cotracker/dora_cotracker/main.py
@@ -1,9 +1,14 @@
+import os
+from collections import deque
+
+import cv2
 import numpy as np
 import pyarrow as pa
-from dora import Node
-import cv2
 import torch
-from collections import deque
+from dora import Node
+
+INTERACTIVE_MODE = os.getenv("INTERACTIVE_MODE", "false").lower() == "true"
+
 
 class VideoTrackingNode:
     def __init__(self):
@@ -12,10 +17,12 @@ class VideoTrackingNode:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = torch.hub.load("facebookresearch/co-tracker", "cotracker3_online")
         self.model = self.model.to(self.device)
+        self.model.eval()
         self.model.step = 8
-        self.buffer_size = self.model.step * 2 
+        self.buffer_size = self.model.step * 2
         self.window_frames = deque(maxlen=self.buffer_size)
         self.is_first_step = True
+        self.accept_new_points = True
         self.clicked_points = []
         self.input_points = []
 
@@ -29,14 +36,12 @@ class VideoTrackingNode:
         """Process frame for tracking"""
         if len(self.window_frames) == self.buffer_size:
             all_points = self.input_points + self.clicked_points
-            
+
            if not all_points:
                 print("No points to track")
                 return None, None
-
             video_chunk = torch.tensor(
-                np.stack(list(self.window_frames)),
-                device=self.device
+                np.stack(list(self.window_frames)), device=self.device
             ).float()
             video_chunk = video_chunk / 255.0
             # Reshape to [B,T,C,H,W]
@@ -50,11 +55,12 @@ class VideoTrackingNode:
                 is_first_step=self.is_first_step,
                 grid_size=0,
                 queries=queries,
-                add_support_grid=False
+                add_support_grid=False,
             )
             self.is_first_step = False
 
             if pred_tracks is not None and pred_visibility is not None:
+                self.accept_new_points = True
                 tracks = pred_tracks[0, -1].cpu().numpy()
                 visibility = pred_visibility[0, -1].cpu().numpy()
                 visible_tracks = []
@@ -66,84 +72,131 @@ class VideoTrackingNode:
                 frame_viz = frame.copy()
                 num_input_stream = len(self.input_points)
                 # Draw input points in red
-                for i, (pt, vis) in enumerate(zip(tracks[:num_input_stream], visibility[:num_input_stream])):
+                for i, (pt, vis) in enumerate(
+                    zip(tracks[:num_input_stream], visibility[:num_input_stream])
+                ):
                     if vis > 0.5:
                         x, y = int(pt[0]), int(pt[1])
-                        cv2.circle(frame_viz, (x, y), radius=3,
-                                   color=(0, 255, 0), thickness=-1)
-                        cv2.putText(frame_viz, f"I{i}", (x + 5, y - 5),
-                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
-
+                        cv2.circle(
+                            frame_viz, (x, y), radius=3, color=(0, 255, 0), thickness=-1
+                        )
+                        cv2.putText(
+                            frame_viz,
+                            f"I{i}",
+                            (x + 5, y - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX,
+                            0.5,
+                            (0, 255, 0),
+                            1,
+                        )
+
                 # Draw clicked points in red
-                for i, (pt, vis) in enumerate(zip(tracks[num_input_stream:], visibility[num_input_stream:])):
+                for i, (pt, vis) in enumerate(
+                    zip(tracks[num_input_stream:], visibility[num_input_stream:])
+                ):
                     if vis > 0.5:
                         x, y = int(pt[0]), int(pt[1])
-                        cv2.circle(frame_viz, (x, y), radius=3,
-                                   color=(0, 0, 255), thickness=-1)
-                        cv2.putText(frame_viz, f"C{i}", (x + 5, y - 5),
-                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
-
+                        cv2.circle(
+                            frame_viz, (x, y), radius=3, color=(0, 0, 255), thickness=-1
+                        )
+                        cv2.putText(
+                            frame_viz,
+                            f"C{i}",
+                            (x + 5, y - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX,
+                            0.5,
+                            (0, 0, 255),
+                            1,
+                        )
+
                 # Send tracked points
                 if len(visible_tracks) > 0:
                     self.node.send_output(
-                        "tracked_points",
+                        "points",
                         pa.array(visible_tracks.ravel()),
                         {
                             "num_points": len(visible_tracks),
                             "dtype": "float32",
-                            "shape": (len(visible_tracks), 2)
-                        }
+                            "shape": (len(visible_tracks), 2),
+                        },
                     )
-
+
                 return frame, frame_viz
         return None, None
 
     def run(self):
         """Main run loop"""
-        cv2.namedWindow("Raw Feed", cv2.WINDOW_NORMAL)
-        cv2.setMouseCallback("Raw Feed", self.mouse_callback)
+        if INTERACTIVE_MODE:
+            cv2.namedWindow("Interactive Feed to track point", cv2.WINDOW_NORMAL)
+            cv2.setMouseCallback("Interactive Feed to track point", self.mouse_callback)
 
         for event in self.node:
             if event["type"] == "INPUT":
                 if event["id"] == "image":
                     metadata = event["metadata"]
-                    frame = event["value"].to_numpy().reshape((
-                        metadata["height"],
-                        metadata["width"],
-                        3
-                    ))
+                    frame = (
+                        event["value"]
+                        .to_numpy()
+                        .reshape((metadata["height"], metadata["width"], 3))
+                    )
 
                     # Add frame to tracking window
                     self.window_frames.append(frame)
                     original_frame, tracked_frame = self.process_tracking(frame)
 
                     if original_frame is not None and tracked_frame is not None:
-                        self.node.send_output("image",
-                            pa.array(original_frame.ravel()),
-                            metadata
-                        )
-                        self.node.send_output("tracked_image",
-                            pa.array(tracked_frame.ravel()),
-                            metadata
+                        self.node.send_output(
+                            "tracked_image", pa.array(tracked_frame.ravel()), metadata
                         )
 
-                    display_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                    cv2.imshow("Raw Feed", display_frame)
-                    cv2.waitKey(1)
-
-                if event["id"] == "points_to_track":
+                    if INTERACTIVE_MODE:
+                        display_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                        cv2.imshow("Interactive Feed to track point", display_frame)
+                        cv2.waitKey(1)
+
+                if event["id"] == "points":
+                    if not self.accept_new_points:
+                        continue
                     # Handle points from input_stream node
                     metadata = event["metadata"]
                     points_array = event["value"].to_numpy()
-                    num_points = metadata["num_points"]
-                    self.input_points = points_array.reshape((num_points, 2)).tolist()
+                    self.input_points = points_array.reshape((-1, 2)).tolist()
+                    self.accept_new_points = False
                     self.is_first_step = True
-                    print(f"Received {num_points} points from input_stream")
+
+                if event["id"] == "boxes2d":
+                    if not self.accept_new_points:
+                        continue
+
+                    # Handle bounding boxes from the upstream detection node
+                    metadata = event["metadata"]
+                    if isinstance(event["value"], pa.StructArray):
+                        boxes2d = (
+                            event["value"]
+                            .get("bbox")
+                            .values.to_numpy()
+                            .reshape((-1, 4))
+                        )
+                        _labels = (
+                            event["value"]
+                            .get("labels")
+                            .values.to_numpy(zero_copy_only=False)
+                        )
+                    else:
+                        boxes2d = event["value"].to_numpy().reshape((-1, 4))
+                        _labels = None
+                    self.input_points = [
+                        [int((x_min + x_max) / 2), int((y_min + y_max) / 2)]
+                        for x_min, y_min, x_max, y_max in boxes2d
+                    ]
+
+                    self.is_first_step = True
+                    self.accept_new_points = False
 
 
 def main():
     tracker = VideoTrackingNode()
     tracker.run()
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/node-hub/dora-cotracker/pyproject.toml b/node-hub/dora-cotracker/pyproject.toml
index 3359a888..a27d21c6 100644
--- a/node-hub/dora-cotracker/pyproject.toml
+++ b/node-hub/dora-cotracker/pyproject.toml
@@ -1,11 +1,9 @@
 [project]
 name = "dora-cotracker"
 version = "0.1.0"
-authors = [
-    { name = "Shashwat Patil", email = "shashwatpatil974@gmail.com" }
-]
+authors = [{ name = "Shashwat Patil", email = "shashwatpatil974@gmail.com" }]
 description = "A Dora node implementing real-time object tracking using Facebook's CoTracker model"
-license = { text = "MIT" }
+license = "CC-BY-1.0"
 readme = "README.md"
 requires-python = ">=3.10"
 
@@ -26,9 +24,9 @@ dora-cotracker = "dora_cotracker.main:main"
 
 [tool.ruff.lint]
 extend-select = [
-  "PERF", # Performance
-  "RET", # Return statements
-  "RSE", # Runtime errors
-  "NPY", # NumPy
-  "N", # Naming
+    "PERF", # Performance
+    "RET",  # Return statements
+    "RSE",  # Runtime errors
+    "NPY",  # NumPy
+    "N",    # Naming
 ]
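Note: the tracker's `points` output is sent as a flattened float array with `num_points`, `dtype`, and `shape` metadata (see the `send_output` call in main.py above). A minimal sketch of a downstream consumer is given below; the consumer itself and its dataflow wiring are illustrative assumptions, not part of this patch.

    """Illustrative consumer of the tracker's `points` output (not part of this patch)."""

    from dora import Node

    node = Node()

    for event in node:
        if event["type"] == "INPUT" and event["id"] == "points":
            num_points = event["metadata"]["num_points"]
            # The tracker flattens (N, 2) pixel coordinates before sending.
            points = event["value"].to_numpy().reshape((num_points, 2))
            print(f"Received {num_points} tracked points")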