
Minor fix and add boxes2d example to facebook/cotracker (#950)

Minor fixes to cotracker, plus support for tracking the center point of bounding boxes.
tag: v0.3.12-rc0
Haixuan Xavier Tao · 9 months ago
commit ccbc82fc8f
6 changed files with 293 additions and 62 deletions:

1. examples/tracker/facebook_cotracker.yml (+51, -0)
2. examples/tracker/parse_bbox.py (+63, -0)
3. examples/tracker/qwenvl_cotracker.yml (+67, -0)
4. node-hub/dora-cotracker/demo.yml (+4, -5)
5. node-hub/dora-cotracker/dora_cotracker/main.py (+101, -48)
6. node-hub/dora-cotracker/pyproject.toml (+7, -9)

examples/tracker/facebook_cotracker.yml (+51, -0)

@@ -0,0 +1,51 @@
nodes:
  - id: camera
    build: pip install -e ../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: "0"
      ENCODING: "rgb8"
      IMAGE_WIDTH: "640"
      IMAGE_HEIGHT: "480"

  - id: object-detection
    build: pip install -e ../../node-hub/dora-yolo
    path: dora-yolo
    inputs:
      image: camera/image
    outputs:
      - bbox

  - id: tracker
    build: pip install -e ../../node-hub/dora-cotracker
    path: dora-cotracker
    inputs:
      image: camera/image
      boxes2d: object-detection/bbox
      # points_to_track: input/points_to_track # uncomment this if using an input node
    outputs:
      - tracked_image
      - points
    env:
      INTERACTIVE_MODE: false

  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      image: camera/image
      tracked_image: tracker/tracked_image

  # Replace with your own node that outputs tracking points (uncomment to feed points via an input node)
  # (e.g., YOLO detector, pose estimator, etc.)
  # - id: point_source
  #   build: pip install your-node # Replace with your node's name
  #   path: your-point-source-node # Replace with your node's path
  #   inputs:
  #     image: camera/image # If your node needs image input
  #   outputs:
  #     - points_to_track # Must output points in the required format, as sketched below
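For reference, the "required format" above is what dora-cotracker decodes with reshape((-1, 2)): a flat float32 array [x0, y0, x1, y1, ...] in pixel coordinates. A minimal sketch of such a point-source node under that assumption (the node and its point values are hypothetical, not part of this commit):

    """Hypothetical point-source node; the points and output id are illustrative."""
    import pyarrow as pa
    from dora import Node

    node = Node()

    for event in node:
        if event["type"] == "INPUT" and event["id"] == "image":
            # Two example pixel coordinates, flattened to [x0, y0, x1, y1]
            points = [320.0, 240.0, 100.0, 80.0]
            node.send_output(
                "points_to_track",
                pa.array(points, type=pa.float32()),
                metadata={"num_points": 2, "dtype": "float32"},
            )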

examples/tracker/parse_bbox.py (+63, -0)

@@ -0,0 +1,63 @@
"""TODO: Add docstring."""

import json
import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
"""Extract bounding boxes from a JSON string with markdown markers and return them as a NumPy array.

Parameters
----------
json_text : str
JSON string containing bounding box data, including ```json markers.

Returns
-------
np.ndarray: NumPy array of bounding boxes.

"""
# Ensure all lines are stripped of whitespace and markers
lines = json_text.strip().splitlines()

# Filter out lines that are markdown markers
clean_lines = [line for line in lines if not line.strip().startswith("```")]

# Join the lines back into a single string
clean_text = "\n".join(clean_lines)
# Parse the cleaned JSON text
try:
data = json.loads(clean_text)

# Extract bounding boxes
bboxes = [item["bbox_2d"] for item in data]
labels = [item["label"] for item in data]

return np.array(bboxes), np.array(labels)
except Exception as _e: # noqa
pass
return None, None


for event in node:
if event["type"] == "INPUT":
text = event["value"][0].as_py()
image_id = event["metadata"]["image_id"]

bboxes, labels = extract_bboxes(text)
if bboxes is not None and len(bboxes) > 0:
bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO)

node.send_output(
"bbox",
pa.array(bboxes.ravel()),
metadata={"encoding": "xyxy", "image_id": image_id},
)
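To illustrate the input this parser expects, here is a small usage sketch; the sample response is made up, since actual Qwen2.5-VL output depends on the prompt and model:

    # Hypothetical model output wrapped in markdown fences, as extract_bboxes expects.
    sample = '```json\n[{"bbox_2d": [100, 120, 180, 200], "label": "left eye"}]\n```'
    bboxes, labels = extract_bboxes(sample)
    # bboxes -> array([[100, 120, 180, 200]]); labels -> array(['left eye'])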

examples/tracker/qwenvl_cotracker.yml (+67, -0)

@@ -0,0 +1,67 @@
nodes:
  - id: camera
    build: pip install -e ../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: "0"
      ENCODING: "rgb8"
      IMAGE_WIDTH: "640"
      IMAGE_HEIGHT: "480"

  - id: dora-qwenvl
    build: pip install -e ../../node-hub/dora-qwen2-5-vl
    path: dora-qwen2-5-vl
    inputs:
      image: camera/image
      text_1: dora/timer/millis/600
    outputs:
      - text
    env:
      DEFAULT_QUESTION: Output the bounding box of the eyes.
      IMAGE_RESIZE_RATIO: "0.5"
      # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have
      # SYSTEM_PROMPT: You're a robot.

  - id: parse_bbox
    path: parse_bbox.py
    inputs:
      text: dora-qwenvl/text
    outputs:
      - bbox
    env:
      IMAGE_RESIZE_RATIO: "0.5"

  - id: tracker
    build: pip install -e ../../node-hub/dora-cotracker
    path: dora-cotracker
    inputs:
      image: camera/image
      boxes2d: parse_bbox/bbox
      # points_to_track: input/points_to_track # uncomment this if using an input node
    outputs:
      - tracked_image
      - points
    env:
      INTERACTIVE_MODE: false

  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      image: camera/image
      boxes2d: parse_bbox/bbox
      tracked_image: tracker/tracked_image

  # Replace with your own node that outputs tracking points (uncomment to feed points via an input node)
  # (e.g., YOLO detector, pose estimator, etc.)
  # - id: point_source
  #   build: pip install your-node # Replace with your node's name
  #   path: your-point-source-node # Replace with your node's path
  #   inputs:
  #     image: camera/image # If your node needs image input
  #   outputs:
  #     - points_to_track # Must output points in the required format
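Note that IMAGE_RESIZE_RATIO must agree between dora-qwenvl and parse_bbox: assuming dora-qwenvl downscales the 640×480 camera frame by 0.5 before inference, the model's box coordinates live in a 320×240 space, and parse_bbox multiplies them by int(1 / 0.5) = 2 to map back to camera coordinates. A quick sanity check of that arithmetic with a hypothetical box:

    # Rescaling done by parse_bbox.py, checked with made-up numbers.
    IMAGE_RESIZE_RATIO = 0.5                 # must match the dora-qwenvl env value
    bbox_resized = [50, 60, 90, 100]         # hypothetical xyxy box in the resized frame
    scale = int(1 / IMAGE_RESIZE_RATIO)      # -> 2
    bbox_camera = [v * scale for v in bbox_resized]
    print(bbox_camera)                       # [100, 120, 180, 200]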

node-hub/dora-cotracker/demo.yml (+4, -5)

@@ -13,14 +13,14 @@ nodes:
       IMAGE_HEIGHT: "480"
 
   - id: tracker
-    build: pip install dora-cotracker
+    build: pip install -e .
     path: dora-cotracker
     inputs:
       image: camera/image
       # points_to_track: input/points_to_track # uncomment this if using input node
     outputs:
       - tracked_image
-      - tracked_points
+      - points
 
   - id: plot
     build: pip install dora-rerun
@@ -29,8 +29,7 @@ nodes:
       image: camera/image
       tracked_image: tracker/tracked_image
 
-
 # replace with your own node that outputs tracking points # uncomment if input via node
 # (e.g., YOLO detector, pose estimator, etc.)
 # - id: point_source
 #   build: pip install your-node # Replace with your node's name
@@ -38,4 +37,4 @@ nodes:
 #   inputs:
 #     image: camera/image # If your node needs image input
 #   outputs:
-#     - points_to_track # Must output points in required format
+#     - points_to_track # Must output points in required format

node-hub/dora-cotracker/dora_cotracker/main.py (+101, -48)

@@ -1,9 +1,14 @@
+import os
+from collections import deque
+
+import cv2
 import numpy as np
 import pyarrow as pa
-from dora import Node
-import cv2
 import torch
-from collections import deque
+from dora import Node
+
+INTERACTIVE_MODE = os.getenv("INTERACTIVE_MODE", "false").lower() == "true"
 
 
 class VideoTrackingNode:
     def __init__(self):
@@ -12,10 +17,12 @@ class VideoTrackingNode:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = torch.hub.load("facebookresearch/co-tracker", "cotracker3_online")
         self.model = self.model.to(self.device)
+        self.model.eval()
         self.model.step = 8
-        self.buffer_size = self.model.step * 2
+        self.buffer_size = self.model.step * 2
         self.window_frames = deque(maxlen=self.buffer_size)
         self.is_first_step = True
+        self.accept_new_points = True
         self.clicked_points = []
         self.input_points = []
 
@@ -29,14 +36,12 @@ class VideoTrackingNode:
         """Process frame for tracking"""
         if len(self.window_frames) == self.buffer_size:
             all_points = self.input_points + self.clicked_points
             if not all_points:
                 print("No points to track")
                 return None, None
             video_chunk = torch.tensor(
-                np.stack(list(self.window_frames)),
-                device=self.device
+                np.stack(list(self.window_frames)), device=self.device
             ).float()
             video_chunk = video_chunk / 255.0
             # Reshape to [B,T,C,H,W]
@@ -50,11 +55,12 @@ class VideoTrackingNode:
                 is_first_step=self.is_first_step,
                 grid_size=0,
                 queries=queries,
-                add_support_grid=False
+                add_support_grid=False,
             )
             self.is_first_step = False
 
             if pred_tracks is not None and pred_visibility is not None:
+                self.accept_new_points = True
                 tracks = pred_tracks[0, -1].cpu().numpy()
                 visibility = pred_visibility[0, -1].cpu().numpy()
                 visible_tracks = []
@@ -66,84 +72,131 @@ class VideoTrackingNode:
                 frame_viz = frame.copy()
                 num_input_stream = len(self.input_points)
                 # Draw input points in green
-                for i, (pt, vis) in enumerate(zip(tracks[:num_input_stream], visibility[:num_input_stream])):
+                for i, (pt, vis) in enumerate(
+                    zip(tracks[:num_input_stream], visibility[:num_input_stream])
+                ):
                     if vis > 0.5:
                         x, y = int(pt[0]), int(pt[1])
-                        cv2.circle(frame_viz, (x, y), radius=3,
-                                   color=(0, 255, 0), thickness=-1)
-                        cv2.putText(frame_viz, f"I{i}", (x + 5, y - 5),
-                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
+                        cv2.circle(
+                            frame_viz, (x, y), radius=3, color=(0, 255, 0), thickness=-1
+                        )
+                        cv2.putText(
+                            frame_viz,
+                            f"I{i}",
+                            (x + 5, y - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX,
+                            0.5,
+                            (0, 255, 0),
+                            1,
+                        )
 
                 # Draw clicked points in red
-                for i, (pt, vis) in enumerate(zip(tracks[num_input_stream:], visibility[num_input_stream:])):
+                for i, (pt, vis) in enumerate(
+                    zip(tracks[num_input_stream:], visibility[num_input_stream:])
+                ):
                     if vis > 0.5:
                         x, y = int(pt[0]), int(pt[1])
-                        cv2.circle(frame_viz, (x, y), radius=3,
-                                   color=(0, 0, 255), thickness=-1)
-                        cv2.putText(frame_viz, f"C{i}", (x + 5, y - 5),
-                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
+                        cv2.circle(
+                            frame_viz, (x, y), radius=3, color=(0, 0, 255), thickness=-1
+                        )
+                        cv2.putText(
+                            frame_viz,
+                            f"C{i}",
+                            (x + 5, y - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX,
+                            0.5,
+                            (0, 0, 255),
+                            1,
+                        )
 
                 # Send tracked points
                 if len(visible_tracks) > 0:
                     self.node.send_output(
-                        "tracked_points",
+                        "points",
                         pa.array(visible_tracks.ravel()),
                         {
                             "num_points": len(visible_tracks),
                             "dtype": "float32",
-                            "shape": (len(visible_tracks), 2)
-                        }
+                            "shape": (len(visible_tracks), 2),
+                        },
                     )
                 return frame, frame_viz
 
         return None, None
 
     def run(self):
         """Main run loop"""
-        cv2.namedWindow("Raw Feed", cv2.WINDOW_NORMAL)
-        cv2.setMouseCallback("Raw Feed", self.mouse_callback)
+        if INTERACTIVE_MODE:
+            cv2.namedWindow("Interactive Feed to track point", cv2.WINDOW_NORMAL)
+            cv2.setMouseCallback("Interactive Feed to track point", self.mouse_callback)
 
         for event in self.node:
             if event["type"] == "INPUT":
                 if event["id"] == "image":
                     metadata = event["metadata"]
-                    frame = event["value"].to_numpy().reshape((
-                        metadata["height"],
-                        metadata["width"],
-                        3
-                    ))
+                    frame = (
+                        event["value"]
+                        .to_numpy()
+                        .reshape((metadata["height"], metadata["width"], 3))
+                    )
                     # Add frame to tracking window
                     self.window_frames.append(frame)
                     original_frame, tracked_frame = self.process_tracking(frame)
                     if original_frame is not None and tracked_frame is not None:
-                        self.node.send_output("image",
-                            pa.array(original_frame.ravel()),
-                            metadata
-                        )
-                        self.node.send_output("tracked_image",
-                            pa.array(tracked_frame.ravel()),
-                            metadata
+                        self.node.send_output(
+                            "tracked_image", pa.array(tracked_frame.ravel()), metadata
                         )
 
-                    display_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                    cv2.imshow("Raw Feed", display_frame)
-                    cv2.waitKey(1)
-                if event["id"] == "points_to_track":
+                    if INTERACTIVE_MODE:
+                        display_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                        cv2.imshow("Interactive Feed to track point", display_frame)
+                        cv2.waitKey(1)
+
+                if event["id"] == "points":
+                    if not self.accept_new_points:
+                        continue
                     # Handle points from input_stream node
                     metadata = event["metadata"]
                     points_array = event["value"].to_numpy()
-                    num_points = metadata["num_points"]
-                    self.input_points = points_array.reshape((num_points, 2)).tolist()
+                    self.input_points = points_array.reshape((-1, 2)).tolist()
+                    self.accept_new_points = False
                     self.is_first_step = True
-                    print(f"Received {num_points} points from input_stream")
+
+                if event["id"] == "boxes2d":
+                    if not self.accept_new_points:
+                        continue
+
+                    # Track the center point of each incoming bounding box
+                    metadata = event["metadata"]
+                    if isinstance(event["value"], pa.StructArray):
+                        boxes2d = (
+                            event["value"]
+                            .get("bbox")
+                            .values.to_numpy()
+                            .reshape((-1, 4))
+                        )
+                        _labels = (
+                            event["value"]
+                            .get("labels")
+                            .values.to_numpy(zero_copy_only=False)
+                        )
+                    else:
+                        boxes2d = event["value"].to_numpy().reshape((-1, 4))
+                        _labels = None
+
+                    self.input_points = [
+                        [int((x_min + x_max) / 2), int((y_min + y_max) / 2)]
+                        for x_min, y_min, x_max, y_max in boxes2d
+                    ]
+
+                    self.is_first_step = True
+                    self.accept_new_points = False
 
 
 def main():
     tracker = VideoTrackingNode()
     tracker.run()
 
 
 if __name__ == "__main__":
-    main()
+    main()
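Downstream nodes can recover the point list by reversing the flattening done in process_tracking. A minimal consumer sketch, assuming the metadata sent above (the consumer itself is hypothetical, not part of this commit):

    """Hypothetical consumer of the tracker's "points" output."""
    from dora import Node

    node = Node()

    for event in node:
        if event["type"] == "INPUT" and event["id"] == "points":
            num_points = event["metadata"]["num_points"]
            # The tracker sends a flat float32 array; reshape to (num_points, 2)
            points = event["value"].to_numpy().reshape((num_points, 2))
            print(f"Tracking {num_points} visible points; first at {points[0]}")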

node-hub/dora-cotracker/pyproject.toml (+7, -9)

@@ -1,11 +1,9 @@
 [project]
 name = "dora-cotracker"
 version = "0.1.0"
-authors = [
-    { name = "Shashwat Patil", email = "shashwatpatil974@gmail.com" }
-]
+authors = [{ name = "Shashwat Patil", email = "shashwatpatil974@gmail.com" }]
 description = "A Dora node implementing real-time object tracking using Facebook's CoTracker model"
-license = { text = "MIT" }
+license = "CC-BY-1.0"
 readme = "README.md"
 requires-python = ">=3.10"
 
@@ -26,9 +24,9 @@ dora-cotracker = "dora_cotracker.main:main"
 
 [tool.ruff.lint]
 extend-select = [
-  "PERF", # Performance
-  "RET", # Return statements
-  "RSE", # Runtime errors
-  "NPY", # NumPy
-  "N", # Naming
+  "PERF", # Performance
+  "RET",  # Return statements
+  "RSE",  # Runtime errors
+  "NPY",  # NumPy
+  "N",    # Naming
 ]
