@@ -0,0 +1,63 @@
| """TODO: Add docstring.""" | |||
import json
import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
    """Extract bounding boxes and labels from a JSON string wrapped in markdown markers.

    Parameters
    ----------
    json_text : str
        JSON string containing bounding box data, possibly including ```json markers.

    Returns
    -------
    tuple
        ``(bboxes, labels)`` as NumPy arrays, where each bounding box is an
        ``[x1, y1, x2, y2]`` row, or ``(None, None)`` if parsing fails.

    """
    # Strip surrounding whitespace and split into lines
    lines = json_text.strip().splitlines()
    # Drop markdown code-fence markers such as ```json
    clean_lines = [line for line in lines if not line.strip().startswith("```")]
    # Rejoin the remaining lines into a single JSON string
    clean_text = "\n".join(clean_lines)
    try:
        data = json.loads(clean_text)
        # Each item is expected to carry a "bbox_2d" box and a "label"
        bboxes = [item["bbox_2d"] for item in data]
        labels = [item["label"] for item in data]
        return np.array(bboxes), np.array(labels)
    except (json.JSONDecodeError, KeyError, TypeError):
        # Malformed or unexpected model output: signal failure to the caller
        return None, None
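
# Example (hypothetical model output; the fenced markers are exactly what
# extract_bboxes strips):
#
#     text = '```json\n[{"bbox_2d": [10, 20, 60, 40], "label": "eye"}]\n```'
#     bboxes, labels = extract_bboxes(text)
#     # bboxes -> array([[10, 20, 60, 40]]), labels -> array(['eye'])
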
for event in node:
    if event["type"] == "INPUT":
        text = event["value"][0].as_py()
        image_id = event["metadata"]["image_id"]
        bboxes, _labels = extract_bboxes(text)
        if bboxes is not None and len(bboxes) > 0:
            # Boxes were predicted on the resized image, so scale them back to
            # the original resolution. Dividing by the ratio (rather than
            # multiplying by int(1 / ratio)) avoids truncation for ratios such
            # as 0.75.
            bboxes = (bboxes / IMAGE_RESIZE_RATIO).astype(np.int32)
            node.send_output(
                "bbox",
                pa.array(bboxes.ravel()),
                metadata={"encoding": "xyxy", "image_id": image_id},
            )
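
# Since the boxes are flattened with ravel() before sending, a downstream
# consumer has to restore the (N, 4) shape. A minimal sketch, assuming a node
# (here called consumer_node) wired to receive this "bbox" output:
#
#     for event in consumer_node:
#         if event["type"] == "INPUT" and event["id"] == "bbox":
#             boxes = event["value"].to_numpy().reshape((-1, 4))  # xyxy rows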

@@ -0,0 +1,67 @@
nodes:
  - id: camera
    build: pip install -e ../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: "0"
      ENCODING: "rgb8"
      IMAGE_WIDTH: "640"
      IMAGE_HEIGHT: "480"
  - id: dora-qwenvl
    build: pip install -e ../../node-hub/dora-qwen2-5-vl
    path: dora-qwen2-5-vl
    inputs:
      image: camera/image
      text_1: dora/timer/millis/600
    outputs:
      - text
    env:
      DEFAULT_QUESTION: Output the bounding box of the eyes.
      IMAGE_RESIZE_RATIO: "0.5"
      # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have
      # SYSTEM_PROMPT: You're a robot.
  - id: parse_bbox
    path: parse_bbox.py
    inputs:
      text: dora-qwenvl/text
    outputs:
      - bbox
    env:
      # Must match IMAGE_RESIZE_RATIO of the dora-qwenvl node above so that
      # boxes are scaled back to the full-resolution image.
      IMAGE_RESIZE_RATIO: "0.5"
  - id: tracker
    build: pip install -e ../../node-hub/dora-cotracker
    path: dora-cotracker
    inputs:
      image: camera/image
      boxes2d: parse_bbox/bbox
      # points_to_track: input/points_to_track  # uncomment if seeding points from another node
    outputs:
      - tracked_image
      - points
    env:
      # Disable interactive point picking; tracks are seeded from boxes2d.
      INTERACTIVE_MODE: "false"
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      image: camera/image
      boxes2d: parse_bbox/bbox
      tracked_image: tracker/tracked_image
  # Optional: replace the boxes2d input with a node of your own that outputs
  # points to track (e.g., a YOLO detector or pose estimator). Uncomment the
  # block below and the points_to_track input on the tracker to use it.
  # - id: point_source
  #   build: pip install your-node  # replace with your node's package name
  #   path: your-point-source-node  # replace with your node's path
  #   inputs:
  #     image: camera/image  # if your node needs the camera image
  #   outputs:
  #     - points_to_track  # must output points in the required format
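
# A typical way to launch this dataflow (hypothetical file name; check your
# dora-rs version for the exact CLI commands):
#   dora build demo.yml
#   dora run demo.yml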

@@ -22,6 +22,7 @@ class VideoTrackingNode:
         self.buffer_size = self.model.step * 2
         self.window_frames = deque(maxlen=self.buffer_size)
         self.is_first_step = True
+        self.accept_new_points = True
         self.clicked_points = []
         self.input_points = []
@@ -59,6 +60,7 @@ class VideoTrackingNode:
             self.is_first_step = False
         if pred_tracks is not None and pred_visibility is not None:
+            self.accept_new_points = True
             tracks = pred_tracks[0, -1].cpu().numpy()
             visibility = pred_visibility[0, -1].cpu().numpy()
             visible_tracks = []
@@ -152,25 +154,29 @@ class VideoTrackingNode:
                     cv2.waitKey(1)
                 if event["id"] == "points":
+                    if not self.accept_new_points:
+                        continue
                     # Handle points from input_stream node
                     metadata = event["metadata"]
                     points_array = event["value"].to_numpy()
                     self.input_points = points_array.reshape((-1, 2)).tolist()
+                    self.accept_new_points = False
                     self.is_first_step = True
                 if event["id"] == "boxes2d":
-                    if not self.is_first_step:
+                    if not self.accept_new_points:
                         continue
                     # Handle points from input_stream node
                     metadata = event["metadata"]
                     if isinstance(event["value"], pa.StructArray):
                         boxes2d = (
-                            event["value"][0]
+                            event["value"]
                             .get("bbox")
                             .values.to_numpy()
                             .reshape((-1, 4))
                         )
                         _labels = (
-                            event["value"][0]
+                            event["value"]
                             .get("labels")
                             .values.to_numpy(zero_copy_only=False)
                         )
@@ -184,6 +190,7 @@ class VideoTrackingNode:
                     ]
                     self.is_first_step = True
+                    self.accept_new_points = False

 def main():
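
# Taken together, the hunks above implement a simple seeding handshake (a
# sketch of the intended states, inferred from this diff):
#
#   accept_new_points == True   -> a "points" or "boxes2d" input may (re)seed
#                                  the tracker; seeding sets it to False and
#                                  sets is_first_step = True
#   accept_new_points == False  -> further seed inputs are ignored until the
#                                  model returns pred_tracks/pred_visibility,
#                                  which flips it back to True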