
Make it possible to track multiple points, and always wait for the first prediction before accepting new tracked points

tags/v0.3.12-rc0
haixuanTao 9 months ago
commit b5cf729b4e
3 changed files with 140 additions and 3 deletions:

  1. examples/tracker/parse_bbox.py (+63, -0)
  2. examples/tracker/qwenvl_cotracker.yml (+67, -0)
  3. node-hub/dora-cotracker/dora_cotracker/main.py (+10, -3)

examples/tracker/parse_bbox.py (+63, -0)

@@ -0,0 +1,63 @@
"""TODO: Add docstring."""

import json
import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
"""Extract bounding boxes from a JSON string with markdown markers and return them as a NumPy array.

Parameters
----------
json_text : str
JSON string containing bounding box data, including ```json markers.

Returns
-------
np.ndarray: NumPy array of bounding boxes.

"""
# Ensure all lines are stripped of whitespace and markers
lines = json_text.strip().splitlines()

# Filter out lines that are markdown markers
clean_lines = [line for line in lines if not line.strip().startswith("```")]

# Join the lines back into a single string
clean_text = "\n".join(clean_lines)
# Parse the cleaned JSON text
try:
data = json.loads(clean_text)

# Extract bounding boxes
bboxes = [item["bbox_2d"] for item in data]
labels = [item["label"] for item in data]

return np.array(bboxes), np.array(labels)
except Exception as _e: # noqa
pass
return None, None


for event in node:
if event["type"] == "INPUT":
text = event["value"][0].as_py()
image_id = event["metadata"]["image_id"]

bboxes, labels = extract_bboxes(text)
if bboxes is not None and len(bboxes) > 0:
bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO)

node.send_output(
"bbox",
pa.array(bboxes.ravel()),
metadata={"encoding": "xyxy", "image_id": image_id},
)
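
For reference, a quick sanity check of extract_bboxes (not part of the commit) could look like the following; the payload is invented, but it mirrors the fenced bbox_2d/label format this parser expects:

# Not part of the commit: a quick sanity check of extract_bboxes.
# The payload below is invented, but it mirrors the fenced bbox_2d/label
# format this parser expects from the VLM text output.
sample = """```json
[
    {"bbox_2d": [120, 80, 200, 140], "label": "left eye"},
    {"bbox_2d": [260, 82, 340, 142], "label": "right eye"}
]
```"""

bboxes, labels = extract_bboxes(sample)
print(bboxes.shape)  # (2, 4) -- one xyxy box per detection
print(labels)        # ['left eye' 'right eye']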

examples/tracker/qwenvl_cotracker.yml (+67, -0)

@@ -0,0 +1,67 @@
nodes:
  - id: camera
    build: pip install -e ../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: "0"
      ENCODING: "rgb8"
      IMAGE_WIDTH: "640"
      IMAGE_HEIGHT: "480"

  - id: dora-qwenvl
    build: pip install -e ../../node-hub/dora-qwen2-5-vl
    path: dora-qwen2-5-vl
    inputs:
      image: camera/image
      text_1: dora/timer/millis/600
    outputs:
      - text
    env:
      DEFAULT_QUESTION: Output the bounding box of the eyes.
      IMAGE_RESIZE_RATIO: "0.5"
      # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have
      # SYSTEM_PROMPT: You're a robot.

  - id: parse_bbox
    path: parse_bbox.py
    inputs:
      text: dora-qwenvl/text
    outputs:
      - bbox
    env:
      IMAGE_RESIZE_RATIO: "0.5"

  - id: tracker
    build: pip install -e ../../node-hub/dora-cotracker
    path: dora-cotracker
    inputs:
      image: camera/image
      boxes2d: parse_bbox/bbox
      # points_to_track: input/points_to_track # uncomment this if using an input node
    outputs:
      - tracked_image
      - points
    env:
      INTERACTIVE_MODE: false

  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      image: camera/image
      boxes2d: parse_bbox/bbox
      tracked_image: tracker/tracked_image

  # Uncomment and replace with your own node that outputs tracking points
  # (e.g., a YOLO detector, pose estimator, etc.); a minimal Python sketch follows this file.
  # - id: point_source
  #   build: pip install your-node # Replace with your node's name
  #   path: your-point-source-node # Replace with your node's path
  #   inputs:
  #     image: camera/image # If your node needs image input
  #   outputs:
  #     - points_to_track # Must output points in required format
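
No point-source node ships with this example; the commented-out placeholder above is the hook for one. A minimal sketch of what such a node could look like is below. It assumes the flat [x0, y0, x1, y1, ...] layout that the tracker's points handler reshapes to (-1, 2) in main.py, and uses two fixed dummy coordinates in place of a real detector:

# Hypothetical point_source node (not part of the commit). It emits a flat
# pyarrow array of pixel coordinates, matching the (-1, 2) reshape the
# tracker applies to its points input in main.py below.
import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

for event in node:
    if event["type"] == "INPUT" and event["id"] == "image":
        # Placeholder "detector": two fixed points near the middle of a
        # 640x480 frame. A real node would run detection on event["value"].
        points = np.array([[320.0, 240.0], [300.0, 220.0]], dtype=np.float32)
        node.send_output(
            "points_to_track",
            pa.array(points.ravel()),
            metadata=event["metadata"],
        )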

node-hub/dora-cotracker/dora_cotracker/main.py (+10, -3)

@@ -22,6 +22,7 @@ class VideoTrackingNode:
        self.buffer_size = self.model.step * 2
        self.window_frames = deque(maxlen=self.buffer_size)
        self.is_first_step = True
+       self.accept_new_points = True
        self.clicked_points = []
        self.input_points = []

@@ -59,6 +60,7 @@ class VideoTrackingNode:
            self.is_first_step = False

        if pred_tracks is not None and pred_visibility is not None:
+           self.accept_new_points = True
            tracks = pred_tracks[0, -1].cpu().numpy()
            visibility = pred_visibility[0, -1].cpu().numpy()
            visible_tracks = []
@@ -152,25 +154,29 @@ class VideoTrackingNode:
                cv2.waitKey(1)

            if event["id"] == "points":
+               if not self.accept_new_points:
+                   continue
                # Handle points from input_stream node
                metadata = event["metadata"]
                points_array = event["value"].to_numpy()
                self.input_points = points_array.reshape((-1, 2)).tolist()
+               self.accept_new_points = False
                self.is_first_step = True
            if event["id"] == "boxes2d":
-               if not self.is_first_step:
+               if not self.accept_new_points:
                    continue

                # Handle points from input_stream node
                metadata = event["metadata"]
                if isinstance(event["value"], pa.StructArray):
                    boxes2d = (
-                       event["value"][0]
+                       event["value"]
                        .get("bbox")
                        .values.to_numpy()
                        .reshape((-1, 4))
                    )
                    _labels = (
-                       event["value"][0]
+                       event["value"]
                        .get("labels")
                        .values.to_numpy(zero_copy_only=False)
                    )
@@ -184,6 +190,7 @@ class VideoTrackingNode:
                ]

                self.is_first_step = True
+               self.accept_new_points = False


def main():
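
Taken together, the main.py changes add a simple handshake: once a new set of points or boxes is registered, further targets are rejected until the tracker has produced at least one prediction for the current set, at which point accept_new_points reopens. A stripped-down sketch of that gating pattern (not from the repo, with the dora specifics removed):

# Standalone illustration (not from the repo) of the gate this commit adds.
class TrackingGate:
    def __init__(self):
        self.accept_new_points = True   # open until targets are registered
        self.is_first_step = True       # forces tracker re-initialization

    def on_new_targets(self):
        if not self.accept_new_points:
            return False                # drop targets while a prediction is pending
        self.accept_new_points = False  # close the gate
        self.is_first_step = True       # re-initialize tracking on the next frame
        return True

    def on_prediction(self):
        self.accept_new_points = True   # first prediction arrived, reopen the gate


gate = TrackingGate()
assert gate.on_new_targets()        # accepted: gate starts open
assert not gate.on_new_targets()    # rejected: still waiting for a prediction
gate.on_prediction()
assert gate.on_new_targets()        # accepted again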

