diff --git a/examples/tracker/facebook_cotracker.yml b/examples/tracker/facebook_cotracker.yml
new file mode 100644
index 00000000..c81e40cb
--- /dev/null
+++ b/examples/tracker/facebook_cotracker.yml
@@ -0,0 +1,51 @@
+nodes:
+  - id: camera
+    build: pip install -e ../../node-hub/opencv-video-capture
+    path: opencv-video-capture
+    inputs:
+      tick: dora/timer/millis/100
+    outputs:
+      - image
+    env:
+      CAPTURE_PATH: "0"
+      ENCODING: "rgb8"
+      IMAGE_WIDTH: "640"
+      IMAGE_HEIGHT: "480"
+
+  - id: object-detection
+    build: pip install -e ../../node-hub/dora-yolo
+    path: dora-yolo
+    inputs:
+      image: camera/image
+    outputs:
+      - bbox
+
+  - id: tracker
+    build: pip install -e ../../node-hub/dora-cotracker
+    path: dora-cotracker
+    inputs:
+      image: camera/image
+      boxes2d: object-detection/bbox
+      # points_to_track: input/points_to_track # uncomment this if using input node
+    outputs:
+      - tracked_image
+      - points
+    env:
+      INTERACTIVE_MODE: false
+
+  - id: plot
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    inputs:
+      image: camera/image
+      tracked_image: tracker/tracked_image
+
+  # replace with your own node that outputs tracking points # uncomment if input via node
+  # (e.g., YOLO detector, pose estimator, etc.)
+  # - id: point_source
+  #   build: pip install your-node # Replace with your node's name
+  #   path: your-point-source-node # Replace with your node's path
+  #   inputs:
+  #     image: camera/image # If your node needs image input
+  #   outputs:
+  #     - points_to_track # Must output points in required format
diff --git a/examples/tracker/parse_bbox.py b/examples/tracker/parse_bbox.py
new file mode 100644
index 00000000..056b0af3
--- /dev/null
+++ b/examples/tracker/parse_bbox.py
@@ -0,0 +1,63 @@
+"""Parse bounding boxes from Qwen2.5-VL JSON output and forward them as a bbox array."""
+
+import json
+import os
+
+import numpy as np
+import pyarrow as pa
+from dora import Node
+
+node = Node()
+
+IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
+
+
+def extract_bboxes(json_text):
+    """Extract bounding boxes and labels from a JSON string wrapped in markdown markers.
+
+    Parameters
+    ----------
+    json_text : str
+        JSON string containing bounding box data, including ```json markers.
+
+    Returns
+    -------
+    tuple: NumPy arrays of bounding boxes and labels, or (None, None) if parsing fails.
+ + """ + # Ensure all lines are stripped of whitespace and markers + lines = json_text.strip().splitlines() + + # Filter out lines that are markdown markers + clean_lines = [line for line in lines if not line.strip().startswith("```")] + + # Join the lines back into a single string + clean_text = "\n".join(clean_lines) + # Parse the cleaned JSON text + try: + data = json.loads(clean_text) + + # Extract bounding boxes + bboxes = [item["bbox_2d"] for item in data] + labels = [item["label"] for item in data] + + return np.array(bboxes), np.array(labels) + except Exception as _e: # noqa + pass + return None, None + + +for event in node: + if event["type"] == "INPUT": + text = event["value"][0].as_py() + image_id = event["metadata"]["image_id"] + + bboxes, labels = extract_bboxes(text) + if bboxes is not None and len(bboxes) > 0: + bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO) + + node.send_output( + "bbox", + pa.array(bboxes.ravel()), + metadata={"encoding": "xyxy", "image_id": image_id}, + ) diff --git a/examples/tracker/qwenvl_cotracker.yml b/examples/tracker/qwenvl_cotracker.yml new file mode 100644 index 00000000..b620297d --- /dev/null +++ b/examples/tracker/qwenvl_cotracker.yml @@ -0,0 +1,67 @@ +nodes: + - id: camera + build: pip install -e ../../node-hub/opencv-video-capture + path: opencv-video-capture + inputs: + tick: dora/timer/millis/100 + outputs: + - image + env: + CAPTURE_PATH: "0" + ENCODING: "rgb8" + IMAGE_WIDTH: "640" + IMAGE_HEIGHT: "480" + + - id: dora-qwenvl + build: pip install -e ../../node-hub/dora-qwen2-5-vl + path: dora-qwen2-5-vl + inputs: + image: camera/image + text_1: dora/timer/millis/600 + outputs: + - text + env: + DEFAULT_QUESTION: Output the bounding box of the eyes. + IMAGE_RESIZE_RATIO: "0.5" + # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have + #SYSTEM_PROMPT: You're a robot. + + - id: parse_bbox + path: parse_bbox.py + inputs: + text: dora-qwenvl/text + outputs: + - bbox + env: + IMAGE_RESIZE_RATIO: "0.5" + + - id: tracker + build: pip install -e ../../node-hub/dora-cotracker + path: dora-cotracker + inputs: + image: camera/image + boxes2d: parse_bbox/bbox + # points_to_track: input/points_to_track # uncomment this if using input node + outputs: + - tracked_image + - points + env: + INTERACTIVE_MODE: false + + - id: plot + build: pip install -e ../../node-hub/dora-rerun + path: dora-rerun + inputs: + image: camera/image + boxes2d: parse_bbox/bbox + tracked_image: tracker/tracked_image + + # replace with your own node that outputs tracking points # uncomment if input via node + # (e.g., YOLO detector, pose estimator, etc.) + # - id: point_source + # build: pip install your-node # Replace with your node's name + # path: your-point-source-node # Replace with your node's path + # inputs: + # image: camera/image # If your node needs image input + # outputs: + # - points_to_track # Must output points in required format diff --git a/node-hub/dora-cotracker/demo.yml b/node-hub/dora-cotracker/demo.yml index 6bb36707..240237f2 100644 --- a/node-hub/dora-cotracker/demo.yml +++ b/node-hub/dora-cotracker/demo.yml @@ -13,14 +13,14 @@ nodes: IMAGE_HEIGHT: "480" - id: tracker - build: pip install dora-cotracker + build: pip install -e . 
     path: dora-cotracker
     inputs:
       image: camera/image
       # points_to_track: input/points_to_track # uncomment this if using input node
     outputs:
       - tracked_image
-      - tracked_points
+      - points
 
   - id: plot
     build: pip install dora-rerun
@@ -29,8 +29,7 @@ nodes:
       image: camera/image
       tracked_image: tracker/tracked_image
 
-
-  # replace with your own node that outputs tracking points # uncomment if input via node
+  # replace with your own node that outputs tracking points # uncomment if input via node
   # (e.g., YOLO detector, pose estimator, etc.)
   # - id: point_source
   #   build: pip install your-node # Replace with your node's name
@@ -38,4 +37,4 @@ nodes:
   #   inputs:
   #     image: camera/image # If your node needs image input
   #   outputs:
-  #     - points_to_track # Must output points in required format
\ No newline at end of file
+  #     - points_to_track # Must output points in required format
diff --git a/node-hub/dora-cotracker/dora_cotracker/main.py b/node-hub/dora-cotracker/dora_cotracker/main.py
index 5e4c08a4..dcfaeb54 100644
--- a/node-hub/dora-cotracker/dora_cotracker/main.py
+++ b/node-hub/dora-cotracker/dora_cotracker/main.py
@@ -1,9 +1,14 @@
+import os
+from collections import deque
+
+import cv2
 import numpy as np
 import pyarrow as pa
-from dora import Node
-import cv2
 import torch
-from collections import deque
+from dora import Node
+
+INTERACTIVE_MODE = os.getenv("INTERACTIVE_MODE", "false").lower() == "true"
+
 
 class VideoTrackingNode:
     def __init__(self):
@@ -12,10 +17,12 @@ class VideoTrackingNode:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = torch.hub.load("facebookresearch/co-tracker", "cotracker3_online")
         self.model = self.model.to(self.device)
+        self.model.eval()
         self.model.step = 8
-        self.buffer_size = self.model.step * 2 
+        self.buffer_size = self.model.step * 2
         self.window_frames = deque(maxlen=self.buffer_size)
         self.is_first_step = True
+        self.accept_new_points = True
         self.clicked_points = []
         self.input_points = []
 
@@ -29,14 +36,12 @@ class VideoTrackingNode:
         """Process frame for tracking"""
         if len(self.window_frames) == self.buffer_size:
             all_points = self.input_points + self.clicked_points
-            
+
            if not all_points:
                 print("No points to track")
                 return None, None
-
             video_chunk = torch.tensor(
-                np.stack(list(self.window_frames)),
-                device=self.device
+                np.stack(list(self.window_frames)), device=self.device
             ).float()
             video_chunk = video_chunk / 255.0
             # Reshape to [B,T,C,H,W]
@@ -50,11 +55,12 @@ class VideoTrackingNode:
                 is_first_step=self.is_first_step,
                 grid_size=0,
                 queries=queries,
-                add_support_grid=False
+                add_support_grid=False,
             )
             self.is_first_step = False
 
             if pred_tracks is not None and pred_visibility is not None:
+                self.accept_new_points = True
                 tracks = pred_tracks[0, -1].cpu().numpy()
                 visibility = pred_visibility[0, -1].cpu().numpy()
                 visible_tracks = []
@@ -66,84 +72,131 @@ class VideoTrackingNode:
                 frame_viz = frame.copy()
                 num_input_stream = len(self.input_points)
                 # Draw input points in red
-                for i, (pt, vis) in enumerate(zip(tracks[:num_input_stream], visibility[:num_input_stream])):
+                for i, (pt, vis) in enumerate(
+                    zip(tracks[:num_input_stream], visibility[:num_input_stream])
+                ):
                     if vis > 0.5:
                         x, y = int(pt[0]), int(pt[1])
-                        cv2.circle(frame_viz, (x, y), radius=3,
-                                   color=(0, 255, 0), thickness=-1)
-                        cv2.putText(frame_viz, f"I{i}", (x + 5, y - 5),
-                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
-
+                        cv2.circle(
+                            frame_viz, (x, y), radius=3, color=(0, 255, 0), thickness=-1
+                        )
+                        cv2.putText(
+                            frame_viz,
+                            f"I{i}",
+                            (x + 5, y - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX,
+                            0.5,
+                            (0, 255, 0),
+                            1,
+                        )
+
                 # Draw clicked points in red
-                for i, (pt, vis) in enumerate(zip(tracks[num_input_stream:], visibility[num_input_stream:])):
+                for i, (pt, vis) in enumerate(
+                    zip(tracks[num_input_stream:], visibility[num_input_stream:])
+                ):
                     if vis > 0.5:
                         x, y = int(pt[0]), int(pt[1])
-                        cv2.circle(frame_viz, (x, y), radius=3,
-                                   color=(0, 0, 255), thickness=-1)
-                        cv2.putText(frame_viz, f"C{i}", (x + 5, y - 5),
-                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
-
+                        cv2.circle(
+                            frame_viz, (x, y), radius=3, color=(0, 0, 255), thickness=-1
+                        )
+                        cv2.putText(
+                            frame_viz,
+                            f"C{i}",
+                            (x + 5, y - 5),
+                            cv2.FONT_HERSHEY_SIMPLEX,
+                            0.5,
+                            (0, 0, 255),
+                            1,
+                        )
+
                 # Send tracked points
                 if len(visible_tracks) > 0:
                     self.node.send_output(
-                        "tracked_points",
+                        "points",
                         pa.array(visible_tracks.ravel()),
                         {
                             "num_points": len(visible_tracks),
                             "dtype": "float32",
-                            "shape": (len(visible_tracks), 2)
-                        }
+                            "shape": (len(visible_tracks), 2),
+                        },
                     )
-
+
                 return frame, frame_viz
         return None, None
 
     def run(self):
         """Main run loop"""
-        cv2.namedWindow("Raw Feed", cv2.WINDOW_NORMAL)
-        cv2.setMouseCallback("Raw Feed", self.mouse_callback)
+        if INTERACTIVE_MODE:
+            cv2.namedWindow("Interactive Feed to track point", cv2.WINDOW_NORMAL)
+            cv2.setMouseCallback("Interactive Feed to track point", self.mouse_callback)
 
         for event in self.node:
             if event["type"] == "INPUT":
                 if event["id"] == "image":
                     metadata = event["metadata"]
-                    frame = event["value"].to_numpy().reshape((
-                        metadata["height"],
-                        metadata["width"],
-                        3
-                    ))
+                    frame = (
+                        event["value"]
+                        .to_numpy()
+                        .reshape((metadata["height"], metadata["width"], 3))
+                    )
 
                     # Add frame to tracking window
                     self.window_frames.append(frame)
                     original_frame, tracked_frame = self.process_tracking(frame)
 
                     if original_frame is not None and tracked_frame is not None:
-                        self.node.send_output("image",
-                            pa.array(original_frame.ravel()),
-                            metadata
-                        )
-                        self.node.send_output("tracked_image",
-                            pa.array(tracked_frame.ravel()),
-                            metadata
+                        self.node.send_output(
+                            "tracked_image", pa.array(tracked_frame.ravel()), metadata
                         )
 
-                    display_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                    cv2.imshow("Raw Feed", display_frame)
-                    cv2.waitKey(1)
-
-                if event["id"] == "points_to_track":
+                    if INTERACTIVE_MODE:
+                        display_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                        cv2.imshow("Interactive Feed to track point", display_frame)
+                        cv2.waitKey(1)
+
+                if event["id"] == "points":
+                    if not self.accept_new_points:
+                        continue
                     # Handle points from input_stream node
                     metadata = event["metadata"]
                     points_array = event["value"].to_numpy()
-                    num_points = metadata["num_points"]
-                    self.input_points = points_array.reshape((num_points, 2)).tolist()
+                    self.input_points = points_array.reshape((-1, 2)).tolist()
+                    self.accept_new_points = False
                     self.is_first_step = True
-                    print(f"Received {num_points} points from input_stream")
+
+                if event["id"] == "boxes2d":
+                    if not self.accept_new_points:
+                        continue
+
+                    # Handle bounding boxes from the upstream detection node
+                    metadata = event["metadata"]
+                    if isinstance(event["value"], pa.StructArray):
+                        boxes2d = (
+                            event["value"]
+                            .get("bbox")
+                            .values.to_numpy()
+                            .reshape((-1, 4))
+                        )
+                        _labels = (
+                            event["value"]
+                            .get("labels")
+                            .values.to_numpy(zero_copy_only=False)
+                        )
+                    else:
+                        boxes2d = event["value"].to_numpy().reshape((-1, 4))
+                        _labels = None
+                    self.input_points = [
+                        [int((x_min + x_max) / 2), int((y_min + y_max) / 2)]
+                        for x_min, y_min, x_max, y_max in boxes2d
+                    ]
+
+                    self.is_first_step = True
+                    self.accept_new_points = False
 
 
 def main():
     tracker = VideoTrackingNode()
     tracker.run()
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/node-hub/dora-cotracker/pyproject.toml b/node-hub/dora-cotracker/pyproject.toml
index 3359a888..a27d21c6 100644
--- a/node-hub/dora-cotracker/pyproject.toml
+++ b/node-hub/dora-cotracker/pyproject.toml
@@ -1,11 +1,9 @@
 [project]
 name = "dora-cotracker"
 version = "0.1.0"
-authors = [
-    { name = "Shashwat Patil", email = "shashwatpatil974@gmail.com" }
-]
+authors = [{ name = "Shashwat Patil", email = "shashwatpatil974@gmail.com" }]
 description = "A Dora node implementing real-time object tracking using Facebook's CoTracker model"
-license = { text = "MIT" }
+license = "CC-BY-1.0"
 readme = "README.md"
 requires-python = ">=3.10"
 
@@ -26,9 +24,9 @@ dora-cotracker = "dora_cotracker.main:main"
 
 [tool.ruff.lint]
 extend-select = [
-  "PERF", # Performance
-  "RET", # Return statements
-  "RSE", # Runtime errors
-  "NPY", # NumPy
-  "N", # Naming
+    "PERF", # Performance
+    "RET",  # Return statements
+    "RSE",  # Runtime errors
+    "NPY",  # NumPy
+    "N",    # Naming
 ]
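Note: the tracker's `points` output is sent as a flattened float array with `num_points`, `dtype`, and `shape` metadata (see the `send_output` call in main.py above). A minimal sketch of a downstream consumer is given below; the consumer itself and its dataflow wiring are illustrative assumptions, not part of this patch.

    """Illustrative consumer of the tracker's `points` output (not part of this patch)."""

    from dora import Node

    node = Node()

    for event in node:
        if event["type"] == "INPUT" and event["id"] == "points":
            num_points = event["metadata"]["num_points"]
            # The tracker flattens (N, 2) pixel coordinates before sending.
            points = event["value"].to_numpy().reshape((num_points, 2))
            print(f"Received {num_points} tracked points")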