diff --git a/examples/reachy2-remote/dataflow_reachy.yml b/examples/reachy2-remote/dataflow_reachy.yml
index dc843baf..494b112d 100644
--- a/examples/reachy2-remote/dataflow_reachy.yml
+++ b/examples/reachy2-remote/dataflow_reachy.yml
@@ -4,17 +4,40 @@ nodes:
     _unstable_deploy:
       machine: encoder
     inputs:
-      tick: dora/timer/millis/10
+      tick: dora/timer/millis/20
    outputs:
       - image_left
       - image_depth
       - depth
     env:
-      CAPTURE_PATH: 0
       IMAGE_WIDTH: 640
       IMAGE_HEIGHT: 480
       ROBOT_IP: 127.0.0.1
 
+  - id: reachy-left-arm
+    build: pip install -e ../../node-hub/dora-reachy2
+    path: dora-reachy2-left-arm
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      pose: parse_pose/action_l_arm
+    outputs:
+      - response_l_arm
+    env:
+      ROBOT_IP: 127.0.0.1
+
+  - id: reachy-right-arm
+    build: pip install -e ../../node-hub/dora-reachy2
+    path: dora-reachy2-right-arm
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      pose: parse_pose/action_r_arm
+    outputs:
+      - response_r_arm
+    env:
+      ROBOT_IP: 127.0.0.1
+
   - id: rav1e-local-image
     path: dora-rav1e
     build: cargo build -p dora-rav1e --release
@@ -26,10 +49,21 @@ nodes:
     outputs:
       - image_left
       - image_depth
-      - depth
     env:
       RAV1E_SPEED: 10
 
+  - id: rav1e-local-depth
+    path: dora-rav1e
+    build: cargo build -p dora-rav1e --release
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      depth: camera/depth
+    outputs:
+      - depth
+    env:
+      RAV1E_SPEED: 7
+
   - id: dav1d-remote
     path: dora-dav1d
     build: cargo build -p dora-dav1d --release
@@ -38,7 +72,7 @@ nodes:
     inputs:
       image_depth: rav1e-local-image/image_depth
       image_left: rav1e-local-image/image_left
-      # depth: rav1e-local/depth
+      depth: rav1e-local-depth/depth
     outputs:
       - image_left
       - image_depth
@@ -87,6 +121,8 @@ nodes:
       - action
       - points
       - text
+      - action_release_left
+      - action_release_right
     env:
       IMAGE_RESIZE_RATIO: "1.0"
 
@@ -118,6 +154,17 @@ nodes:
     env:
       IMAGE_RESIZE_RATIO: "1.0"
 
+  - id: sam2
+    build: pip install -e ../../node-hub/dora-sam2
+    path: dora-sam2
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      image_depth: dav1d-remote/image_depth
+      boxes2d: parse_bbox/bbox_grab
+    outputs:
+      - masks
+
   - id: tracker
     build: pip install -e ../../node-hub/dora-cotracker
     path: dora-cotracker
@@ -132,24 +179,32 @@ nodes:
     env:
       INTERACTIVE_MODE: false
 
-  # - id: box_coordinates
-  #   build: pip install -e ../../node-hub/dora-object-to-pose
-  #   path: dora-object-to-pose
-  #   inputs:
-  #     depth: reachy-camera/depth
-  #     boxes2d: parse_bbox/bbox
-  #   outputs:
-  #     - pose
-  #- id: sam2
-  #build: pip install -e ../../node-hub/dora-sam2
-  #path: dora-sam2
-  #_unstable_deploy:
-  #machine: gpu
-  #inputs:
-  #image_left: dav1d-remote/image_left
-  #boxes2d: parse_bbox/bbox
-  #outputs:
-  #- masks
+  - id: box_coordinates
+    build: pip install -e ../../node-hub/dora-object-to-pose
+    path: dora-object-to-pose
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      depth: dav1d-remote/depth
+      masks: sam2/masks
+    outputs:
+      - pose
+
+  - id: parse_pose
+    path: parse_pose.py
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      pose: box_coordinates/pose
+      response_r_arm: reachy-right-arm/response_r_arm
+      response_l_arm: reachy-left-arm/response_l_arm
+      release_left: parse_whisper/action_release_left
+      release_right: parse_whisper/action_release_right
+    outputs:
+      - action_r_arm
+      - action_l_arm
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
 
   - id: parse_point
     path: parse_point.py
@@ -179,12 +234,15 @@ nodes:
     build: pip install -e ../../node-hub/dora-rerun
     path: dora-rerun
     _unstable_deploy:
-      machine: macbook
+      machine: gpu
    inputs:
       image: dav1d-remote/image_left
-      image_depth: dav1d-remote/image_depth
-      boxes2d: parse_bbox/bbox
+      torso/image: dav1d-remote/image_depth
+      torso/depth: dav1d-remote/depth
+      torso/boxes2d: parse_bbox/bbox
       original_text: dora-distil-whisper/text
       parsed_text: parse_whisper/text
       qwenvl_text: dora-qwenvl/text
-      tracked_image: tracker/tracked_image
+    env:
+      RERUN_MEMORY_LIMIT: 5%
+      CAMERA_PITCH: 2.47
diff --git a/examples/reachy2-remote/parse_bbox.py b/examples/reachy2-remote/parse_bbox.py
index 143404ac..88667769 100644
--- a/examples/reachy2-remote/parse_bbox.py
+++ b/examples/reachy2-remote/parse_bbox.py
@@ -54,20 +54,23 @@ for event in node:
             continue
 
         text = event["value"][0].as_py()
+        metadata = event["metadata"]
        image_id = event["metadata"]["image_id"]
         bboxes, labels = extract_bboxes(text)
         if bboxes is not None and len(bboxes) > 0:
             bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO)
+            metadata["image_id"] = image_id
+            metadata["encoding"] = "xyxy"
             if image_id == "image_left":
                 node.send_output(
                     "bbox_track",
                     pa.array(bboxes.ravel()),
-                    metadata={"encoding": "xyxy", "image_id": image_id},
+                    metadata,
                 )
             elif image_id == "image_depth":
                 node.send_output(
                     "bbox_grab",
                     pa.array(bboxes.ravel()),
-                    metadata={"encoding": "xyxy", "image_id": image_id},
+                    metadata,
                 )
diff --git a/examples/reachy2-remote/parse_point.py b/examples/reachy2-remote/parse_point.py
index e3401a4e..7e9990da 100644
--- a/examples/reachy2-remote/parse_point.py
+++ b/examples/reachy2-remote/parse_point.py
@@ -29,18 +29,24 @@ for event in node:
             point = values[-1]
 
             rz = int((width / 2) - point[0]) / (width / 2)
-            x_distance = min(height / 2, height - point[1])
-
-            if abs(rz) > 0.3:
-                rz = np.deg2rad(30) * np.sign(rz)
+            x_distance = min(height, height - point[1])
+
+            if abs(rz) > 0.75:
+                rz = np.deg2rad(90) * np.sign(rz)
+            elif abs(rz) > 0.5:
+                rz = np.deg2rad(60) * np.sign(rz)
+            elif abs(rz) > 0.3:
+                rz = np.deg2rad(55) * np.sign(rz)
             elif abs(rz) > 0.1:
-                rz = np.deg2rad(20) * np.sign(rz)
+                rz = np.deg2rad(45) * np.sign(rz)
             else:
                 x = 0
-                if x_distance > (height * 0.3):
-                    x = 0.7
-                elif x_distance > (height * 0.15):
+                if x_distance > (height * 0.2):
                     x = 0.5
                 else:
                     x = 0
diff --git a/examples/reachy2-remote/parse_pose.py b/examples/reachy2-remote/parse_pose.py
index e69de29b..042b6c0a 100644
--- a/examples/reachy2-remote/parse_pose.py
+++ b/examples/reachy2-remote/parse_pose.py
@@ -0,0 +1,291 @@
+"""Parse object poses and drive the Reachy 2 arms through grab and release trajectories."""
+
+import os
+
+import numpy as np
+import pyarrow as pa
+from dora import Node
+
+node = Node()
+
+IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
+
+
+l_init_pose = [
+    -7.0631310641087435,
+    -10.432298603362307,
+    24.429809104404114,
+    -132.15000828778648,
+    -1.5494749438811133,
+    -21.749917789205202,
+    8.099312596108344,
+    100,
+]
+r_init_pose = [
+    -5.60273587426976,
+    10.780818397272316,
+    -27.868146823156042,
+    -126.15650363072193,
+    3.961108018106834,
+    -35.43682799906162,
+    350.9236448374495,
+    100,
+]
+r_release_closed_pose = [
+    -26.1507947940993,
+    12.16735021387949,
+    -2.2657319092611976,
+    -97.63648867582175,
+    -19.91084837404425,
+    22.10184328619011,
+    366.71351223614494,
+    0,
+]
+
+r_release_opened_pose = [
+    -26.1507947940993,
+    12.16735021387949,
+    -2.2657319092611976,
+    -97.63648867582175,
+    -19.91084837404425,
+    22.10184328619011,
+    366.71351223614494,
+    100,
+]
+
+l_release_opened_pose = [
+    -30.04330081906935,
+    -7.415231584691132,
+    3.6972339048071468,
+    -97.7274736257555,
+    12.996718740452982,
+    30.838020649757016,
+    -1.5572310505704858,
+    0,
+]
+
+l_release_closed_pose = [
+    -30.04330081906935,
+    -7.415231584691132,
+    3.6972339048071468,
+    -97.7274736257555,
+    12.996718740452982,
+    30.838020649757016,
+    -1.5572310505704858,
+    100,
+]
+
+
+def wait_for_event(id, timeout=None, cache=None):
+    """Wait for the input with the given id, caching any other inputs received while waiting."""
+    if cache is None:
+        cache = {}
+    while True:
+        event = node.next(timeout=timeout)
+        if event is None:
+            cache["finished"] = True
+            return None, cache
+        if event["type"] == "INPUT":
+            cache[event["id"]] = event["value"]
+            if event["id"] == id:
+                return event["value"], cache
+
+        elif event["type"] == "ERROR":
+            return None, cache
+
+
+arm_holding_object = None
+cache = {}
+
+
+## ---- INIT ---
+node.send_output(
+    "action_r_arm",
+    pa.array(r_init_pose),
+    metadata={"encoding": "jointstate", "duration": 2},
+)
+node.send_output(
+    "action_l_arm",
+    pa.array(l_init_pose),
+    metadata={"encoding": "jointstate", "duration": 2},
+)
+
+for event in node:
+    if event["type"] == "INPUT":
+        if event["id"] == "pose":
+            values = event["value"]
+            values = values.to_numpy()
+            print("Pose: ", values)
+            if len(values) == 0:
+                continue
+            x = values[0]
+            y = values[1]
+            z = values[2]
+            action = event["metadata"]["action"]
+
+            match action:
+                case "grab":
+                    if len(values) == 0:
+                        continue
+                    x = x + 0.03
+
+                    ## Clip the maximum and minimum values for the height of the arm to avoid collision or weird movement.
+                    trajectory = np.array(
+                        [
+                            [x, y, -0.16, 0, 0, 0, 100],
+                            [x, y, z, 0, 0, 0, 0],
+                            [x, y, -0.16, 0, 0, 0, 0],
+                        ],
+                    ).ravel()
+
+                    if y < 0:
+                        node.send_output(
+                            "action_r_arm",
+                            pa.array(trajectory),
+                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
+                        )
+                        event = wait_for_event(id="response_r_arm", timeout=5)
+                        if event is not None and event[0]:
+                            print("Success")
+                            arm_holding_object = "right"
+                            node.send_output(
+                                "action_r_arm",
+                                pa.array([0.1, -0.2, -0.16, 0, 0, 0, 0]),
+                                metadata={"encoding": "xyzrpy", "duration": "1"},
+                            )
+                        else:
+                            print("Failed: x: ", x, " y: ", y, " z: ", z)
+                            node.send_output(
+                                "action_r_arm",
+                                pa.array(r_init_pose),
+                                metadata={"encoding": "jointstate", "duration": "1"},
+                            )
+                            event = wait_for_event(id="response_r_arm")
+                    else:
+                        y += 0.03
+                        node.send_output(
+                            "action_l_arm",
+                            pa.array(trajectory),
+                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
+                        )
+                        event = wait_for_event(id="response_l_arm", timeout=5)
+                        if event is not None and event[0]:
+                            print("Success")
+                            arm_holding_object = "left"
+                            node.send_output(
+                                "action_l_arm",
+                                pa.array([0.1, 0.2, -0.16, 0, 0, 0, 0]),
+                                metadata={"encoding": "xyzrpy", "duration": "1"},
+                            )
+                        else:
+                            print("Failed")
+                            node.send_output(
+                                "action_l_arm",
+                                pa.array(l_init_pose),
+                                metadata={"encoding": "jointstate", "duration": "1"},
+                            )
+                            event = wait_for_event(id="response_l_arm")
+                case "release":
+                    if len(values) == 0:
+                        continue
+                    x = x + 0.03
+
+                    ## Clip the maximum and minimum values for the height of the arm to avoid collision or weird movement.
+                    trajectory = np.array(
+                        [
+                            [x, y, -0.16, 0, 0, 0, 100],
+                        ],
+                    ).ravel()
+
+                    if y < 0:
+                        node.send_output(
+                            "action_r_arm",
+                            pa.array(trajectory),
+                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
+                        )
+                        event = wait_for_event(id="response_r_arm", timeout=5)
+                        if event is not None and event[0]:
+                            print("Success")
+                            arm_holding_object = "right"
+                            node.send_output(
+                                "action_r_arm",
+                                pa.array(r_init_pose),
+                                metadata={"encoding": "jointstate", "duration": 1},
+                            )
+                        else:
+                            print("Failed: x: ", x, " y: ", y, " z: ", z)
+                            node.send_output(
+                                "action_r_arm",
+                                pa.array(r_init_pose),
+                                metadata={"encoding": "jointstate", "duration": "1"},
+                            )
+                            event = wait_for_event(id="response_r_arm")
+                    else:
+                        y += 0.03
+                        node.send_output(
+                            "action_l_arm",
+                            pa.array(trajectory),
+                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
+                        )
+                        event = wait_for_event(id="response_l_arm", timeout=5)
+                        if event is not None and event[0]:
+                            print("Success")
+                            arm_holding_object = "left"
+                            node.send_output(
+                                "action_l_arm",
+                                pa.array(l_init_pose),
+                                metadata={"encoding": "jointstate", "duration": 1},
+                            )
+                        else:
+                            print("Failed")
+                            node.send_output(
+                                "action_l_arm",
+                                pa.array(l_init_pose),
+                                metadata={"encoding": "jointstate", "duration": "1"},
+                            )
+                            event = wait_for_event(id="response_l_arm")
+
+        elif event["id"] == "release_right":
+            node.send_output(
+                "action_r_arm",
+                pa.array(
+                    [
+                        0.4,
+                        0,
+                        -0.16,
+                        0,
+                        0,
+                        0,
+                        100,
+                    ],
+                ),
+                metadata={"encoding": "xyzrpy", "duration": "0.75"},
+            )
+            event, cache = wait_for_event(id="response_r_arm", cache=cache)
+            node.send_output(
+                "action_r_arm",
+                pa.array(r_init_pose),
+                metadata={"encoding": "jointstate", "duration": 1},
+            )
+        elif event["id"] == "release_left":
+            node.send_output(
+                "action_l_arm",
+                pa.array(
+                    [
+                        0.4,
+                        0,
+                        -0.16,
+                        0,
+                        0,
+                        0,
+                        100,
+                    ],
+                ),
+                metadata={"encoding": "xyzrpy", "duration": "0.75"},
+            )
+            event, cache = wait_for_event(id="response_l_arm", cache=cache)
+
+            node.send_output(
+                "action_l_arm",
+                pa.array(l_init_pose),
+                metadata={"encoding": "jointstate", "duration": 1},
+            )
diff --git a/examples/reachy2-remote/parse_whisper.py b/examples/reachy2-remote/parse_whisper.py
index 74211806..99a5e47f 100644
--- a/examples/reachy2-remote/parse_whisper.py
+++ b/examples/reachy2-remote/parse_whisper.py
@@ -59,8 +59,21 @@ for event in node:
             node.send_output("text", pa.array([text]), {"image_id": "image_left"})
         elif "grab" in text:
             text = f"Given the prompt: {text}. Output the bounding boxes for the given grabbed object"
-            node.send_output("text", pa.array([text]), {"image_id": "image_depth"})
-        elif "left" in text:
+            node.send_output(
+                "text", pa.array([text]), {"image_id": "image_depth", "action": "grab"}
+            )
+        elif "put " in text:
+            text = f"Given the prompt: {text}. Output the bounding boxes for the place to put the object"
+            node.send_output(
+                "text",
+                pa.array([text]),
+                {"image_id": "image_depth", "action": "release"},
+            )
+        elif "release left" in text:
+            node.send_output("action_release_left", pa.array([1.0]))
+        elif "release right" in text:
+            node.send_output("action_release_right", pa.array([1.0]))
+        elif "turn left" in text:
             action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)])
             node.send_output("action", action)
             time.sleep(0.25)
@@ -70,7 +83,7 @@ for event in node:
             action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)])
             node.send_output("action", action)
             node.send_output("points", pa.array([]))
-        elif "right" in text:
+        elif "turn right" in text:
             action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)])
             node.send_output("action", action)
             time.sleep(0.25)
diff --git a/node-hub/dora-object-to-pose/src/lib.rs b/node-hub/dora-object-to-pose/src/lib.rs
index 8c2a3779..98aadad1 100644
--- a/node-hub/dora-object-to-pose/src/lib.rs
+++ b/node-hub/dora-object-to-pose/src/lib.rs
@@ -1,7 +1,7 @@
 use core::f32;
 use dora_node_api::{
     arrow::{
-        array::{AsArray, Float64Array, UInt8Array},
+        array::{AsArray, Float64Array, UInt16Array, UInt8Array},
         datatypes::{Float32Type, Int64Type},
     },
     dora_core::config::DataId,
@@ -11,7 +11,7 @@ use eyre::Result;
 use std::collections::HashMap;
 
 fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> {
-    let (_x, _y, _z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) =
+    let (sum_x, sum_y, sum_z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) =
         points.iter().fold(
             (
                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.0, -10.0, 10.0, -10.0, 10., -10.0,
             ),
@@ -49,11 +49,7 @@ fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> {
             )
         },
     );
-    let (mean_x, mean_y, mean_z) = (
-        (x_max + x_min) / 2.,
-        (y_max + y_min) / 2.,
-        (z_max + z_min) / 2.,
-    );
+    let (mean_x, mean_y, mean_z) = (sum_x / n, sum_y / n, sum_z / n);
 
     // Compute covariance and standard deviations
     let cov = sum_xy / n - mean_x * mean_y;
@@ -116,7 +112,8 @@ pub fn lib_main() -> Result<()> {
                     } else {
                         vec![640, 480]
                     };
-                    let buffer: &Float64Array = data.as_any().downcast_ref().unwrap();
+                    let buffer: &UInt16Array = data.as_any().downcast_ref().unwrap();
+
                     depth_frame = Some(buffer.clone());
                 }
                 "masks" => {
@@ -137,6 +134,8 @@ pub fn lib_main() -> Result<()> {
                         continue;
                     };
 
+                    let mut z_2 = 0.0;
+                    let mut z_1 = 0.0;
                     let outputs: Vec<Vec<f32>> = masks
                         .chunks(height as usize * width as usize)
                         .filter_map(|data| {
@@ -150,23 +149,36 @@ pub fn lib_main() -> Result<()> {
                                 let v = i as f32 / width as f32; // Calculate y-coordinate (v)
 
                                 if let Some(z) = z {
-                                    let z = z as f32;
+                                    let z = (z as f32) / 1000.;
 
                                     // Skip points that have empty depth or is too far away
                                     if z == 0. || z > 20.0 {
                                         return;
                                     }
-                                    if data[i] {
-                                        let y = (u - resolution[0] as f32) * z
-                                            / focal_length[0] as f32;
-                                        let x = (v - resolution[1] as f32) * z
-                                            / focal_length[1] as f32;
-                                        let new_x = sin_theta * z + cos_theta * x;
-                                        let new_y = -y;
-                                        let new_z = cos_theta * z - sin_theta * x;
+                                    if z_2 == 0. && z_1 == 0. {
+                                        z_1 = z;
+                                    } else if z_2 == 0. {
+                                        z_2 = z_1;
+                                        z_1 = z;
+                                    } else if (z - z_2).abs() < 0.1 && (z - z_1).abs() < 0.1 {
+                                        z_2 = z_1;
+                                        z_1 = z;
 
-                                        points.push((new_x, new_y, new_z));
-                                        z_total += new_z;
-                                        n += 1.;
+                                        if data[i] {
+                                            let y = (u - resolution[0] as f32) * z
+                                                / focal_length[0] as f32;
+                                            let x = (v - resolution[1] as f32) * z
+                                                / focal_length[1] as f32;
+                                            let new_x = sin_theta * z + cos_theta * x;
+                                            let new_y = -y;
+                                            let new_z = cos_theta * z - sin_theta * x;
+
+                                            points.push((new_x, new_y, new_z));
+                                            z_total += new_z;
+                                            n += 1.;
+                                        }
+                                    } else {
+                                        z_2 = z_1;
+                                        z_1 = z;
                                     }
                                 }
                             });
@@ -215,7 +227,7 @@ pub fn lib_main() -> Result<()> {
                                 let v = i as f32 / width as f32; // Calculate y-coordinate (v)
 
                                 if let Some(z) = z {
-                                    let z = z as f32;
+                                    let z = (z as f32) / 1000.;
 
                                     // Skip points that have empty depth or is too far away
                                     if z == 0. || z > 5.0 {
                                         return;
diff --git a/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py b/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py
index 8a7ade0c..3125858c 100644
--- a/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py
+++ b/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py
@@ -229,10 +229,12 @@ def main():
                 past_key_values,
                 image_id,
             )
+            metadata = event["metadata"]
+            metadata["image_id"] = image_id if image_id is not None else "all"
             node.send_output(
                 "text",
                 pa.array([response]),
-                {"image_id": image_id if image_id is not None else "all"},
+                metadata,
             )
 
         elif event_type == "ERROR":
diff --git a/node-hub/dora-sam2/dora_sam2/main.py b/node-hub/dora-sam2/dora_sam2/main.py
index d2612cac..37b216a9 100644
--- a/node-hub/dora-sam2/dora_sam2/main.py
+++ b/node-hub/dora-sam2/dora_sam2/main.py
@@ -133,7 +133,9 @@ def main():
                     )
 
             if "boxes2d" in event_id:
-
+                if len(event["value"]) == 0:
+                    node.send_output("masks", pa.array([]))
+                    continue
                 if isinstance(event["value"], pa.StructArray):
                     boxes2d = event["value"][0].get("bbox").values.to_numpy()
                     labels = (
@@ -162,7 +164,59 @@ def main():
                 ):
                     predictor.set_image(frames[image_id])
                     masks, _scores, last_pred = predictor.predict(
-                        box=boxes2d, point_labels=labels, multimask_output=False,
+                        box=boxes2d,
+                        point_labels=labels,
+                        multimask_output=False,
                     )
+
+                    if len(masks.shape) == 4:
+                        masks = masks[:, 0, :, :]
+                        last_pred = last_pred[:, 0, :, :]
+                    else:
+                        masks = masks[0, :, :]
+                        last_pred = last_pred[0, :, :]
+
+                    masks = masks > 0
+                    metadata["image_id"] = image_id
+                    metadata["width"] = frames[image_id].width
+                    metadata["height"] = frames[image_id].height
+                    ## Mask to 3 channel image
+                    match return_type:
+                        case pa.Array:
+                            node.send_output("masks", pa.array(masks.ravel()), metadata)
+                        case pa.StructArray:
+                            node.send_output(
+                                "masks",
+                                pa.array(
+                                    [
+                                        {
+                                            "masks": masks.ravel(),
+                                            "labels": event["value"]["labels"],
+                                        },
+                                    ],
+                                ),
+                                metadata,
+                            )
+            elif "points" in event_id:
+                points = event["value"].to_numpy().reshape((-1, 2))
+                return_type = pa.Array
+                if len(frames) == 0:
+                    continue
+                first_image = next(iter(frames.keys()))
+                image_id = event["metadata"].get("image_id", first_image)
+                with (
+                    torch.inference_mode(),
+                    torch.autocast(
+                        "cuda",
+                        dtype=torch.bfloat16,
+                    ),
+                ):
+                    predictor.set_image(frames[image_id])
+                    labels = [i for i in range(len(points))]
+                    masks, _scores, last_pred = predictor.predict(
+                        points,
+                        point_labels=labels,
+                        multimask_output=False,
+                    )
 
                     if len(masks.shape) == 4:
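
Note on the control flow above: every grab/release branch in parse_pose.py follows the same send-then-acknowledge pattern against the dora Python API — publish a trajectory on an action output, then block until the matching response input arrives or a timeout elapses. A minimal sketch of that pattern, assuming the output/input names from the dataflow above (the send_and_wait helper itself is hypothetical, not part of the example):

import pyarrow as pa
from dora import Node

node = Node()


def send_and_wait(action_id, response_id, trajectory, timeout=5):
    """Publish a trajectory, then block until the arm node acknowledges it."""
    node.send_output(
        action_id,
        pa.array(trajectory),
        metadata={"encoding": "xyzrpy", "duration": "0.75"},
    )
    while True:
        event = node.next(timeout=timeout)
        if event is None:  # timed out waiting for the arm
            return None
        if event["type"] == "INPUT" and event["id"] == response_id:
            return event["value"]  # acknowledgement payload from the arm node
        if event["type"] == "ERROR":
            return None


# Usage: reach toward (x=0.3, y=-0.2) with the right arm, gripper open (100).
ack = send_and_wait("action_r_arm", "response_r_arm", [0.3, -0.2, -0.16, 0, 0, 0, 100])
print("Success" if ack is not None else "Failed")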