Pick place demo (#793)

Demo of Reachy doing a pick-and-place exercise.
1 year ago · 269d23e592
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/apis/python/operator/src/lib.rs
+++ b/apis/python/operator/src/lib.rs
@@ -199,6 +199,12 @@ pub fn pydict_to_metadata(dict: Option<Bound<'_, PyDict>>) -> Result<MetadataPar
            {
                let list: Vec<f64> = value.extract()?;
                parameters.insert(key, Parameter::ListFloat(list))
            } else if value.is_instance_of::<PyList>()
                && value.len()? > 0
                && value.get_item(0)?.is_exact_instance_of::<PyString>()
            {
                let list: Vec<String> = value.extract()?;
                parameters.insert(key, Parameter::ListString(list))
            } else {
                println!("could not convert type {value}");
                parameters.insert(key, Parameter::String(value.str()?.to_string()))
@@ -233,6 +239,9 @@ pub fn metadata_to_pydict<'a>(
            Parameter::ListFloat(l) => dict
                .set_item(k, l)
                .context("Could not insert metadata into python dictionary")?,
            Parameter::ListString(l) => dict
                .set_item(k, l)
                .context("Could not insert metadata into python dictionary")?,
        }
    }

--- a/examples/reachy2/parse_bbox_minimal.py
+++ b/examples/reachy2/parse_bbox_minimal.py
@@ -0,0 +1,81 @@
 import json
 import os

 import numpy as np
 import pyarrow as pa
 from dora import Node

 node = Node()

 IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


 def extract_bboxes(json_text) -> (np.ndarray, np.ndarray):
    """
    Parse bounding boxes and labels out of a JSON answer wrapped in markdown fences.

    Parameters:
    json_text (str): JSON string describing a list of objects, each carrying a
        "bbox_2d" and a "label" entry; lines starting with ``` are ignored.

    Returns:
    tuple: (bboxes, labels) as two NumPy arrays, or (None, None) when the text
        cannot be parsed into that structure.
    """
    # Drop the markdown fence lines and rebuild the JSON payload
    payload = "\n".join(
        row for row in json_text.strip().splitlines() if not row.strip().startswith("```")
    )

    try:
        entries = json.loads(payload)
        boxes = np.array([entry["bbox_2d"] for entry in entries])
        names = np.array([entry["label"] for entry in entries])
    except Exception as _e:  # noqa
        # Best effort: any malformed answer is reported as "nothing found"
        return None, None
    return boxes, names


 # Most recent instruction received on the "prompt" input. It stays None until
 # the first prompt arrives, so an early "text" event cannot hit an undefined name.
 prompt = None

 # For every VLM answer ("text"), keep the boxes whose label appears in the
 # current prompt, ordered by where the label occurs in the prompt, and publish
 # them on the "bbox" output.
 for event in node:
    if event["type"] != "INPUT":
        continue

    if event["id"] == "prompt":
        prompt = event["value"][0].as_py()

    elif event["id"] == "text":
        text = event["value"][0].as_py()
        image_id = event["metadata"]["image_id"]

        if prompt is None:
            # Labels can only be matched once a prompt is known
            continue

        bboxes, labels = extract_bboxes(text)
        if bboxes is None or len(bboxes) == 0:
            continue

        # Map the boxes back to full-resolution pixel coordinates. Dividing by
        # the ratio (instead of multiplying by a truncated integer factor)
        # stays correct for ratios such as 0.3 or 2.0; the result is rounded
        # to integer pixels.
        bboxes = np.rint(bboxes / IMAGE_RESIZE_RATIO).astype(np.int64)

        idx = []
        order = []
        for label in np.unique(labels):
            if label in prompt:
                # Remember where the label starts in the prompt
                order.append(prompt.index(label))
                # Keep the first box carrying that label
                idx.append(np.where(labels == label)[0][0])

        if len(idx) == 0:
            continue

        # Reorder the selected boxes by their label position in the prompt
        idx = np.array(idx)[np.argsort(order)].ravel()
        bboxes = bboxes[idx]

        # Two labels resolving to the same box cannot be used downstream
        if len(np.unique(bboxes, axis=0)) != len(bboxes):
            print("Duplicated box")
            continue

        node.send_output(
            "bbox",
            pa.array([{"bbox": bboxes.ravel(), "labels": labels[idx]}]),
            metadata={"encoding": "xyxy", "image_id": image_id},
        )
--- a/examples/reachy2/pick-place-dev.yml
+++ b/examples/reachy2/pick-place-dev.yml
@@ -0,0 +1,151 @@
 nodes:
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    inputs:
      tick: dora/timer/millis/2000
    outputs:
      - audio

  - id: sam2
    build: pip install -e ../../node-hub/dora-sam2
    path: dora-sam2
    inputs:
      image_depth: reachy-camera/image_depth
      boxes2d: parse_bbox/bbox
    outputs:
      - masks

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio

  - id: dora-distil-whisper
    build: pip install -e ../../node-hub/dora-distil-whisper
    path: dora-distil-whisper
    inputs:
      input: dora-vad/audio
    outputs:
      - text
    env:
      TARGET_LANGUAGE: english
      TRANSLATE: true

  - id: reachy-mobile-base
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-mobile-base
    inputs:
      action_base: state_machine/action_base
    outputs:
      - response_base

  - id: reachy-left-arm
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-left-arm
    inputs:
      pose: state_machine/action_l_arm
    outputs:
      - response_l_arm

  - id: reachy-right-arm
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-right-arm
    inputs:
      pose: state_machine/action_r_arm
    outputs:
      - response_r_arm

  - id: reachy-camera
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-camera
    inputs:
      tick: dora/timer/millis/50
    outputs:
      - image_depth
      - depth

  - id: reachy-head
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-head
    inputs:
      boxes2d: parse_bbox/bbox_face
      look: state_machine/look

  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      # camera_left/image_right: reachy-camera/image_right
      camera_torso/image: reachy-camera/image_depth
      text_response: dora-qwenvl/text
      text_whisper: dora-distil-whisper/text
      camera_torso/boxes2d: parse_bbox/bbox
      camera_left/boxes2d_face: parse_bbox/bbox_face
    env:
      RERUN_MEMORY_LIMIT: "5%"

  - id: dora-qwenvl
    build: pip install -e ../../node-hub/dora-qwen2-5-vl
    path: dora-qwen2-5-vl
    inputs:
      image_depth: reachy-camera/image_depth
      # image_left: reachy-camera/image_left
      text_1: dora/timer/millis/600
      text_2: state_machine/text_vlm
    outputs:
      - text
    env:
      DEFAULT_QUESTION: grab human.
      IMAGE_RESIZE_RATIO: "0.5"
      # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have
      #SYSTEM_PROMPT: You're a robot.

  - id: parse_bbox
    path: parse_bbox_minimal.py
    inputs:
      text: dora-qwenvl/text
      prompt: state_machine/prompt
    outputs:
      - bbox
      - bbox_face
    env:
      IMAGE_RESIZE_RATIO: "0.5"

  - id: box_coordinates
    build: pip install -e ../../node-hub/dora-object-to-pose
    path: dora-object-to-pose
    inputs:
      depth: reachy-camera/depth
      masks: sam2/masks
    outputs:
      - pose

  - id: keyboard
    build: pip install -e ../../node-hub/dora-keyboard
    path: dora-keyboard
    inputs:
      tick: dora/timer/millis/1000
    outputs:
      - char

  - id: state_machine
    path: pick_place.py
    inputs:
      text: dora-distil-whisper/text
      response_base: reachy-mobile-base/response_base
      response_r_arm: reachy-right-arm/response_r_arm
      response_l_arm: reachy-left-arm/response_l_arm
      pose: box_coordinates/pose
    outputs:
      - text_vlm
      - action_r_arm
      - action_base
      - look
      - action_l_arm
      - prompt
    env:
      ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have put
--- a/examples/reachy2/pick_place.py
+++ b/examples/reachy2/pick_place.py
@@ -0,0 +1,401 @@
 # State Machine
 import json
 import os

 import numpy as np
 import pyarrow as pa
 from dora import Node

 IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
 node = Node()

 ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split()
 TABLE_HEIGHT = float(os.getenv("TABLE_HEIGHT", "-0.41"))

 l_init_pose = [
    -7.0631310641087435,
    -10.432298603362307,
    24.429809104404114,
    -132.15000828778648,
    -1.5494749438811133,
    -21.749917789205202,
    8.099312596108344,
    100,
 ]
 r_init_pose = [
    -5.60273587426976,
    10.780818397272316,
    -27.868146823156042,
    -126.15650363072193,
    3.961108018106834,
    -35.43682799906162,
    350.9236448374495,
    100,
 ]
 r_release_closed_pose = [
    -26.1507947940993,
    12.16735021387949,
    -2.2657319092611976,
    -97.63648867582175,
    -19.91084837404425,
    22.10184328619011,
    366.71351223614494,
    0,
 ]

 r_release_opened_pose = [
    -26.1507947940993,
    12.16735021387949,
    -2.2657319092611976,
    -97.63648867582175,
    -19.91084837404425,
    22.10184328619011,
    366.71351223614494,
    100,
 ]

 l_release_opened_pose = [
    -30.04330081906935,
    -7.415231584691132,
    3.6972339048071468,
    -97.7274736257555,
    12.996718740452982,
    30.838020649757016,
    -1.5572310505704858,
    0,
 ]

 l_release_closed_pose = [
    -30.04330081906935,
    -7.415231584691132,
    3.6972339048071468,
    -97.7274736257555,
    12.996718740452982,
    30.838020649757016,
    -1.5572310505704858,
    100,
 ]

 stop = True


 def extract_bboxes(json_text) -> (np.ndarray, np.ndarray):
    """
    Read bounding boxes and their labels from a JSON string that may be wrapped in markdown fences.

    Parameters:
    json_text (str): JSON list of objects with "bbox_2d" and "label" keys,
        possibly surrounded by ```json markers.

    Returns:
    tuple: (bboxes, labels) as NumPy arrays, or (None, None) if parsing fails.
    """
    # Collect every line that is not a markdown fence
    kept = []
    for raw_line in json_text.strip().splitlines():
        if raw_line.strip().startswith("```"):
            continue
        kept.append(raw_line)

    try:
        records = json.loads("\n".join(kept))
        return (
            np.array([record["bbox_2d"] for record in records]),
            np.array([record["label"] for record in records]),
        )
    except Exception as _e:  # noqa
        # Any malformed answer is treated as "no boxes"
        return None, None


 def handle_speech(last_text):
    """
    Start a pick-and-place request when the transcription contains an activation word.

    When any whitespace-separated, lower-cased word of ``last_text`` is in
    ACTIVATION_WORDS, the instruction stored in ``cache["text"]`` is sent on
    the "text_vlm" output (wrapped in a bounding-box request) and on the
    "prompt" output, and the global ``stop`` flag is cleared.

    Parameters:
    last_text (str): Latest transcribed sentence.
    """
    global stop
    spoken = last_text.lower().split()
    if len(ACTIVATION_WORDS) == 0:
        return
    if not any(token in ACTIVATION_WORDS for token in spoken):
        return

    instruction = cache["text"]
    image_meta = {"image_id": "image_depth"}
    vlm_request = f"Given the prompt: {instruction}. Output the two bounding boxes for the two objects"

    node.send_output("text_vlm", pa.array([vlm_request]), metadata=image_meta)
    node.send_output(
        "prompt", pa.array([instruction]), metadata={"image_id": "image_depth"}
    )
    print(f"sending: {instruction}")
    stop = False


 def wait_for_event(id, timeout=None, cache=None):
    """
    Block until an input named ``id`` arrives and return its value.

    Every input received while waiting is stored in ``cache`` under its id.
    "text" inputs are stored as a Python string and forwarded to
    ``handle_speech``; because that branch is checked first, a "text" event
    never ends the wait, even when ``id == "text"``.

    Parameters:
    id (str): Input id to wait for.
    timeout (float | None): Forwarded to ``node.next`` for each event.
    cache (dict | None): Dictionary updated in place; a new one is created
        when omitted.

    Returns:
    tuple: ``(value, cache)``. ``value`` is None when ``node.next`` returns
        None (``cache["finished"]`` is then set to True) or when an ERROR
        event is received.
    """
    # A `{}` default would be shared by every call that omits `cache`,
    # leaking entries such as "finished" from one call into the next.
    if cache is None:
        cache = {}

    while True:
        event = node.next(timeout=timeout)
        if event is None:
            cache["finished"] = True
            return None, cache
        if event["type"] == "INPUT":
            cache[event["id"]] = event["value"]
            if event["id"] == "text":
                # Keep the transcription as a plain string and react to it
                cache[event["id"]] = event["value"][0].as_py()
                handle_speech(event["value"][0].as_py())
            elif event["id"] == id:
                return event["value"], cache

        elif event["type"] == "ERROR":
            return None, cache


 def wait_for_events(ids: list[str], timeout=None, cache=None):
    """
    Block until one input has been received for every id in ``ids``.

    Every input received while waiting is stored in ``cache`` under its id.
    "text" inputs are stored as a Python string and forwarded to
    ``handle_speech``; they are never counted as one of the awaited ids.

    Parameters:
    ids (list[str]): Input ids that must all be received.
    timeout (float | None): Forwarded to ``node.next`` for each event.
    cache (dict | None): Dictionary updated in place; a new one is created
        when omitted.

    Returns:
    tuple: ``(response, cache)`` where ``response`` maps each awaited id to
        its latest value. ``response`` is None when ``node.next`` returns
        None (``cache["finished"]`` is then set to True) or when an ERROR
        event is received.
    """
    # A `{}` default would be shared by every call that omits `cache`.
    if cache is None:
        cache = {}

    response = {}
    while True:
        event = node.next(timeout=timeout)
        if event is None:
            cache["finished"] = True
            return None, cache
        if event["type"] == "INPUT":
            cache[event["id"]] = event["value"]
            if event["id"] == "text":
                # Keep the transcription as a plain string and react to it
                cache[event["id"]] = event["value"][0].as_py()
                handle_speech(event["value"][0].as_py())
            elif event["id"] in ids:
                response[event["id"]] = event["value"]
                if len(response) == len(ids):
                    return response, cache
        elif event["type"] == "ERROR":
            return None, cache


 def get_prompt():
    """
    Wait briefly for a transcription and return it if it contains an activation word.

    Returns:
    str | None: The transcribed text when ACTIVATION_WORDS is empty or at
        least one of its lower-cased words is an activation word; None when
        nothing was received or no word matches.
    """
    # wait_for_event returns a (value, cache) pair; only the value matters here
    text, _ = wait_for_event(id="text", timeout=0.3)
    if text is None:
        return None
    text = text[0].as_py()

    words = text.lower().split()
    if len(ACTIVATION_WORDS) > 0 and all(
        word not in ACTIVATION_WORDS for word in words
    ):
        return None
    return text


 # NOTE(review): `last_text` is not read anywhere in the visible code.
 last_text = ""
 # Shared event cache. "text" holds the instruction sent to the VLM by
 # handle_speech; it is overwritten by wait_for_event(s) whenever a new
 # transcription arrives.
 cache = {"text": "Put the orange in the metal box"}

 # Main state machine: IDLE -> (TURNING, disabled) -> GRABBING -> RELEASING.
 while True:
    ### === IDLE ===

    # Send both arms to their initial joint poses and wait for both responses.
    node.send_output(
        "action_r_arm",
        pa.array(r_init_pose),
        metadata={"encoding": "jointstate", "duration": 1},
    )
    node.send_output(
        "action_l_arm",
        pa.array(l_init_pose),
        metadata={"encoding": "jointstate", "duration": 1},
    )
    _, cache = wait_for_events(
        ids=["response_r_arm", "response_l_arm"], timeout=2, cache=cache
    )
    # handle_speech(cache["text"])

    ### === TURNING ===

    # Trigger action once text from whisper is received
    # Move left. Overwrite this with your desired movement.
    # node.send_output("action_base", pa.array([0.0, 0.0, 0.0, 0.0, 0.0, 1.57]))
    # Look straight
    # node.send_output("look", pa.array([0.3, 0, -0.1]))
    # You can add additional actions here
    # ...

    # event = wait_for_event(id="response_base")[0].as_py()
    # if not event:
    ## return to IDLE
    # node.send_output("action_base", pa.array([0.0, 0.0, 0.0, 0.0, 0.0, -1.57]))
    # event = wait_for_event(id="response_base")[0].as_py()
    # if event:
    # continue
    # else:
    # break

    ### === GRABBING ===

    # Trigger action once base is done moving
    # node.send_output(
    # "text_vlm",
    # pa.array([f"Given the prompt: {text}. Output bounding box for this action"]),
    # metadata={"image_id": "image_depth"},
    # )
    arm_holding_object = None
    # Process pending inputs; "text" events go through handle_speech, which
    # clears `stop` when an activation word is heard.
    text, cache = wait_for_event(id="text", timeout=0.3, cache=cache)

    # No pick-and-place request is active: go back to IDLE.
    if stop:
        continue

    # Try poses until one grasp is successful
    while True:
        values, cache = wait_for_event(id="pose", cache=cache)

        if values is None:
            continue
        # One row of 6 values per pose
        values = values.to_numpy().reshape((-1, 6))
        # Row 0 is used as the pick target and row 1 as the destination
        if len(values) < 2:
            continue
        x = values[0][0]
        y = values[0][1]
        z = values[0][2]
        dest_x = values[1][0]
        dest_y = values[1][1]
        dest_z = values[1][2]
        # Fixed offsets applied to the pick and destination x coordinates
        x = x + 0.01
        dest_x = dest_x - 0.05
        print("x: ", x, " y: ", y, " z: ", z)

        ## Keep the target height at or above TABLE_HEIGHT to avoid collisions or weird movements.
        z = np.max((z, TABLE_HEIGHT))
        node.send_output("look", pa.array([x, y, z]))
        # Three waypoints: above the object, down at the object, back up.
        # NOTE(review): the last value of each waypoint is presumably the gripper
        # command (100 = open, 0 = closed) — confirm against the arm nodes.
        trajectory = np.array(
            [
                [x, y, -0.16, 0, 0, 0, 100],
                [x, y, z, 0, 0, 0, 0],
                [x, y, -0.16, 0, 0, 0, 0],
            ]
        ).ravel()

        # Negative y is handled by the right arm, the rest by the left arm
        if y < 0:
            node.send_output(
                "action_r_arm",
                pa.array(trajectory),
                metadata={"encoding": "xyzrpy", "duration": "0.5"},
            )
            event, cache = wait_for_event(id="response_r_arm", timeout=5, cache=cache)
            if event is not None and event[0].as_py():
                print("Success")
                arm_holding_object = "right"
                break
            else:
                # Grasp failed: return to the initial pose and try the next pose
                print("Failed: x: ", x, " y: ", y, " z: ", z)
                node.send_output(
                    "action_r_arm",
                    pa.array(r_init_pose),
                    metadata={"encoding": "jointstate", "duration": "1.3"},
                )
                event, cache = wait_for_event(id="response_r_arm", cache=cache)
        else:
            # NOTE(review): `trajectory` was built before this offset, so it only
            # affects the fallback pose sent in the RELEASING section — confirm intent.
            y += 0.03
            node.send_output(
                "action_l_arm",
                pa.array(trajectory),
                metadata={"encoding": "xyzrpy", "duration": "0.5"},
            )
            event, cache = wait_for_event(id="response_l_arm", timeout=5, cache=cache)
            if event is not None and event[0].as_py():
                print("Success")
                arm_holding_object = "left"
                break
            else:
                # Grasp failed: return to the initial pose and try the next pose
                print("Failed")
                node.send_output(
                    "action_l_arm",
                    pa.array(l_init_pose),
                    metadata={"encoding": "jointstate", "duration": "1.3"},
                )
                event, cache = wait_for_event(id="response_l_arm", cache=cache)
    ### === RELEASING ===

    # Trigger action once r_arm is done moving
    # node.send_output("action_base", pa.array([0.0, 0.0, 0.0, 0.0, 0.0, -1.57]))
    # event = wait_for_event(id="response_base")[0].as_py()

    # if not event:
    #    print("Failed to move right")

    # Trigger action to release object: move the holding arm above the destination
    if arm_holding_object == "right":
        node.send_output(
            "action_r_arm",
            pa.array(
                [
                    dest_x,
                    dest_y,
                    -0.16,
                    0,
                    0,
                    0,
                    100,
                ],
            ),
            metadata={"encoding": "xyzrpy", "duration": "0.75"},
        )
        event, cache = wait_for_event(id="response_r_arm", cache=cache)
    else:
        node.send_output(
            "action_l_arm",
            pa.array(
                [
                    dest_x,
                    dest_y,
                    -0.16,
                    0,
                    0,
                    0,
                    100,
                ]
            ),
            metadata={"encoding": "xyzrpy", "duration": "0.75"},
        )
        event, cache = wait_for_event(id="response_l_arm", cache=cache)

    if event is None or not event[0].as_py():
        # The destination could not be reached: send the arm back to the pick position
        print("Failed to release object")
        if arm_holding_object == "right":
            node.send_output(
                "action_r_arm",
                pa.array(
                    [
                        x,
                        y,
                        z,
                        0,
                        0,
                        0,
                        100,
                    ],
                ),
                metadata={"encoding": "xyzrpy", "duration": "0.75"},
            )
            event, cache = wait_for_event(id="response_r_arm", cache=cache)
        else:
            node.send_output(
                "action_l_arm",
                pa.array(
                    [
                        x,
                        y,
                        z,
                        0,
                        0,
                        0,
                        100,
                    ]
                ),
                metadata={"encoding": "xyzrpy", "duration": "0.75"},
            )
            event, cache = wait_for_event(id="response_l_arm", cache=cache)
    else:
        # Request completed: wait for the next activation word
        stop = True

    # "finished" is set by wait_for_event(s) whenever node.next() returns None.
    # NOTE(review): that includes the timed waits above, so the loop may exit
    # after the first completed cycle — confirm this is intended.
    if cache.get("finished", False):
        break
    # Move object back to initial position
--- a/libraries/message/src/metadata.rs
+++ b/libraries/message/src/metadata.rs
@@ -63,6 +63,7 @@ pub enum Parameter {
    String(String),
    ListInt(Vec<i64>),
    ListFloat(Vec<f64>),
    ListString(Vec<String>),
 }

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
--- a/node-hub/dora-object-to-pose/Cargo.toml
+++ b/node-hub/dora-object-to-pose/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
 dora-node-api = "0.3.8"
 dora-node-api = { workspace = true }
 eyre = "0.6.8"
 pyo3 = { workspace = true, features = [
    "extension-module",
--- a/node-hub/dora-object-to-pose/src/lib.rs
+++ b/node-hub/dora-object-to-pose/src/lib.rs
@@ -10,7 +10,7 @@ use dora_node_api::{
 use eyre::Result;
 use std::collections::HashMap;

 fn points_to_pose(points: &[(f32, f32, f32)]) -> (f32, f32, f32, f32, f32, f32) {
 fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> {
    let (_x, _y, _z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) =
        points.iter().fold(
            (
@@ -61,7 +61,7 @@ fn points_to_pose(points: &[(f32, f32, f32)]) -> (f32, f32, f32, f32, f32, f32)
    let std_y = (sum_y2 / n - mean_y * mean_y).sqrt();
    let corr = cov / (std_x * std_y);

    return (mean_x, mean_y, mean_z, 0., 0., corr * f32::consts::PI / 2.);
    return vec![mean_x, mean_y, mean_z, 0., 0., corr * f32::consts::PI / 2.];
 }

 pub fn lib_main() -> Result<()> {
@@ -74,7 +74,7 @@ pub fn lib_main() -> Result<()> {
    let mut focal_length = vec![605, 605];
    let mut resolution = vec![605, 605];
    let camera_pitch = std::env::var("CAMERA_PITCH")
        .unwrap_or("2.478".to_string())
        .unwrap_or("2.47".to_string())
        .parse::<f32>()
        .unwrap();
    let cos_theta = camera_pitch.cos(); // np.cos(np.deg2rad(180-38))
@@ -120,114 +120,147 @@ pub fn lib_main() -> Result<()> {
                    depth_frame = Some(buffer.clone());
                }
                "masks" => {
                    if let Some(data) = data.as_primitive_opt::<Float32Type>() {
                        let data = data.values();
                        let mut points = vec![];
                        let mut z_total = 0.;
                        let mut n = 0.;
                    let masks = if let Some(data) = data.as_primitive_opt::<Float32Type>() {
                        let data = data
                            .iter()
                            .map(|x| if let Some(x) = x { x > 0. } else { false })
                            .collect::<Vec<_>>();
                        data
                    } else if let Some(data) = data.as_boolean_opt() {
                        let data = data
                            .iter()
                            .map(|x| if let Some(x) = x { x } else { false })
                            .collect::<Vec<_>>();
                        data
                    } else {
                        println!("Got unexpected data type: {}", data.data_type());
                        continue;
                    };

                        if let Some(depth_frame) = &depth_frame {
                            depth_frame.iter().enumerate().for_each(|(i, z)| {
                                let u = i as f32 % width as f32; // Calculate x-coordinate (u)
                                let v = i as f32 / width as f32; // Calculate y-coordinate (v)
                    let outputs: Vec<Vec<f32>> = masks
                        .chunks(height as usize * width as usize)
                        .into_iter()
                        .map(|data| {
                            let mut points = vec![];
                            let mut z_total = 0.;
                            let mut n = 0.;

                                if let Some(z) = z {
                                    let z = z as f32;
                                    // Skip points that have empty depth or is too far away
                                    if z == 0. || z > 5.0 {
                                        return;
                                    }
                                    if data[i] > 0. {
                                        let y =
                                            (u - resolution[0] as f32) * z / focal_length[0] as f32;
                                        let x =
                                            (v - resolution[1] as f32) * z / focal_length[1] as f32;
                                        let new_x = sin_theta * z + cos_theta * x;
                                        let new_y = -y;
                                        let new_z = cos_theta * z - sin_theta * x;
                            if let Some(depth_frame) = &depth_frame {
                                depth_frame.iter().enumerate().for_each(|(i, z)| {
                                    let u = i as f32 % width as f32; // Calculate x-coordinate (u)
                                    let v = i as f32 / width as f32; // Calculate y-coordinate (v)

                                    if let Some(z) = z {
                                        let z = z as f32;
                                        // Skip points that have empty depth or is too far away
                                        if z == 0. || z > 20.0 {
                                            return;
                                        }
                                        if data[i] {
                                            let y = (u - resolution[0] as f32) * z
                                                / focal_length[0] as f32;
                                            let x = (v - resolution[1] as f32) * z
                                                / focal_length[1] as f32;
                                            let new_x = sin_theta * z + cos_theta * x;
                                            let new_y = -y;
                                            let new_z = cos_theta * z - sin_theta * x;

                                        points.push((new_x, new_y, new_z));
                                        z_total += new_z;
                                        n += 1.;
                                            points.push((new_x, new_y, new_z));
                                            z_total += new_z;
                                            n += 1.;
                                        }
                                    }
                                }
                            });
                        } else {
                            println!("No depth frame found");
                            continue;
                        }
                        if points.is_empty() {
                            println!("No points in mask found");
                            continue;
                        }
                        let (mean_x, mean_y, mean_z, rx, ry, rz) = points_to_pose(&points);
                        let mut metadata = metadata.parameters.clone();
                        metadata.insert(
                            "encoding".to_string(),
                            Parameter::String("xyzrpy".to_string()),
                        );
                                });
                            } else {
                                println!("No depth frame found");
                                return None;
                            }
                            if points.is_empty() {
                                println!("No points in mask found");
                                return None;
                            }
                            Some(points_to_pose(&points))
                        })
                        .filter(|x| x.is_some())
                        .map(|x| x.unwrap())
                        .collect();
                    let flatten_data = outputs.into_iter().flatten().collect::<Vec<_>>();
                    let mut metadata = metadata.parameters.clone();
                    metadata.insert(
                        "encoding".to_string(),
                        Parameter::String("xyzrpy".to_string()),
                    );
                    println!("Got data: {:?}", flatten_data);

                        node.send_output(
                            DataId::from("pose".to_string()),
                            metadata,
                            vec![mean_x, mean_y, mean_z, rx, ry, rz].into_arrow(),
                        )?;
                    }
                    node.send_output(
                        DataId::from("pose".to_string()),
                        metadata,
                        flatten_data.into_arrow(),
                    )?;
                }
                "boxes2d" => {
                    if let Some(data) = data.as_primitive_opt::<Int64Type>() {
                        let data = data.values();
                        let x_min = data[0] as f32;
                        let y_min = data[1] as f32;
                        let x_max = data[2] as f32;
                        let y_max = data[3] as f32;
                        let mut points = vec![];
                        let mut z_min = 100.;
                        let mut z_total = 0.;
                        let mut n = 0.;
                        let values = data.values();
                        let outputs: Vec<Vec<f32>> = values
                            .chunks(4)
                            .into_iter()
                            .map(|data| {
                                let x_min = data[0] as f32;
                                let y_min = data[1] as f32;
                                let x_max = data[2] as f32;
                                let y_max = data[3] as f32;
                                let mut points = vec![];
                                let mut z_min = 100.;
                                let mut z_total = 0.;
                                let mut n = 0.;

                        if let Some(depth_frame) = &depth_frame {
                            depth_frame.iter().enumerate().for_each(|(i, z)| {
                                let u = i as f32 % width as f32; // Calculate x-coordinate (u)
                                let v = i as f32 / width as f32; // Calculate y-coordinate (v)
                                if let Some(depth_frame) = &depth_frame {
                                    depth_frame.iter().enumerate().for_each(|(i, z)| {
                                        let u = i as f32 % width as f32; // Calculate x-coordinate (u)
                                        let v = i as f32 / width as f32; // Calculate y-coordinate (v)

                                if let Some(z) = z {
                                    let z = z as f32;
                                    // Skip points that have empty depth or is too far away
                                    if z == 0. || z > 5.0 {
                                        return;
                                    }
                                    if u > x_min && u < x_max && v > y_min && v < y_max {
                                        let y =
                                            (u - resolution[0] as f32) * z / focal_length[0] as f32;
                                        let x =
                                            (v - resolution[1] as f32) * z / focal_length[1] as f32;
                                        let new_x = sin_theta * z + cos_theta * x;
                                        let new_y = -y;
                                        let new_z = cos_theta * z - sin_theta * x;
                                        if new_z < z_min {
                                            z_min = new_z;
                                        if let Some(z) = z {
                                            let z = z as f32;
                                            // Skip points that have empty depth or is too far away
                                            if z == 0. || z > 5.0 {
                                                return;
                                            }
                                            if u > x_min && u < x_max && v > y_min && v < y_max {
                                                let y = (u - resolution[0] as f32) * z
                                                    / focal_length[0] as f32;
                                                let x = (v - resolution[1] as f32) * z
                                                    / focal_length[1] as f32;
                                                let new_x = sin_theta * z + cos_theta * x;
                                                let new_y = -y;
                                                let new_z = cos_theta * z - sin_theta * x;
                                                if new_z < z_min {
                                                    z_min = new_z;
                                                }
                                                points.push((new_x, new_y, new_z));
                                                z_total += new_z;
                                                n += 1.;
                                            }
                                        }
                                        points.push((new_x, new_y, new_z));
                                        z_total += new_z;
                                        n += 1.;
                                    }
                                    });
                                } else {
                                    println!("No depth frame found");
                                    return None;
                                }
                            });
                        } else {
                            println!("No depth frame found");
                            continue;
                        }
                        if points.is_empty() {
                            continue;
                        }
                        let raw_mean_z = z_total / n as f32;
                        let threshold = (raw_mean_z + z_min) / 2.;
                        let points = points
                            .into_iter()
                            .filter(|(_x, _y, z)| z > &threshold)
                            .collect::<Vec<_>>();
                        let (mean_x, mean_y, mean_z, rx, ry, rz) = points_to_pose(&points);
                                if points.is_empty() {
                                    return None;
                                }
                                let raw_mean_z = z_total / n as f32;
                                let threshold = (raw_mean_z + z_min) / 2.;
                                let points = points
                                    .into_iter()
                                    .filter(|(_x, _y, z)| z > &threshold)
                                    .collect::<Vec<_>>();
                                Some(points_to_pose(&points))
                            })
                            .filter(|x| x.is_some())
                            .map(|x| x.unwrap())
                            .collect();
                        let flatten_data = outputs.into_iter().flatten().collect::<Vec<_>>();
                        let mut metadata = metadata.parameters.clone();
                        metadata.insert(
                            "encoding".to_string(),
@@ -237,7 +270,7 @@ pub fn lib_main() -> Result<()> {
                        node.send_output(
                            DataId::from("pose".to_string()),
                            metadata,
                            vec![mean_x, mean_y, mean_z, rx, ry, rz].into_arrow(),
                            flatten_data.into_arrow(),
                        )?;
                    }
                }
--- a/node-hub/dora-reachy2/dora_reachy2/camera.py
+++ b/node-hub/dora-reachy2/dora_reachy2/camera.py
@@ -10,7 +10,7 @@ from reachy2_sdk.media.camera import CameraView
 def main():
    ROBOT_IP = os.getenv("ROBOT_IP", "10.42.0.80")

    for _ in range(5):
    for _ in range(10):
        reachy = ReachySDK(ROBOT_IP)
        try:
            reachy.cameras.teleop.get_frame(view=CameraView.LEFT)
--- a/node-hub/dora-reachy2/dora_reachy2/left_arm.py
+++ b/node-hub/dora-reachy2/dora_reachy2/left_arm.py
@@ -78,14 +78,14 @@ def manage_gripper(reachy, gripper, grasp):
        return True
    if gripper == 0.0:
        reachy.l_arm.gripper.close()
        time.sleep(0.5)
        time.sleep(0.3)
        if grasp:
            half_open = reachy.l_arm.gripper.get_current_opening() > 2
            if not half_open:
                return False
    elif gripper == 100.0:
        reachy.l_arm.gripper.open()
        time.sleep(0.5)
        time.sleep(0.3)
    return True


@@ -133,7 +133,12 @@ def main():
                        )
                    else:
                        for joint, gripper in joint_values:
                            reachy.l_arm.goto(joint, duration=duration, wait=wait)
                            reachy.l_arm.goto(
                                joint,
                                duration=duration,
                                wait=wait,
                                interpolation_mode="linear",
                            )
                            response_gripper = manage_gripper(reachy, gripper, grasp)
                            if not response_gripper:
                                node.send_output(
@@ -151,7 +156,12 @@ def main():
                        joints = value[:7].tolist()
                        gripper = value[7]

                        reachy.l_arm.goto(joints, duration=duration, wait=wait)
                        reachy.l_arm.goto(
                            joints,
                            duration=duration,
                            wait=wait,
                            interpolation_mode="linear",
                        )
                        manage_gripper(reachy, gripper, grasp)
                    node.send_output("response_l_arm", pa.array([True]))

--- a/node-hub/dora-reachy2/dora_reachy2/right_arm.py
+++ b/node-hub/dora-reachy2/dora_reachy2/right_arm.py
@@ -77,14 +77,14 @@ def manage_gripper(reachy, gripper, grasp):
        return True
    if gripper == 0.0:
        reachy.r_arm.gripper.close()
        time.sleep(0.5)
        time.sleep(0.3)
        if grasp:
            half_open = reachy.r_arm.gripper.get_current_opening() > 2
            if not half_open:
                return False
    elif gripper == 100.0:
        reachy.r_arm.gripper.open()
        time.sleep(0.5)
        time.sleep(0.3)
    return True


@@ -132,7 +132,12 @@ def main():
                        )
                    else:
                        for joint, gripper in joint_values:
                            reachy.r_arm.goto(joint, duration=duration, wait=wait)
                            reachy.r_arm.goto(
                                joint,
                                duration=duration,
                                wait=wait,
                                interpolation_mode="linear",
                            )
                            response_gripper = manage_gripper(reachy, gripper, grasp)
                            if not response_gripper:
                                node.send_output(
@@ -150,7 +155,12 @@ def main():
                        joints = value[:7].tolist()
                        gripper = value[7]

                        reachy.r_arm.goto(joints, duration=duration, wait=wait)
                        reachy.r_arm.goto(
                            joints,
                            duration=duration,
                            wait=wait,
                            interpolation_mode="linear",
                        )
                        manage_gripper(reachy, gripper, grasp)
                    node.send_output("response_r_arm", pa.array([True]))

--- a/node-hub/dora-rerun/Cargo.toml
+++ b/node-hub/dora-rerun/Cargo.toml
@@ -17,7 +17,7 @@ python = ["pyo3"]
 dora-node-api = { workspace = true, features = ["tracing"] }
 eyre = "0.6.8"
 tokio = { version = "1.24.2", features = ["rt"] }
 rerun = { version = "0.21.0", features = ["web_viewer", "image"] }
 rerun = { version = "0.22.0", features = ["web_viewer", "image"] }
 ndarray = "0.15.6"
 k = "0.32"
 pyo3 = { workspace = true, features = [
--- a/node-hub/dora-rerun/pyproject.toml
+++ b/node-hub/dora-rerun/pyproject.toml
@@ -10,7 +10,7 @@ requires-python = ">=3.8"

 dependencies = [
    "maturin>=1.8.2",
    'rerun_sdk==0.21.0',
    'rerun_sdk==0.22.0',
    # "rerun-loader-urdf @ git+https://github.com/rerun-io/rerun-loader-python-example-urdf.git",
 ]

--- a/node-hub/dora-rerun/src/boxes2d.rs
+++ b/node-hub/dora-rerun/src/boxes2d.rs
@@ -1,7 +1,9 @@
 use dora_node_api::{
    arrow::{
        array::AsArray,
        datatypes::{Float32Type, Float64Type, Int32Type, Int64Type},
        datatypes::{
            DataType, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type,
        },
    },
    dora_core::config::DataId,
    ArrowData, Metadata, Parameter,
@@ -29,10 +31,53 @@ pub fn update_boxes2d(
            .as_list_opt::<i32>()
            .context("Could not deserialize bbox as list")?
            .values();
        let bbox = bbox
            .as_primitive_opt::<Float32Type>()
            .context("Could not get bbox value as list")?
            .values();
        // Normalize the flattened bbox values into a `Vec<f32>`, whatever numeric
        // Arrow type the sender used. Each arm downcasts to the matching primitive
        // array and converts element-wise.
        let bbox: Vec<f32> = match bbox.data_type() {
            DataType::Float16 => bbox
                .as_primitive_opt::<Float16Type>()
                .context("Failed to deserialize bbox")?
                .values()
                .iter()
                .map(|x| f32::from(*x))
                .collect(),
            // Already f32: copy the buffer as-is.
            DataType::Float32 => bbox
                .as_primitive_opt::<Float32Type>()
                .context("Failed to deserialize bbox")?
                .values()
                .to_vec(),
            // `as f32` narrows; values beyond f32 precision are rounded.
            DataType::Float64 => bbox
                .as_primitive_opt::<Float64Type>()
                .context("Failed to deserialize bbox")?
                .values()
                .iter()
                .map(|x| *x as f32)
                .collect(),
            DataType::Int16 => bbox
                .as_primitive_opt::<Int16Type>()
                .context("Failed to deserialize bbox")?
                .values()
                .iter()
                .map(|x| *x as f32)
                .collect(),
            DataType::Int32 => bbox
                .as_primitive_opt::<Int32Type>()
                .context("Failed to deserialize bbox")?
                .values()
                .iter()
                .map(|x| *x as f32)
                .collect(),
            DataType::Int64 => bbox
                .as_primitive_opt::<Int64Type>()
                .context("Failed to deserialize bbox")?
                .values()
                .iter()
                .map(|x| *x as f32)
                .collect(),
            // Any other Arrow type is unsupported: list every accepted type and
            // report the one that was actually received.
            other => {
                return Err(eyre::eyre!(
                    "Could not deserialize bbox as float16, float32, float64, int16, int32 or int64 (got {:?})",
                    other
                ))
            }
        };

        if bbox.len() == 0 {
            rec.log(id.as_str(), &rerun::Clear::flat())
@@ -53,18 +98,6 @@ pub fn update_boxes2d(
            .context("Could not deserialize labels as string")?;
        let labels: Vec<Text> = labels.iter().map(|x| Text::from(x.unwrap())).collect();

        // Cast confidence
        let conf_buffer = bbox_struct
            .column_by_name("conf")
            .context("Did not find conf field within bbox struct")?;
        let conf = conf_buffer
            .as_list_opt::<i32>()
            .context("Could not deserialize conf as list")?
            .values();
        let _conf = conf
            .as_primitive_opt::<Float32Type>()
            .context("Could not deserialize conf as string")?;

        let mut centers = vec![];
        let mut sizes = vec![];

--- a/node-hub/dora-rerun/src/lib.rs
+++ b/node-hub/dora-rerun/src/lib.rs
@@ -262,7 +262,6 @@ use pyo3::{
 /// Python entry point for this node.
 ///
 /// Only compiled when the `python` feature is enabled and exposed to Python
 /// through `#[pyfunction]`. Delegates to `lib_main` and returns its result.
 #[cfg(feature = "python")]
 #[pyfunction]
 fn py_main(_py: Python) -> eyre::Result<()> {
    pyo3::prepare_freethreaded_python();
    lib_main()
 }

--- a/node-hub/dora-sam2/dora_sam2/main.py
+++ b/node-hub/dora-sam2/dora_sam2/main.py
@@ -13,6 +13,10 @@ def main():
    pa.array([])  # initialize pyarrow array
    node = Node()
    frames = {}
    last_pred = None
    labels = None
    return_type = pa.Array
    image_id = None
    for event in node:
        event_type = event["type"]

@@ -59,33 +63,143 @@ def main():
                image = Image.fromarray(frame)
                frames[event_id] = image

                # TODO: Fix the tracking code for SAM2.
                continue
                if last_pred is not None:
                    with (
                        torch.inference_mode(),
                        torch.autocast(
                            "cuda",
                            dtype=torch.bfloat16,
                        ),
                    ):
                        predictor.set_image(frames[image_id])

                        new_logits = []
                        new_masks = []

                        if len(last_pred.shape) < 3:
                            last_pred = np.expand_dims(last_pred, 0)

                        for mask in last_pred:
                            mask = np.expand_dims(mask, 0)  # Make shape: 1x256x256
                            masks, _, new_logit = predictor.predict(
                                mask_input=mask,
                                multimask_output=False,
                            )
                            if len(masks.shape) == 4:
                                masks = masks[:, 0, :, :]
                            else:
                                masks = masks[0, :, :]

                            masks = masks > 0
                            new_masks.append(masks)
                            new_logits.append(new_logit)
                            ## Mask to 3 channel image

                        last_pred = np.concatenate(new_logits, axis=0)
                        masks = np.concatenate(new_masks, axis=0)

                        match return_type:
                            case pa.Array:
                                node.send_output(
                                    "masks",
                                    pa.array(masks.ravel()),
                                    metadata={
                                        "image_id": image_id,
                                        "width": frames[image_id].width,
                                        "height": frames[image_id].height,
                                    },
                                )
                            case pa.StructArray:
                                node.send_output(
                                    "masks",
                                    pa.array(
                                        [
                                            {
                                                "masks": masks.ravel(),
                                                "labels": event["value"]["labels"],
                                            }
                                        ]
                                    ),
                                    metadata={
                                        "image_id": image_id,
                                        "width": frames[image_id].width,
                                        "height": frames[image_id].height,
                                    },
                                )

            elif "boxes2d" in event_id:
                boxes2d = event["value"].to_numpy()

                if isinstance(event["value"], pa.StructArray):
                    boxes2d = event["value"][0].get("bbox").values.to_numpy()
                    labels = (
                        event["value"][0]
                        .get("labels")
                        .values.to_numpy(zero_copy_only=False)
                    )
                    return_type = pa.Array
                else:
                    boxes2d = event["value"].to_numpy()
                    labels = None
                    return_type = pa.Array

                metadata = event["metadata"]
                encoding = metadata["encoding"]
                if encoding != "xyxy":
                    raise RuntimeError(f"Unsupported boxes2d encoding: {encoding}")

                boxes2d = boxes2d.reshape(-1, 4)
                image_id = metadata["image_id"]
                with torch.inference_mode(), torch.autocast(
                    "cuda",
                    dtype=torch.bfloat16,
                with (
                    torch.inference_mode(),
                    torch.autocast(
                        "cuda",
                        dtype=torch.bfloat16,
                    ),
                ):
                    predictor.set_image(frames[image_id])
                    masks, _, _ = predictor.predict(box=boxes2d)
                    masks = masks[0]
                    ## Mask to 3 channel image

                    node.send_output(
                        "masks",
                        pa.array(masks.ravel()),
                        metadata={
                            "image_id": image_id,
                            "width": frames[image_id].width,
                            "height": frames[image_id].height,
                        },
                    masks, _scores, last_pred = predictor.predict(
                        box=boxes2d, point_labels=labels, multimask_output=False
                    )

                    if len(masks.shape) == 4:
                        masks = masks[:, 0, :, :]
                        last_pred = last_pred[:, 0, :, :]
                    else:
                        masks = masks[0, :, :]
                        last_pred = last_pred[0, :, :]

                    masks = masks > 0
                    ## Mask to 3 channel image
                    match return_type:
                        case pa.Array:
                            node.send_output(
                                "masks",
                                pa.array(masks.ravel()),
                                metadata={
                                    "image_id": image_id,
                                    "width": frames[image_id].width,
                                    "height": frames[image_id].height,
                                },
                            )
                        case pa.StructArray:
                            node.send_output(
                                "masks",
                                pa.array(
                                    [
                                        {
                                            "masks": masks.ravel(),
                                            "labels": event["value"]["labels"],
                                        }
                                    ]
                                ),
                                metadata={
                                    "image_id": image_id,
                                    "width": frames[image_id].width,
                                    "height": frames[image_id].height,
                                },
                            )

        elif event_type == "ERROR":
            print("Event Error:" + event["error"])