Minor improvement

11 months ago · 04c06ec185
--- a/examples/reachy2-remote/dataflow_reachy.yml
+++ b/examples/reachy2-remote/dataflow_reachy.yml
@@ -4,17 +4,40 @@ nodes:
    _unstable_deploy:
      machine: encoder
    inputs:
      tick: dora/timer/millis/10
      tick: dora/timer/millis/20
    outputs:
      - image_left
      - image_depth
      - depth
    env:
      CAPTURE_PATH: 0
      IMAGE_WIDTH: 640
      IMAGE_HEIGHT: 480
      ROBOT_IP: 127.0.0.1

  - id: reachy-left-arm
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-left-arm
    _unstable_deploy:
      machine: encoder
    inputs:
      pose: parse_pose/action_l_arm
    outputs:
      - response_l_arm
    env:
      ROBOT_IP: 127.0.0.1

  - id: reachy-right-arm
    build: pip install -e ../../node-hub/dora-reachy2
    path: dora-reachy2-right-arm
    _unstable_deploy:
      machine: encoder
    inputs:
      pose: parse_pose/action_r_arm
    outputs:
      - response_r_arm
    env:
      ROBOT_IP: 127.0.0.1

  - id: rav1e-local-image
    path: dora-rav1e
    build: cargo build -p dora-rav1e --release
@@ -26,10 +49,21 @@ nodes:
    outputs:
      - image_left
      - image_depth
      - depth
    env:
      RAV1E_SPEED: 10

  - id: rav1e-local-depth
    path: dora-rav1e
    build: cargo build -p dora-rav1e --release
    _unstable_deploy:
      machine: encoder
    inputs:
      depth: camera/depth
    outputs:
      - depth
    env:
      RAV1E_SPEED: 7

  - id: dav1d-remote
    path: dora-dav1d
    build: cargo build -p dora-dav1d --release
@@ -38,7 +72,7 @@ nodes:
    inputs:
      image_depth: rav1e-local-image/image_depth
      image_left: rav1e-local-image/image_left
      # depth: rav1e-local/depth
      depth: rav1e-local-depth/depth
    outputs:
      - image_left
      - image_depth
@@ -87,6 +121,8 @@ nodes:
      - action
      - points
      - text
      - action_release_left
      - action_release_right
    env:
      IMAGE_RESIZE_RATIO: "1.0"

@@ -118,6 +154,17 @@ nodes:
    env:
      IMAGE_RESIZE_RATIO: "1.0"

  - id: sam2
    build: pip install -e ../../node-hub/dora-sam2
    path: dora-sam2
    _unstable_deploy:
      machine: gpu
    inputs:
      image_depth: dav1d-remote/image_depth
      boxes2d: parse_bbox/bbox_grab
    outputs:
      - masks

  - id: tracker
    build: pip install -e ../../node-hub/dora-cotracker
    path: dora-cotracker
@@ -132,24 +179,32 @@ nodes:
    env:
      INTERACTIVE_MODE: false

  # - id: box_coordinates
  # build: pip install -e ../../node-hub/dora-object-to-pose
  # path: dora-object-to-pose
  # inputs:
  # depth: reachy-camera/depth
  # boxes2d: parse_bbox/bbox
  # outputs:
  # - pose
  #- id: sam2
  #build: pip install -e ../../node-hub/dora-sam2
  #path: dora-sam2
  #_unstable_deploy:
  #machine: gpu
  #inputs:
  #image_left: dav1d-remote/image_left
  #boxes2d: parse_bbox/bbox
  #outputs:
  #- masks
  - id: box_coordinates
    build: pip install -e ../../node-hub/dora-object-to-pose
    path: dora-object-to-pose
    _unstable_deploy:
      machine: gpu
    inputs:
      depth: dav1d-remote/depth
      masks: sam2/masks
    outputs:
      - pose

  - id: parse_pose
    path: parse_pose.py
    _unstable_deploy:
      machine: gpu
    inputs:
      pose: box_coordinates/pose
      response_r_arm: reachy-right-arm/response_r_arm
      response_l_arm: reachy-left-arm/response_l_arm
      release_left: parse_whisper/action_release_left
      release_right: parse_whisper/action_release_right
    outputs:
      - action_r_arm
      - action_l_arm
    env:
      IMAGE_RESIZE_RATIO: "1.0"

  - id: parse_point
    path: parse_point.py
@@ -179,12 +234,15 @@ nodes:
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    _unstable_deploy:
      machine: macbook
      machine: gpu
    inputs:
      image: dav1d-remote/image_left
      image_depth: dav1d-remote/image_depth
      boxes2d: parse_bbox/bbox
      torso/image: dav1d-remote/image_depth
      torso/depth: dav1d-remote/depth
      torso/boxes2d: parse_bbox/bbox
      original_text: dora-distil-whisper/text
      parsed_text: parse_whisper/text
      qwenvl_text: dora-qwenvl/text
      tracked_image: tracker/tracked_image
    env:
      RERUN_MEMORY_LIMIT: 5%
      CAMERA_PITCH: 2.47
--- a/examples/reachy2-remote/parse_bbox.py
+++ b/examples/reachy2-remote/parse_bbox.py
@@ -54,20 +54,23 @@ for event in node:
            continue

        text = event["value"][0].as_py()
        metadata = event["metadata"]
        image_id = event["metadata"]["image_id"]

        bboxes, labels = extract_bboxes(text)
        if bboxes is not None and len(bboxes) > 0:
            bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO)
            metadata["image_id"] = image_id
            metadata["encoding"] = "xyxy"
            if image_id == "image_left":
                node.send_output(
                    "bbox_track",
                    pa.array(bboxes.ravel()),
                    metadata={"encoding": "xyxy", "image_id": image_id},
                    metadata,
                )
            elif image_id == "image_depth":
                node.send_output(
                    "bbox_grab",
                    pa.array(bboxes.ravel()),
                    metadata={"encoding": "xyxy", "image_id": image_id},
                    metadata,
                )
--- a/examples/reachy2-remote/parse_point.py
+++ b/examples/reachy2-remote/parse_point.py
@@ -29,18 +29,24 @@ for event in node:
        point = values[-1]

        rz = int((width / 2) - point[0]) / (width / 2)
        x_distance = min(height / 2, height - point[1])

        if abs(rz) > 0.3:
            rz = np.deg2rad(30) * np.sign(rz)
        x_distance = min(height, height - point[1])

        if abs(rz) > 0.75:
            rz = np.deg2rad(90) * np.sign(rz)
        if abs(rz) > 0.5:
            rz = np.deg2rad(60) * np.sign(rz)
        elif abs(rz) > 0.3:
            rz = np.deg2rad(55) * np.sign(rz)
        elif abs(rz) > 0.1:
            rz = np.deg2rad(20) * np.sign(rz)
            rz = np.deg2rad(45) * np.sign(rz)
        else:
            x = 0

        if x_distance > (height * 0.3):
            x = 0.7
        elif x_distance > (height * 0.15):
        if x_distance > (height * 0.7):
            x = 0.5
        elif x_distance > (height * 0.5):
            x = 0.5
        elif x_distance > (height * 0.2):
            x = 0.5
        else:
            x = 0
--- a/examples/reachy2-remote/parse_pose.py
+++ b/examples/reachy2-remote/parse_pose.py
@@ -0,0 +1,291 @@
 """TODO: Add docstring."""

 import json
 import os

 import numpy as np
 import pyarrow as pa
 from dora import Node

 node = Node()

 IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


 # Joint-state presets for the two arms. Each list has eight entries and is
 # sent with the "jointstate" encoding; the first seven are presumably the arm
 # joint angles (TODO confirm units against dora-reachy2).
 # The last entry is the gripper value: the grab trajectory in this file
 # approaches with 100 and grips with 0, so 100 = opened and 0 = closed.

 # Initial pose of the left arm, gripper opened.
 l_init_pose = [
    -7.0631310641087435,
    -10.432298603362307,
    24.429809104404114,
    -132.15000828778648,
    -1.5494749438811133,
    -21.749917789205202,
    8.099312596108344,
    100,
 ]
 # Initial pose of the right arm, gripper opened.
 r_init_pose = [
    -5.60273587426976,
    10.780818397272316,
    -27.868146823156042,
    -126.15650363072193,
    3.961108018106834,
    -35.43682799906162,
    350.9236448374495,
    100,
 ]
 # Right-arm release pose, gripper still closed.
 r_release_closed_pose = [
    -26.1507947940993,
    12.16735021387949,
    -2.2657319092611976,
    -97.63648867582175,
    -19.91084837404425,
    22.10184328619011,
    366.71351223614494,
    0,
 ]

 # Right-arm release pose, gripper opened.
 r_release_opened_pose = [
    -26.1507947940993,
    12.16735021387949,
    -2.2657319092611976,
    -97.63648867582175,
    -19.91084837404425,
    22.10184328619011,
    366.71351223614494,
    100,
 ]

 # Left-arm release pose, gripper opened.
 # The gripper value was previously 0 here (and 100 on the "closed" pose),
 # which contradicted both the names and the right-arm presets.
 l_release_opened_pose = [
    -30.04330081906935,
    -7.415231584691132,
    3.6972339048071468,
    -97.7274736257555,
    12.996718740452982,
    30.838020649757016,
    -1.5572310505704858,
    100,
 ]

 # Left-arm release pose, gripper still closed.
 l_release_closed_pose = [
    -30.04330081906935,
    -7.415231584691132,
    3.6972339048071468,
    -97.7274736257555,
    12.996718740452982,
    30.838020649757016,
    -1.5572310505704858,
    0,
 ]


 def wait_for_event(id, timeout=None, cache=None):
    """Block until an input named ``id`` arrives and return its value.

    Every INPUT event seen while waiting is stored in ``cache`` under its
    input id, so values received for other inputs are kept for the caller.

    Args:
        id: Input id to wait for (e.g. ``"response_r_arm"``).
        timeout: Forwarded unchanged to ``node.next``.
        cache: Dict updated in place with the latest value per input id.
            A fresh dict is created when omitted.

    Returns:
        A ``(value, cache)`` tuple. ``value`` is ``None`` when ``node.next``
        returns ``None`` (``cache["finished"]`` is then set to ``True``) or
        when an ERROR event is received.
    """
    # A literal ``{}`` default is created once and shared by every call, so
    # cached values and the "finished" flag would leak between waits.
    if cache is None:
        cache = {}

    while True:
        event = node.next(timeout=timeout)
        if event is None:
            cache["finished"] = True
            return None, cache
        if event["type"] == "INPUT":
            cache[event["id"]] = event["value"]
            if event["id"] == id:
                return event["value"], cache

        elif event["type"] == "ERROR":
            return None, cache


 # Module-level state used by the event loop that follows.
 cache = {}
 arm_holding_object = None


 ## ---- INIT ---
 # Send each arm its initial joint-state pose, right arm first and then left,
 # using the same metadata for both.
 for _output_id, _init_pose in (
    ("action_r_arm", r_init_pose),
    ("action_l_arm", l_init_pose),
 ):
    node.send_output(
        _output_id,
        pa.array(_init_pose),
        metadata={"encoding": "jointstate", "duration": 2},
    )

 # Main event loop.
 # - "pose" inputs carry an (x, y, z) target plus an "action" metadata field
 #   ("grab" or "release") and are turned into arm trajectories. The arm is
 #   chosen by the sign of y: y < 0 uses the right arm, otherwise the left.
 # - "release_right" / "release_left" inputs move the matching arm to a fixed
 #   pose and then back to its init pose.
 for event in node:
    if event["type"] == "INPUT":
        if event["id"] == "pose":
            values = event["value"]
            values = values.to_numpy()
            print("Pose: ", values)
            if len(values) == 0:
                continue
            # NOTE(review): only the empty case is guarded; fewer than three
            # values would raise IndexError below.
            x = values[0]
            y = values[1]
            z = values[2]
            # NOTE(review): raises KeyError if the sender did not set "action".
            action = event["metadata"]["action"]

            # NOTE(review): there is no default case, so any other action
            # value is silently ignored.
            match action:
                case "grab":
                    # Redundant with the empty check above; kept as-is.
                    if len(values) == 0:
                        continue
                    # Fixed forward offset added to the target x.
                    x = x + 0.03

                    ## Clip the maximum and minimum values for the height of the arm to avoid collision or weird movement.
                    # NOTE(review): no clipping is performed below; z is used as received.
                    # Three waypoints of 7 values each: above the target with
                    # last value 100, at the target height with last value 0,
                    # then back up with last value 0 (last value presumably
                    # the gripper opening — TODO confirm).
                    trajectory = np.array(
                        [
                            [x, y, -0.16, 0, 0, 0, 100],
                            [x, y, z, 0, 0, 0, 0],
                            [x, y, -0.16, 0, 0, 0, 0],
                        ],
                    ).ravel()

                    if y < 0:
                        node.send_output(
                            "action_r_arm",
                            pa.array(trajectory),
                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
                        )
                        # NOTE(review): wait_for_event returns a
                        # (value, cache) tuple, so `event` is never None here;
                        # event[0] is the response value, or None when
                        # node.next returned None or an ERROR event arrived.
                        event = wait_for_event(id="response_r_arm", timeout=5)
                        if event is not None and event[0]:
                            print("Success")
                            arm_holding_object = "right"
                            # Move the right arm to a fixed holding position.
                            node.send_output(
                                "action_r_arm",
                                pa.array([0.1, -0.2, -0.16, 0, 0, 0, 0]),
                                metadata={"encoding": "xyzrpy", "duration": "1"},
                            )
                        else:
                            print("Failed: x: ", x, " y: ", y, " z: ", z)
                            # Fall back to the init pose and wait for the arm
                            # to answer (no timeout on this wait).
                            node.send_output(
                                "action_r_arm",
                                pa.array(r_init_pose),
                                metadata={"encoding": "jointstate", "duration": "1"},
                            )
                            event = wait_for_event(id="response_r_arm")
                    else:
                        # NOTE(review): trajectory was already built above, so
                        # this offset does not change the command that is sent.
                        y += 0.03
                        node.send_output(
                            "action_l_arm",
                            pa.array(trajectory),
                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
                        )
                        # Same tuple-return caveat as in the right-arm branch.
                        event = wait_for_event(id="response_l_arm", timeout=5)
                        if event is not None and event[0]:
                            print("Success")
                            arm_holding_object = "left"
                            # Move the left arm to a fixed holding position.
                            node.send_output(
                                "action_l_arm",
                                pa.array([0.1, 0.2, -0.16, 0, 0, 0, 0]),
                                metadata={"encoding": "xyzrpy", "duration": "1"},
                            )
                        else:
                            print("Failed")
                            # Fall back to the init pose and wait for the arm
                            # to answer (no timeout on this wait).
                            node.send_output(
                                "action_l_arm",
                                pa.array(l_init_pose),
                                metadata={"encoding": "jointstate", "duration": "1"},
                            )
                            event = wait_for_event(id="response_l_arm")
                case "release":
                    # Redundant with the empty check above; kept as-is.
                    if len(values) == 0:
                        continue
                    # Fixed forward offset added to the target x.
                    x = x + 0.03

                    ## Clip the maximum and minimum values for the height of the arm to avoid collision or weird movement.
                    # NOTE(review): no clipping is performed below.
                    # Single waypoint above the target with last value 100.
                    trajectory = np.array(
                        [
                            [x, y, -0.16, 0, 0, 0, 100],
                        ],
                    ).ravel()

                    if y < 0:
                        node.send_output(
                            "action_r_arm",
                            pa.array(trajectory),
                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
                        )
                        # Same tuple-return caveat as in the grab branch.
                        event = wait_for_event(id="response_r_arm", timeout=5)
                        if event is not None and event[0]:
                            print("Success")
                            # NOTE(review): set to "right" on the release path
                            # as well — confirm this is intended.
                            arm_holding_object = "right"
                            node.send_output(
                                "action_r_arm",
                                pa.array(r_init_pose),
                                metadata={"encoding": "jointstate", "duration": 1},
                            )
                        else:
                            print("Failed: x: ", x, " y: ", y, " z: ", z)
                            node.send_output(
                                "action_r_arm",
                                pa.array(r_init_pose),
                                metadata={"encoding": "jointstate", "duration": "1"},
                            )
                            event = wait_for_event(id="response_r_arm")
                    else:
                        # NOTE(review): trajectory was already built above, so
                        # this offset does not change the command that is sent.
                        y += 0.03
                        node.send_output(
                            "action_l_arm",
                            pa.array(trajectory),
                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
                        )
                        # Same tuple-return caveat as in the grab branch.
                        event = wait_for_event(id="response_l_arm", timeout=5)
                        if event is not None and event[0]:
                            print("Success")
                            # NOTE(review): set to "left" on the release path
                            # as well — confirm this is intended.
                            arm_holding_object = "left"
                            node.send_output(
                                "action_l_arm",
                                pa.array(l_init_pose),
                                metadata={"encoding": "jointstate", "duration": 1},
                            )
                        else:
                            print("Failed")
                            node.send_output(
                                "action_l_arm",
                                pa.array(l_init_pose),
                                metadata={"encoding": "jointstate", "duration": "1"},
                            )
                            event = wait_for_event(id="response_l_arm")

        elif event["id"] == "release_right":
            # Move the right arm to a fixed pose in front of the robot with
            # last value 100, wait for the arm's response, then return to the
            # init pose.
            node.send_output(
                "action_r_arm",
                pa.array(
                    [
                        0.4,
                        0,
                        -0.16,
                        0,
                        0,
                        0,
                        100,
                    ],
                ),
                metadata={"encoding": "xyzrpy", "duration": "0.75"},
            )
            # NOTE(review): no timeout, so this blocks until a response_r_arm
            # input, an ERROR event, or the end of the event stream.
            event, cache = wait_for_event(id="response_r_arm", cache=cache)
            node.send_output(
                "action_r_arm",
                pa.array(r_init_pose),
                metadata={"encoding": "jointstate", "duration": 1},
            )
        elif event["id"] == "release_left":
            # Left-arm counterpart of the release_right handling above.
            node.send_output(
                "action_l_arm",
                pa.array(
                    [
                        0.4,
                        0,
                        -0.16,
                        0,
                        0,
                        0,
                        100,
                    ],
                ),
                metadata={"encoding": "xyzrpy", "duration": "0.75"},
            )
            # NOTE(review): no timeout, so this blocks until a response_l_arm
            # input, an ERROR event, or the end of the event stream.
            event, cache = wait_for_event(id="response_l_arm", cache=cache)

            node.send_output(
                "action_l_arm",
                pa.array(l_init_pose),
                metadata={"encoding": "jointstate", "duration": 1},
            )
--- a/examples/reachy2-remote/parse_whisper.py
+++ b/examples/reachy2-remote/parse_whisper.py
@@ -59,8 +59,21 @@ for event in node:
            node.send_output("text", pa.array([text]), {"image_id": "image_left"})
        elif "grab" in text:
            text = f"Given the prompt: {text}. Output the bounding boxes for the given grabbed object"
            node.send_output("text", pa.array([text]), {"image_id": "image_depth"})
        elif "left" in text:
            node.send_output(
                "text", pa.array([text]), {"image_id": "image_depth", "action": "grab"}
            )
        elif "put " in text:
            text = f"Given the prompt: {text}. Output the bounding boxes for the place to put the object"
            node.send_output(
                "text",
                pa.array([text]),
                {"image_id": "image_depth", "action": "release"},
            )
        elif "release left" in text:
            node.send_output("action_release_left", pa.array([1.0]))
        elif "release right" in text:
            node.send_output("action_release_right", pa.array([1.0]))
        elif "turn left" in text:
            action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)])
            node.send_output("action", action)
            time.sleep(0.25)
@@ -70,7 +83,7 @@ for event in node:
            action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)])
            node.send_output("action", action)
            node.send_output("points", pa.array([]))
        elif "right" in text:
        elif "turn right" in text:
            action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)])
            node.send_output("action", action)
            time.sleep(0.25)
--- a/node-hub/dora-object-to-pose/src/lib.rs
+++ b/node-hub/dora-object-to-pose/src/lib.rs
@@ -1,7 +1,7 @@
 use core::f32;
 use dora_node_api::{
    arrow::{
        array::{AsArray, Float64Array, UInt8Array},
        array::{AsArray, Float64Array, UInt16Array, UInt8Array},
        datatypes::{Float32Type, Int64Type},
    },
    dora_core::config::DataId,
@@ -11,7 +11,7 @@ use eyre::Result;
 use std::collections::HashMap;

 fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> {
    let (_x, _y, _z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) =
    let (sum_x, sum_y, sum_z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) =
        points.iter().fold(
            (
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.0, -10.0, 10.0, -10.0, 10., -10.0,
@@ -49,11 +49,7 @@ fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> {
                )
            },
        );
    let (mean_x, mean_y, mean_z) = (
        (x_max + x_min) / 2.,
        (y_max + y_min) / 2.,
        (z_max + z_min) / 2.,
    );
    let (mean_x, mean_y, mean_z) = ((sum_x) / n, (sum_y) / n, (sum_z) / n);

    // Compute covariance and standard deviations
    let cov = sum_xy / n - mean_x * mean_y;
@@ -116,7 +112,8 @@ pub fn lib_main() -> Result<()> {
                    } else {
                        vec![640, 480]
                    };
                    let buffer: &Float64Array = data.as_any().downcast_ref().unwrap();
                    let buffer: &UInt16Array = data.as_any().downcast_ref().unwrap();

                    depth_frame = Some(buffer.clone());
                }
                "masks" => {
@@ -137,6 +134,8 @@ pub fn lib_main() -> Result<()> {
                        continue;
                    };

                    let mut z_2 = 0.0;
                    let mut z_1 = 0.0;
                    let outputs: Vec<Vec<f32>> = masks
                        .chunks(height as usize * width as usize)
                        .filter_map(|data| {
@@ -150,23 +149,36 @@ pub fn lib_main() -> Result<()> {
                                    let v = i as f32 / width as f32; // Calculate y-coordinate (v)

                                    if let Some(z) = z {
                                        let z = z as f32;
                                        let z = (z as f32) / 1000.;
                                        // Skip points that have empty depth or is too far away
                                        if z == 0. || z > 20.0 {
                                            return;
                                        }
                                        if data[i] {
                                            let y = (u - resolution[0] as f32) * z
                                                / focal_length[0] as f32;
                                            let x = (v - resolution[1] as f32) * z
                                                / focal_length[1] as f32;
                                            let new_x = sin_theta * z + cos_theta * x;
                                            let new_y = -y;
                                            let new_z = cos_theta * z - sin_theta * x;
                                        if z_2 == 0. && z_1 == 0. {
                                            z_1 = z;
                                        } else if z_1 == 0. {
                                            z_2 = z_1;
                                            z_1 = z;
                                        } else if (z - z_2).abs() < 0.1 && (z - z_1).abs() < 0.1 {
                                            z_2 = z_1;
                                            z_1 = z;

                                            points.push((new_x, new_y, new_z));
                                            z_total += new_z;
                                            n += 1.;
                                            if data[i] {
                                                let y = (u - resolution[0] as f32) * z
                                                    / focal_length[0] as f32;
                                                let x = (v - resolution[1] as f32) * z
                                                    / focal_length[1] as f32;
                                                let new_x = sin_theta * z + cos_theta * x;
                                                let new_y = -y;
                                                let new_z = cos_theta * z - sin_theta * x;

                                                points.push((new_x, new_y, new_z));
                                                z_total += new_z;
                                                n += 1.;
                                            }
                                        } else {
                                            z_2 = z_1;
                                            z_1 = z;
                                        }
                                    }
                                });
@@ -215,7 +227,7 @@ pub fn lib_main() -> Result<()> {
                                        let v = i as f32 / width as f32; // Calculate y-coordinate (v)

                                        if let Some(z) = z {
                                            let z = z as f32;
                                            let z = (z as f32) / 1000.;
                                            // Skip points that have empty depth or is too far away
                                            if z == 0. || z > 5.0 {
                                                return;
--- a/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py
+++ b/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py
@@ -229,10 +229,12 @@ def main():
                    past_key_values,
                    image_id,
                )
                metadata = event["metadata"]
                metadata["image_id"] = image_id if image_id is not None else "all"
                node.send_output(
                    "text",
                    pa.array([response]),
                    {"image_id": image_id if image_id is not None else "all"},
                    metadata,
                )

        elif event_type == "ERROR":
--- a/node-hub/dora-sam2/dora_sam2/main.py
+++ b/node-hub/dora-sam2/dora_sam2/main.py
@@ -133,7 +133,9 @@ def main():
                                )

            if "boxes2d" in event_id:

                if len(event["value"]) == 0:
                    node.send_output("masks", pa.array([]))
                    continue
                if isinstance(event["value"], pa.StructArray):
                    boxes2d = event["value"][0].get("bbox").values.to_numpy()
                    labels = (
@@ -162,7 +164,59 @@ def main():
                ):
                    predictor.set_image(frames[image_id])
                    masks, _scores, last_pred = predictor.predict(
                        box=boxes2d, point_labels=labels, multimask_output=False,
                        box=boxes2d,
                        point_labels=labels,
                        multimask_output=False,
                    )

                    if len(masks.shape) == 4:
                        masks = masks[:, 0, :, :]
                        last_pred = last_pred[:, 0, :, :]
                    else:
                        masks = masks[0, :, :]
                        last_pred = last_pred[0, :, :]

                    masks = masks > 0
                    metadata["image_id"] = image_id
                    metadata["width"] = frames[image_id].width
                    metadata["height"] = frames[image_id].height
                    ## Mask to 3 channel image
                    match return_type:
                        case pa.Array:
                            node.send_output("masks", pa.array(masks.ravel()), metadata)
                        case pa.StructArray:
                            node.send_output(
                                "masks",
                                pa.array(
                                    [
                                        {
                                            "masks": masks.ravel(),
                                            "labels": event["value"]["labels"],
                                        },
                                    ],
                                ),
                                metadata,
                            )
            elif "points" in event_id:
                points = event["value"].to_numpy().reshape((-1, 2))
                return_type = pa.Array
                if len(frames) == 0:
                    continue
                first_image = next(iter(frames.keys()))
                image_id = event["metadata"].get("image_id", first_image)
                with (
                    torch.inference_mode(),
                    torch.autocast(
                        "cuda",
                        dtype=torch.bfloat16,
                    ),
                ):
                    predictor.set_image(frames[image_id])
                    labels = [i for i in range(len(points))]
                    masks, _scores, last_pred = predictor.predict(
                        points,
                        point_labels=labels,
                        multimask_output=False,
                    )

                    if len(masks.shape) == 4: