diff --git a/examples/reachy2-remote/dataflow_reachy.yml b/examples/reachy2-remote/dataflow_reachy.yml
index dc843baf..494b112d 100644
--- a/examples/reachy2-remote/dataflow_reachy.yml
+++ b/examples/reachy2-remote/dataflow_reachy.yml
@@ -4,17 +4,40 @@ nodes:
     _unstable_deploy:
       machine: encoder
     inputs:
-      tick: dora/timer/millis/10
+      tick: dora/timer/millis/20
    outputs:
       - image_left
       - image_depth
       - depth
     env:
-      CAPTURE_PATH: 0
       IMAGE_WIDTH: 640
       IMAGE_HEIGHT: 480
       ROBOT_IP: 127.0.0.1
 
+  - id: reachy-left-arm
+    build: pip install -e ../../node-hub/dora-reachy2
+    path: dora-reachy2-left-arm
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      pose: parse_pose/action_l_arm
+    outputs:
+      - response_l_arm
+    env:
+      ROBOT_IP: 127.0.0.1
+
+  - id: reachy-right-arm
+    build: pip install -e ../../node-hub/dora-reachy2
+    path: dora-reachy2-right-arm
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      pose: parse_pose/action_r_arm
+    outputs:
+      - response_r_arm
+    env:
+      ROBOT_IP: 127.0.0.1
+
   - id: rav1e-local-image
     path: dora-rav1e
     build: cargo build -p dora-rav1e --release
@@ -26,10 +49,21 @@ nodes:
     outputs:
       - image_left
       - image_depth
-      - depth
     env:
       RAV1E_SPEED: 10
 
+  - id: rav1e-local-depth
+    path: dora-rav1e
+    build: cargo build -p dora-rav1e --release
+    _unstable_deploy:
+      machine: encoder
+    inputs:
+      depth: camera/depth
+    outputs:
+      - depth
+    env:
+      RAV1E_SPEED: 7
+
   - id: dav1d-remote
     path: dora-dav1d
     build: cargo build -p dora-dav1d --release
@@ -38,7 +72,7 @@ nodes:
     inputs:
       image_depth: rav1e-local-image/image_depth
       image_left: rav1e-local-image/image_left
-      # depth: rav1e-local/depth
+      depth: rav1e-local-depth/depth
     outputs:
       - image_left
       - image_depth
@@ -87,6 +121,8 @@ nodes:
       - action
       - points
       - text
+      - action_release_left
+      - action_release_right
     env:
       IMAGE_RESIZE_RATIO: "1.0"
 
@@ -118,6 +154,17 @@ nodes:
     env:
       IMAGE_RESIZE_RATIO: "1.0"
 
+  - id: sam2
+    build: pip install -e ../../node-hub/dora-sam2
+    path: dora-sam2
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      image_depth: dav1d-remote/image_depth
+      boxes2d: parse_bbox/bbox_grab
+    outputs:
+      - masks
+
   - id: tracker
     build: pip install -e ../../node-hub/dora-cotracker
     path: dora-cotracker
@@ -132,24 +179,32 @@ nodes:
     env:
       INTERACTIVE_MODE: false
 
-  # - id: box_coordinates
-  #   build: pip install -e ../../node-hub/dora-object-to-pose
-  #   path: dora-object-to-pose
-  #   inputs:
-  #     depth: reachy-camera/depth
-  #     boxes2d: parse_bbox/bbox
-  #   outputs:
-  #     - pose
-  #- id: sam2
-  #build: pip install -e ../../node-hub/dora-sam2
-  #path: dora-sam2
-  #_unstable_deploy:
-  #machine: gpu
-  #inputs:
-  #image_left: dav1d-remote/image_left
-  #boxes2d: parse_bbox/bbox
-  #outputs:
-  #- masks
+  - id: box_coordinates
+    build: pip install -e ../../node-hub/dora-object-to-pose
+    path: dora-object-to-pose
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      depth: dav1d-remote/depth
+      masks: sam2/masks
+    outputs:
+      - pose
+
+  - id: parse_pose
+    path: parse_pose.py
+    _unstable_deploy:
+      machine: gpu
+    inputs:
+      pose: box_coordinates/pose
+      response_r_arm: reachy-right-arm/response_r_arm
+      response_l_arm: reachy-left-arm/response_l_arm
+      release_left: parse_whisper/action_release_left
+      release_right: parse_whisper/action_release_right
+    outputs:
+      - action_r_arm
+      - action_l_arm
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
 
   - id: parse_point
     path: parse_point.py
@@ -179,12 +234,15 @@ nodes:
     build: pip install -e ../../node-hub/dora-rerun
     path: dora-rerun
     _unstable_deploy:
-      machine: macbook
+      machine: gpu
    inputs:
       image: dav1d-remote/image_left
-      image_depth: dav1d-remote/image_depth
-      boxes2d: parse_bbox/bbox
+      torso/image: dav1d-remote/image_depth
+      torso/depth: dav1d-remote/depth
+      torso/boxes2d: parse_bbox/bbox
       original_text: dora-distil-whisper/text
       parsed_text: parse_whisper/text
       qwenvl_text: dora-qwenvl/text
-      tracked_image: tracker/tracked_image
+    env:
+      RERUN_MEMORY_LIMIT: 5%
+      CAMERA_PITCH: 2.47
diff --git a/examples/reachy2-remote/parse_bbox.py b/examples/reachy2-remote/parse_bbox.py
index 143404ac..88667769 100644
--- a/examples/reachy2-remote/parse_bbox.py
+++ b/examples/reachy2-remote/parse_bbox.py
@@ -54,20 +54,23 @@ for event in node:
             continue
 
         text = event["value"][0].as_py()
+        metadata = event["metadata"]
        image_id = event["metadata"]["image_id"]
         bboxes, labels = extract_bboxes(text)
         if bboxes is not None and len(bboxes) > 0:
             bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO)
+            metadata["image_id"] = image_id
+            metadata["encoding"] = "xyxy"
             if image_id == "image_left":
                 node.send_output(
                     "bbox_track",
                     pa.array(bboxes.ravel()),
-                    metadata={"encoding": "xyxy", "image_id": image_id},
+                    metadata,
                 )
             elif image_id == "image_depth":
                 node.send_output(
                     "bbox_grab",
                     pa.array(bboxes.ravel()),
-                    metadata={"encoding": "xyxy", "image_id": image_id},
+                    metadata,
                 )
diff --git a/examples/reachy2-remote/parse_point.py b/examples/reachy2-remote/parse_point.py
index e3401a4e..7e9990da 100644
--- a/examples/reachy2-remote/parse_point.py
+++ b/examples/reachy2-remote/parse_point.py
@@ -29,18 +29,24 @@ for event in node:
             point = values[-1]
 
             rz = int((width / 2) - point[0]) / (width / 2)
-            x_distance = min(height / 2, height - point[1])
-
-            if abs(rz) > 0.3:
-                rz = np.deg2rad(30) * np.sign(rz)
+            x_distance = min(height, height - point[1])
+
+            if abs(rz) > 0.75:
+                rz = np.deg2rad(90) * np.sign(rz)
+            elif abs(rz) > 0.5:
+                rz = np.deg2rad(60) * np.sign(rz)
+            elif abs(rz) > 0.3:
+                rz = np.deg2rad(55) * np.sign(rz)
             elif abs(rz) > 0.1:
-                rz = np.deg2rad(20) * np.sign(rz)
+                rz = np.deg2rad(45) * np.sign(rz)
             else:
                 x = 0
-                if x_distance > (height * 0.3):
-                    x = 0.7
-                elif x_distance > (height * 0.15):
+                if x_distance > (height * 0.2):
                     x = 0.5
                 else:
                     x = 0
diff --git a/examples/reachy2-remote/parse_pose.py b/examples/reachy2-remote/parse_pose.py
index e69de29b..042b6c0a 100644
--- a/examples/reachy2-remote/parse_pose.py
+++ b/examples/reachy2-remote/parse_pose.py
@@ -0,0 +1,291 @@
+"""Parse object poses and drive the Reachy 2 arms through grab and release trajectories."""
+
+import os
+
+import numpy as np
+import pyarrow as pa
+from dora import Node
+
+node = Node()
+
+IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))
+
+
+l_init_pose = [
+    -7.0631310641087435,
+    -10.432298603362307,
+    24.429809104404114,
+    -132.15000828778648,
+    -1.5494749438811133,
+    -21.749917789205202,
+    8.099312596108344,
+    100,
+]
+r_init_pose = [
+    -5.60273587426976,
+    10.780818397272316,
+    -27.868146823156042,
+    -126.15650363072193,
+    3.961108018106834,
+    -35.43682799906162,
+    350.9236448374495,
+    100,
+]
+r_release_closed_pose = [
+    -26.1507947940993,
+    12.16735021387949,
+    -2.2657319092611976,
+    -97.63648867582175,
+    -19.91084837404425,
+    22.10184328619011,
+    366.71351223614494,
+    0,
+]
+
+r_release_opened_pose = [
+    -26.1507947940993,
+    12.16735021387949,
+    -2.2657319092611976,
+    -97.63648867582175,
+    -19.91084837404425,
+    22.10184328619011,
+    366.71351223614494,
+    100,
+]
+
+l_release_opened_pose = [
+    -30.04330081906935,
+    -7.415231584691132,
+    3.6972339048071468,
+    -97.7274736257555,
+    12.996718740452982,
+    30.838020649757016,
+    -1.5572310505704858,
+    0,
+]
+
+l_release_closed_pose = [
+    -30.04330081906935,
+    -7.415231584691132,
+    3.6972339048071468,
+    -97.7274736257555,
+    12.996718740452982,
+    30.838020649757016,
+    -1.5572310505704858,
+    100,
+]
+
+
+def wait_for_event(id, timeout=None, cache=None):
+    """Wait for the input with the given id, caching any other inputs received while waiting."""
+    if cache is None:
+        cache = {}
+    while True:
+        event = node.next(timeout=timeout)
+        if event is None:
+            cache["finished"] = True
+            return None, cache
+        if event["type"] == "INPUT":
+            cache[event["id"]] = event["value"]
+            if event["id"] == id:
+                return event["value"], cache
+
+        elif event["type"] == "ERROR":
+            return None, cache
+
+
+arm_holding_object = None
+cache = {}
+
+
+## ---- INIT ---
+node.send_output(
+    "action_r_arm",
+    pa.array(r_init_pose),
+    metadata={"encoding": "jointstate", "duration": 2},
+)
+node.send_output(
+    "action_l_arm",
+    pa.array(l_init_pose),
+    metadata={"encoding": "jointstate", "duration": 2},
+)
+
+for event in node:
+    if event["type"] == "INPUT":
+        if event["id"] == "pose":
+            values = event["value"]
+            values = values.to_numpy()
+            print("Pose: ", values)
+            if len(values) == 0:
+                continue
+            x = values[0]
+            y = values[1]
+            z = values[2]
+            action = event["metadata"]["action"]
+
+            match action:
+                case "grab":
+                    if len(values) == 0:
+                        continue
+                    x = x + 0.03
+
+                    ## Clip the maximum and minimum values for the height of the arm to avoid collision or weird movement.
+                    trajectory = np.array(
+                        [
+                            [x, y, -0.16, 0, 0, 0, 100],
+                            [x, y, z, 0, 0, 0, 0],
+                            [x, y, -0.16, 0, 0, 0, 0],
+                        ],
+                    ).ravel()
+
+                    if y < 0:
+                        node.send_output(
+                            "action_r_arm",
+                            pa.array(trajectory),
+                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
+                        )
+                        event = wait_for_event(id="response_r_arm", timeout=5)
+                        if event is not None and event[0]:
+                            print("Success")
+                            arm_holding_object = "right"
+                            node.send_output(
+                                "action_r_arm",
+                                pa.array([0.1, -0.2, -0.16, 0, 0, 0, 0]),
+                                metadata={"encoding": "xyzrpy", "duration": "1"},
+                            )
+                        else:
+                            print("Failed: x: ", x, " y: ", y, " z: ", z)
+                            node.send_output(
+                                "action_r_arm",
+                                pa.array(r_init_pose),
+                                metadata={"encoding": "jointstate", "duration": "1"},
+                            )
+                            event = wait_for_event(id="response_r_arm")
+                    else:
+                        y += 0.03
+                        node.send_output(
+                            "action_l_arm",
+                            pa.array(trajectory),
+                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
+                        )
+                        event = wait_for_event(id="response_l_arm", timeout=5)
+                        if event is not None and event[0]:
+                            print("Success")
+                            arm_holding_object = "left"
+                            node.send_output(
+                                "action_l_arm",
+                                pa.array([0.1, 0.2, -0.16, 0, 0, 0, 0]),
+                                metadata={"encoding": "xyzrpy", "duration": "1"},
+                            )
+                        else:
+                            print("Failed")
+                            node.send_output(
+                                "action_l_arm",
+                                pa.array(l_init_pose),
+                                metadata={"encoding": "jointstate", "duration": "1"},
+                            )
+                            event = wait_for_event(id="response_l_arm")
+                case "release":
+                    if len(values) == 0:
+                        continue
+                    x = x + 0.03
+
+                    ## Clip the maximum and minimum values for the height of the arm to avoid collision or weird movement.
+                    trajectory = np.array(
+                        [
+                            [x, y, -0.16, 0, 0, 0, 100],
+                        ],
+                    ).ravel()
+
+                    if y < 0:
+                        node.send_output(
+                            "action_r_arm",
+                            pa.array(trajectory),
+                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
+                        )
+                        event = wait_for_event(id="response_r_arm", timeout=5)
+                        if event is not None and event[0]:
+                            print("Success")
+                            arm_holding_object = "right"
+                            node.send_output(
+                                "action_r_arm",
+                                pa.array(r_init_pose),
+                                metadata={"encoding": "jointstate", "duration": 1},
+                            )
+                        else:
+                            print("Failed: x: ", x, " y: ", y, " z: ", z)
+                            node.send_output(
+                                "action_r_arm",
+                                pa.array(r_init_pose),
+                                metadata={"encoding": "jointstate", "duration": "1"},
+                            )
+                            event = wait_for_event(id="response_r_arm")
+                    else:
+                        y += 0.03
+                        node.send_output(
+                            "action_l_arm",
+                            pa.array(trajectory),
+                            metadata={"encoding": "xyzrpy", "duration": "0.75"},
+                        )
+                        event = wait_for_event(id="response_l_arm", timeout=5)
+                        if event is not None and event[0]:
+                            print("Success")
+                            arm_holding_object = "left"
+                            node.send_output(
+                                "action_l_arm",
+                                pa.array(l_init_pose),
+                                metadata={"encoding": "jointstate", "duration": 1},
+                            )
+                        else:
+                            print("Failed")
+                            node.send_output(
+                                "action_l_arm",
+                                pa.array(l_init_pose),
+                                metadata={"encoding": "jointstate", "duration": "1"},
+                            )
+                            event = wait_for_event(id="response_l_arm")
+
+        elif event["id"] == "release_right":
+            node.send_output(
+                "action_r_arm",
+                pa.array(
+                    [
+                        0.4,
+                        0,
+                        -0.16,
+                        0,
+                        0,
+                        0,
+                        100,
+                    ],
+                ),
+                metadata={"encoding": "xyzrpy", "duration": "0.75"},
+            )
+            event, cache = wait_for_event(id="response_r_arm", cache=cache)
+            node.send_output(
+                "action_r_arm",
+                pa.array(r_init_pose),
+                metadata={"encoding": "jointstate", "duration": 1},
+            )
+        elif event["id"] == "release_left":
+            node.send_output(
+                "action_l_arm",
+                pa.array(
+                    [
+                        0.4,
+                        0,
+                        -0.16,
+                        0,
+                        0,
+                        0,
+                        100,
+                    ],
+                ),
+                metadata={"encoding": "xyzrpy", "duration": "0.75"},
+            )
+            event, cache = wait_for_event(id="response_l_arm", cache=cache)
+
+            node.send_output(
+                "action_l_arm",
+                pa.array(l_init_pose),
+                metadata={"encoding": "jointstate", "duration": 1},
+            )
diff --git a/examples/reachy2-remote/parse_whisper.py b/examples/reachy2-remote/parse_whisper.py
index 74211806..99a5e47f 100644
--- a/examples/reachy2-remote/parse_whisper.py
+++ b/examples/reachy2-remote/parse_whisper.py
@@ -59,8 +59,21 @@ for event in node:
             node.send_output("text", pa.array([text]), {"image_id": "image_left"})
         elif "grab" in text:
             text = f"Given the prompt: {text}. Output the bounding boxes for the given grabbed object"
-            node.send_output("text", pa.array([text]), {"image_id": "image_depth"})
-        elif "left" in text:
+            node.send_output(
+                "text", pa.array([text]), {"image_id": "image_depth", "action": "grab"}
+            )
+        elif "put " in text:
+            text = f"Given the prompt: {text}. Output the bounding boxes for the place to put the object"
+            node.send_output(
+                "text",
+                pa.array([text]),
+                {"image_id": "image_depth", "action": "release"},
+            )
+        elif "release left" in text:
+            node.send_output("action_release_left", pa.array([1.0]))
+        elif "release right" in text:
+            node.send_output("action_release_right", pa.array([1.0]))
+        elif "turn left" in text:
             action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)])
             node.send_output("action", action)
             time.sleep(0.25)
@@ -70,7 +83,7 @@ for event in node:
             action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)])
             node.send_output("action", action)
             node.send_output("points", pa.array([]))
-        elif "right" in text:
+        elif "turn right" in text:
             action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)])
             node.send_output("action", action)
             time.sleep(0.25)
diff --git a/node-hub/dora-object-to-pose/src/lib.rs b/node-hub/dora-object-to-pose/src/lib.rs
index 8c2a3779..98aadad1 100644
--- a/node-hub/dora-object-to-pose/src/lib.rs
+++ b/node-hub/dora-object-to-pose/src/lib.rs
@@ -1,7 +1,7 @@
 use core::f32;
 use dora_node_api::{
     arrow::{
-        array::{AsArray, Float64Array, UInt8Array},
+        array::{AsArray, Float64Array, UInt16Array, UInt8Array},
         datatypes::{Float32Type, Int64Type},
     },
     dora_core::config::DataId,
@@ -11,7 +11,7 @@ use eyre::Result;
 use std::collections::HashMap;
 
 fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> {
-    let (_x, _y, _z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) =
+    let (sum_x, sum_y, sum_z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) =
         points.iter().fold(
             (
                 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.0, -10.0, 10.0, -10.0, 10., -10.0,
             ),
@@ -49,11 +49,7 @@ fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> {
             )
         },
     );
-    let (mean_x, mean_y, mean_z) = (
-        (x_max + x_min) / 2.,
-        (y_max + y_min) / 2.,
-        (z_max + z_min) / 2.,
-    );
+    let (mean_x, mean_y, mean_z) = (sum_x / n, sum_y / n, sum_z / n);
 
     // Compute covariance and standard deviations
     let cov = sum_xy / n - mean_x * mean_y;
@@ -116,7 +112,8 @@ pub fn lib_main() -> Result<()> {
                     } else {
                         vec![640, 480]
                     };
-                    let buffer: &Float64Array = data.as_any().downcast_ref().unwrap();
+                    let buffer: &UInt16Array = data.as_any().downcast_ref().unwrap();
+
                     depth_frame = Some(buffer.clone());
                 }
                 "masks" => {
@@ -137,6 +134,8 @@ pub fn lib_main() -> Result<()> {
                         continue;
                     };
 
+                    let mut z_2 = 0.0;
+                    let mut z_1 = 0.0;
                     let outputs: Vec<Vec<f32>> = masks
                         .chunks(height as usize * width as usize)
                         .filter_map(|data| {
@@ -150,23 +149,36 @@ pub fn lib_main() -> Result<()> {
                                 let v = i as f32 / width as f32; // Calculate y-coordinate (v)
 
                                 if let Some(z) = z {
-                                    let z = z as f32;
+                                    let z = (z as f32) / 1000.;
 
                                     // Skip points that have empty depth or is too far away
                                     if z == 0. || z > 20.0 {
                                         return;
                                     }
-                                    if data[i] {
-                                        let y = (u - resolution[0] as f32) * z
-                                            / focal_length[0] as f32;
-                                        let x = (v - resolution[1] as f32) * z
-                                            / focal_length[1] as f32;
-                                        let new_x = sin_theta * z + cos_theta * x;
-                                        let new_y = -y;
-                                        let new_z = cos_theta * z - sin_theta * x;
+                                    if z_2 == 0. && z_1 == 0. {
+                                        z_1 = z;
+                                    } else if z_2 == 0. {
+                                        z_2 = z_1;
+                                        z_1 = z;
+                                    } else if (z - z_2).abs() < 0.1 && (z - z_1).abs() < 0.1 {
+                                        z_2 = z_1;
+                                        z_1 = z;
 
-                                        points.push((new_x, new_y, new_z));
-                                        z_total += new_z;
-                                        n += 1.;
+                                        if data[i] {
+                                            let y = (u - resolution[0] as f32) * z
+                                                / focal_length[0] as f32;
+                                            let x = (v - resolution[1] as f32) * z
+                                                / focal_length[1] as f32;
+                                            let new_x = sin_theta * z + cos_theta * x;
+                                            let new_y = -y;
+                                            let new_z = cos_theta * z - sin_theta * x;
+
+                                            points.push((new_x, new_y, new_z));
+                                            z_total += new_z;
+                                            n += 1.;
+                                        }
+                                    } else {
+                                        z_2 = z_1;
+                                        z_1 = z;
                                     }
                                 }
                             });
@@ -215,7 +227,7 @@ pub fn lib_main() -> Result<()> {
                                 let v = i as f32 / width as f32; // Calculate y-coordinate (v)
 
                                 if let Some(z) = z {
-                                    let z = z as f32;
+                                    let z = (z as f32) / 1000.;
 
                                     // Skip points that have empty depth or is too far away
                                     if z == 0. || z > 5.0 {
                                         return;
diff --git a/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py b/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py
index 8a7ade0c..3125858c 100644
--- a/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py
+++ b/node-hub/dora-qwen2-5-vl/dora_qwen2_5_vl/main.py
@@ -229,10 +229,12 @@ def main():
                 past_key_values,
                 image_id,
             )
+            metadata = event["metadata"]
+            metadata["image_id"] = image_id if image_id is not None else "all"
             node.send_output(
                 "text",
                 pa.array([response]),
-                {"image_id": image_id if image_id is not None else "all"},
+                metadata,
             )
 
         elif event_type == "ERROR":
diff --git a/node-hub/dora-sam2/dora_sam2/main.py b/node-hub/dora-sam2/dora_sam2/main.py
index d2612cac..37b216a9 100644
--- a/node-hub/dora-sam2/dora_sam2/main.py
+++ b/node-hub/dora-sam2/dora_sam2/main.py
@@ -133,7 +133,9 @@ def main():
                     )
 
             if "boxes2d" in event_id:
-
+                if len(event["value"]) == 0:
+                    node.send_output("masks", pa.array([]))
+                    continue
                 if isinstance(event["value"], pa.StructArray):
                     boxes2d = event["value"][0].get("bbox").values.to_numpy()
                     labels = (
@@ -162,7 +164,59 @@ def main():
                 ):
                     predictor.set_image(frames[image_id])
                     masks, _scores, last_pred = predictor.predict(
-                        box=boxes2d, point_labels=labels, multimask_output=False,
+                        box=boxes2d,
+                        point_labels=labels,
+                        multimask_output=False,
                     )
+
+                    if len(masks.shape) == 4:
+                        masks = masks[:, 0, :, :]
+                        last_pred = last_pred[:, 0, :, :]
+                    else:
+                        masks = masks[0, :, :]
+                        last_pred = last_pred[0, :, :]
+
+                    masks = masks > 0
+                    metadata["image_id"] = image_id
+                    metadata["width"] = frames[image_id].width
+                    metadata["height"] = frames[image_id].height
+                    ## Mask to 3 channel image
+                    match return_type:
+                        case pa.Array:
+                            node.send_output("masks", pa.array(masks.ravel()), metadata)
+                        case pa.StructArray:
+                            node.send_output(
+                                "masks",
+                                pa.array(
+                                    [
+                                        {
+                                            "masks": masks.ravel(),
+                                            "labels": event["value"]["labels"],
+                                        },
+                                    ],
+                                ),
+                                metadata,
+                            )
+            elif "points" in event_id:
+                points = event["value"].to_numpy().reshape((-1, 2))
+                return_type = pa.Array
+                if len(frames) == 0:
+                    continue
+                first_image = next(iter(frames.keys()))
+                image_id = event["metadata"].get("image_id", first_image)
+                with (
+                    torch.inference_mode(),
+                    torch.autocast(
+                        "cuda",
+                        dtype=torch.bfloat16,
+                    ),
+                ):
+                    predictor.set_image(frames[image_id])
+                    labels = [i for i in range(len(points))]
+                    masks, _scores, last_pred = predictor.predict(
+                        points,
+                        point_labels=labels,
+                        multimask_output=False,
+                    )
 
                     if len(masks.shape) == 4:
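
Note on the control flow above: every grab/release branch in parse_pose.py follows the same send-then-acknowledge pattern against the dora Python API — publish a trajectory on an action output, then block until the matching response input arrives or a timeout elapses. A minimal sketch of that pattern, assuming the output/input names from the dataflow above (the send_and_wait helper itself is hypothetical, not part of the example):

import pyarrow as pa
from dora import Node

node = Node()


def send_and_wait(action_id, response_id, trajectory, timeout=5):
    """Publish a trajectory, then block until the arm node acknowledges it."""
    node.send_output(
        action_id,
        pa.array(trajectory),
        metadata={"encoding": "xyzrpy", "duration": "0.75"},
    )
    while True:
        event = node.next(timeout=timeout)
        if event is None:  # timed out waiting for the arm
            return None
        if event["type"] == "INPUT" and event["id"] == response_id:
            return event["value"]  # acknowledgement payload from the arm node
        if event["type"] == "ERROR":
            return None


# Usage: reach toward (x=0.3, y=-0.2) with the right arm, gripper open (100).
ack = send_and_wait("action_r_arm", "response_r_arm", [0.3, -0.2, -0.16, 0, 0, 0, 100])
print("Success" if ack is not None else "Failed")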