| @@ -4,17 +4,40 @@ nodes: | |||
| _unstable_deploy: | |||
| machine: encoder | |||
| inputs: | |||
| tick: dora/timer/millis/10 | |||
| tick: dora/timer/millis/20 | |||
| outputs: | |||
| - image_left | |||
| - image_depth | |||
| - depth | |||
| env: | |||
| CAPTURE_PATH: 0 | |||
| IMAGE_WIDTH: 640 | |||
| IMAGE_HEIGHT: 480 | |||
| ROBOT_IP: 127.0.0.1 | |||
| - id: reachy-left-arm | |||
| build: pip install -e ../../node-hub/dora-reachy2 | |||
| path: dora-reachy2-left-arm | |||
| _unstable_deploy: | |||
| machine: encoder | |||
| inputs: | |||
| pose: parse_pose/action_l_arm | |||
| outputs: | |||
| - response_l_arm | |||
| env: | |||
| ROBOT_IP: 127.0.0.1 | |||
| - id: reachy-right-arm | |||
| build: pip install -e ../../node-hub/dora-reachy2 | |||
| path: dora-reachy2-right-arm | |||
| _unstable_deploy: | |||
| machine: encoder | |||
| inputs: | |||
| pose: parse_pose/action_r_arm | |||
| outputs: | |||
| - response_r_arm | |||
| env: | |||
| ROBOT_IP: 127.0.0.1 | |||
| - id: rav1e-local-image | |||
| path: dora-rav1e | |||
| build: cargo build -p dora-rav1e --release | |||
| @@ -26,10 +49,21 @@ nodes: | |||
| outputs: | |||
| - image_left | |||
| - image_depth | |||
| - depth | |||
| env: | |||
| RAV1E_SPEED: 10 | |||
| - id: rav1e-local-depth | |||
| path: dora-rav1e | |||
| build: cargo build -p dora-rav1e --release | |||
| _unstable_deploy: | |||
| machine: encoder | |||
| inputs: | |||
| depth: camera/depth | |||
| outputs: | |||
| - depth | |||
| env: | |||
| RAV1E_SPEED: 7 | |||
| - id: dav1d-remote | |||
| path: dora-dav1d | |||
| build: cargo build -p dora-dav1d --release | |||
| @@ -38,7 +72,7 @@ nodes: | |||
| inputs: | |||
| image_depth: rav1e-local-image/image_depth | |||
| image_left: rav1e-local-image/image_left | |||
| # depth: rav1e-local/depth | |||
| depth: rav1e-local-depth/depth | |||
| outputs: | |||
| - image_left | |||
| - image_depth | |||
| @@ -87,6 +121,8 @@ nodes: | |||
| - action | |||
| - points | |||
| - text | |||
| - action_release_left | |||
| - action_release_right | |||
| env: | |||
| IMAGE_RESIZE_RATIO: "1.0" | |||
| @@ -118,6 +154,17 @@ nodes: | |||
| env: | |||
| IMAGE_RESIZE_RATIO: "1.0" | |||
| - id: sam2 | |||
| build: pip install -e ../../node-hub/dora-sam2 | |||
| path: dora-sam2 | |||
| _unstable_deploy: | |||
| machine: gpu | |||
| inputs: | |||
| image_depth: dav1d-remote/image_depth | |||
| boxes2d: parse_bbox/bbox_grab | |||
| outputs: | |||
| - masks | |||
| - id: tracker | |||
| build: pip install -e ../../node-hub/dora-cotracker | |||
| path: dora-cotracker | |||
| @@ -132,24 +179,32 @@ nodes: | |||
| env: | |||
| INTERACTIVE_MODE: false | |||
| # - id: box_coordinates | |||
| # build: pip install -e ../../node-hub/dora-object-to-pose | |||
| # path: dora-object-to-pose | |||
| # inputs: | |||
| # depth: reachy-camera/depth | |||
| # boxes2d: parse_bbox/bbox | |||
| # outputs: | |||
| # - pose | |||
| #- id: sam2 | |||
| #build: pip install -e ../../node-hub/dora-sam2 | |||
| #path: dora-sam2 | |||
| #_unstable_deploy: | |||
| #machine: gpu | |||
| #inputs: | |||
| #image_left: dav1d-remote/image_left | |||
| #boxes2d: parse_bbox/bbox | |||
| #outputs: | |||
| #- masks | |||
| - id: box_coordinates | |||
| build: pip install -e ../../node-hub/dora-object-to-pose | |||
| path: dora-object-to-pose | |||
| _unstable_deploy: | |||
| machine: gpu | |||
| inputs: | |||
| depth: dav1d-remote/depth | |||
| masks: sam2/masks | |||
| outputs: | |||
| - pose | |||
| - id: parse_pose | |||
| path: parse_pose.py | |||
| _unstable_deploy: | |||
| machine: gpu | |||
| inputs: | |||
| pose: box_coordinates/pose | |||
| response_r_arm: reachy-right-arm/response_r_arm | |||
| response_l_arm: reachy-left-arm/response_l_arm | |||
| release_left: parse_whisper/action_release_left | |||
| release_right: parse_whisper/action_release_right | |||
| outputs: | |||
| - action_r_arm | |||
| - action_l_arm | |||
| env: | |||
| IMAGE_RESIZE_RATIO: "1.0" | |||
| - id: parse_point | |||
| path: parse_point.py | |||
| @@ -179,12 +234,15 @@ nodes: | |||
| build: pip install -e ../../node-hub/dora-rerun | |||
| path: dora-rerun | |||
| _unstable_deploy: | |||
| machine: macbook | |||
| machine: gpu | |||
| inputs: | |||
| image: dav1d-remote/image_left | |||
| image_depth: dav1d-remote/image_depth | |||
| boxes2d: parse_bbox/bbox | |||
| torso/image: dav1d-remote/image_depth | |||
| torso/depth: dav1d-remote/depth | |||
| torso/boxes2d: parse_bbox/bbox | |||
| original_text: dora-distil-whisper/text | |||
| parsed_text: parse_whisper/text | |||
| qwenvl_text: dora-qwenvl/text | |||
| tracked_image: tracker/tracked_image | |||
| env: | |||
| RERUN_MEMORY_LIMIT: 5% | |||
| CAMERA_PITCH: 2.47 | |||
| @@ -54,20 +54,23 @@ for event in node: | |||
| continue | |||
| text = event["value"][0].as_py() | |||
| metadata = event["metadata"] | |||
| image_id = event["metadata"]["image_id"] | |||
| bboxes, labels = extract_bboxes(text) | |||
| if bboxes is not None and len(bboxes) > 0: | |||
| bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO) | |||
| metadata["image_id"] = image_id | |||
| metadata["encoding"] = "xyxy" | |||
| if image_id == "image_left": | |||
| node.send_output( | |||
| "bbox_track", | |||
| pa.array(bboxes.ravel()), | |||
| metadata={"encoding": "xyxy", "image_id": image_id}, | |||
| metadata, | |||
| ) | |||
| elif image_id == "image_depth": | |||
| node.send_output( | |||
| "bbox_grab", | |||
| pa.array(bboxes.ravel()), | |||
| metadata={"encoding": "xyxy", "image_id": image_id}, | |||
| metadata, | |||
| ) | |||
| @@ -29,18 +29,24 @@ for event in node: | |||
| point = values[-1] | |||
| rz = int((width / 2) - point[0]) / (width / 2) | |||
| x_distance = min(height / 2, height - point[1]) | |||
| if abs(rz) > 0.3: | |||
| rz = np.deg2rad(30) * np.sign(rz) | |||
| x_distance = min(height, height - point[1]) | |||
| if abs(rz) > 0.75: | |||
| rz = np.deg2rad(90) * np.sign(rz) | |||
| if abs(rz) > 0.5: | |||
| rz = np.deg2rad(60) * np.sign(rz) | |||
| elif abs(rz) > 0.3: | |||
| rz = np.deg2rad(55) * np.sign(rz) | |||
| elif abs(rz) > 0.1: | |||
| rz = np.deg2rad(20) * np.sign(rz) | |||
| rz = np.deg2rad(45) * np.sign(rz) | |||
| else: | |||
| x = 0 | |||
| if x_distance > (height * 0.3): | |||
| x = 0.7 | |||
| elif x_distance > (height * 0.15): | |||
| if x_distance > (height * 0.7): | |||
| x = 0.5 | |||
| elif x_distance > (height * 0.5): | |||
| x = 0.5 | |||
| elif x_distance > (height * 0.2): | |||
| x = 0.5 | |||
| else: | |||
| x = 0 | |||
| @@ -0,0 +1,291 @@ | |||
| """TODO: Add docstring.""" | |||
| import json | |||
| import os | |||
| import numpy as np | |||
| import pyarrow as pa | |||
| from dora import Node | |||
# Handle to this process's dora node; the rest of the file uses it to receive
# events and to publish arm commands.
node = Node()

# Resize ratio read from the environment (defaults to 1.0).
# NOTE(review): not referenced anywhere else in the visible part of this file.
IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))

# Rest pose for the left arm. Sent with encoding "jointstate" at start-up and
# whenever the left arm is sent back after an action.
# NOTE(review): eight values; the last one is presumably the gripper opening
# (0-100) -- confirm against the arm node.
l_init_pose = [
    -7.0631310641087435,
    -10.432298603362307,
    24.429809104404114,
    -132.15000828778648,
    -1.5494749438811133,
    -21.749917789205202,
    8.099312596108344,
    100,
]

# Rest pose for the right arm; used the same way as l_init_pose.
r_init_pose = [
    -5.60273587426976,
    10.780818397272316,
    -27.868146823156042,
    -126.15650363072193,
    3.961108018106834,
    -35.43682799906162,
    350.9236448374495,
    100,
]

# Right-arm release poses. The two lists differ only in their last value
# (0 here, 100 in the "opened" variant).
# NOTE(review): neither release pose is referenced in the visible part of
# this file.
r_release_closed_pose = [
    -26.1507947940993,
    12.16735021387949,
    -2.2657319092611976,
    -97.63648867582175,
    -19.91084837404425,
    22.10184328619011,
    366.71351223614494,
    0,
]

r_release_opened_pose = [
    -26.1507947940993,
    12.16735021387949,
    -2.2657319092611976,
    -97.63648867582175,
    -19.91084837404425,
    22.10184328619011,
    366.71351223614494,
    100,
]

# Left-arm release poses; again identical except for the last value.
# NOTE(review): the naming is the opposite of the right arm -- here "opened"
# ends in 0 and "closed" ends in 100, whereas r_release_closed_pose ends in 0
# and r_release_opened_pose ends in 100. Looks like the two names are
# swapped; confirm before using them.
l_release_opened_pose = [
    -30.04330081906935,
    -7.415231584691132,
    3.6972339048071468,
    -97.7274736257555,
    12.996718740452982,
    30.838020649757016,
    -1.5572310505704858,
    0,
]

l_release_closed_pose = [
    -30.04330081906935,
    -7.415231584691132,
    3.6972339048071468,
    -97.7274736257555,
    12.996718740452982,
    30.838020649757016,
    -1.5572310505704858,
    100,
]
def wait_for_event(id, timeout=None, cache=None):
    """Block until the input named ``id`` arrives, recording inputs seen meanwhile.

    Events are pulled from the module-level ``node`` one at a time. Every
    INPUT event encountered while waiting is stored in ``cache`` under its
    id, so inputs that arrive "in between" are not lost to the caller.

    Args:
        id: Identifier of the input to wait for. The name shadows the
            builtin but is kept because callers pass it by keyword.
        timeout: Forwarded unchanged to ``node.next`` on every poll.
        cache: Optional dict updated in place with the latest value of each
            input id observed. When omitted, a fresh dict is created for
            this call (a shared mutable default would leak cached inputs
            and the "finished" flag between unrelated calls).

    Returns:
        A ``(value, cache)`` tuple. ``value`` is the awaited input's value,
        or ``None`` when ``node.next`` returned ``None`` (in which case
        ``cache["finished"]`` is set to ``True``) or when an ERROR event
        was received.
    """
    if cache is None:
        cache = {}

    while True:
        event = node.next(timeout=timeout)
        if event is None:
            # Nothing more was delivered by node.next; flag it for the caller.
            cache["finished"] = True
            return None, cache

        event_type = event["type"]
        if event_type == "INPUT":
            cache[event["id"]] = event["value"]
            if event["id"] == id:
                return event["value"], cache
        elif event_type == "ERROR":
            return None, cache
# Main script: puts both arms in their rest pose, then turns incoming "pose"
# and "release_*" inputs into arm commands, waiting for the arm's response
# after each motion.

# Side ("right"/"left") of the arm that last completed a pose action, or None.
arm_holding_object = None
# Latest value of every input seen while waiting for arm responses.
cache = {}

# Per-arm wiring: action output id, response input id, rest pose, and the
# pose the arm retreats to after a successful grab.
_ARMS = {
    "right": {
        "action": "action_r_arm",
        "response": "response_r_arm",
        "init_pose": r_init_pose,
        "carry_pose": [0.1, -0.2, -0.16, 0, 0, 0, 0],
    },
    "left": {
        "action": "action_l_arm",
        "response": "response_l_arm",
        "init_pose": l_init_pose,
        "carry_pose": [0.1, 0.2, -0.16, 0, 0, 0, 0],
    },
}


def _send_arm(arm, values, encoding, duration):
    """Publish one command on the given arm's action output."""
    node.send_output(
        arm["action"],
        pa.array(values),
        metadata={"encoding": encoding, "duration": duration},
    )


def _response_ok(response):
    """Return True when an arm response reports success in its first element.

    ``None`` (no response) and empty responses count as failure.
    NOTE(review): assumes the arm node answers with a one-element boolean
    array -- confirm against the arm node.
    """
    if response is None or len(response) == 0:
        return False
    first = response[0]
    # Arrow scalars need .as_py(); plain Python/numpy values are used as-is.
    return bool(first.as_py() if hasattr(first, "as_py") else first)


## ---- INIT ---
_send_arm(_ARMS["right"], r_init_pose, "jointstate", 2)
_send_arm(_ARMS["left"], l_init_pose, "jointstate", 2)

for event in node:
    if event["type"] != "INPUT":
        continue

    event_id = event["id"]
    if event_id == "pose":
        values = event["value"].to_numpy()
        print("Pose: ", values)
        # x, y and z are all required; skip empty or truncated poses instead
        # of failing on the index below.
        if len(values) < 3:
            continue
        x, y, z = values[0], values[1], values[2]

        action = event["metadata"]["action"]
        if action not in ("grab", "release"):
            continue

        x = x + 0.03
        # The arm always approaches at a fixed height of -0.16 with the last
        # value at 100; a grab then goes down to the target height with the
        # last value at 0 and comes back up to the approach height.
        approach = [x, y, -0.16, 0, 0, 0, 100]
        if action == "grab":
            waypoints = [
                approach,
                [x, y, z, 0, 0, 0, 0],
                [x, y, -0.16, 0, 0, 0, 0],
            ]
        else:
            waypoints = [approach]
        trajectory = np.array(waypoints).ravel()

        # Targets with negative y go to the right arm, the rest to the left.
        # NOTE(review): the original added 0.03 to y for the left arm *after*
        # the trajectory had been built, so it never affected the motion; that
        # no-op is omitted. If a lateral offset is wanted, apply it before
        # building the trajectory.
        side = "right" if y < 0 else "left"
        arm = _ARMS[side]

        _send_arm(arm, trajectory, "xyzrpy", "0.75")
        # wait_for_event returns (value, cache); the value is the response.
        response, cache = wait_for_event(
            id=arm["response"], timeout=5, cache=cache
        )
        if _response_ok(response):
            print("Success")
            # NOTE(review): this is also set after a successful "release",
            # as in the original; the variable is not read in this file.
            arm_holding_object = side
            if action == "grab":
                _send_arm(arm, arm["carry_pose"], "xyzrpy", "1")
            else:
                _send_arm(arm, arm["init_pose"], "jointstate", 1)
        else:
            if side == "right":
                print("Failed: x: ", x, " y: ", y, " z: ", z)
            else:
                print("Failed")
            # Go back to the rest pose and wait until the arm has answered.
            _send_arm(arm, arm["init_pose"], "jointstate", "1")
            _, cache = wait_for_event(id=arm["response"], cache=cache)

    elif event_id in ("release_right", "release_left"):
        arm = _ARMS["right" if event_id == "release_right" else "left"]
        # Move to a fixed pose in front of the robot with the last value at
        # 100, wait for the arm's answer, then return to the rest pose.
        _send_arm(arm, [0.4, 0, -0.16, 0, 0, 0, 100], "xyzrpy", "0.75")
        _, cache = wait_for_event(id=arm["response"], cache=cache)
        _send_arm(arm, arm["init_pose"], "jointstate", 1)
| @@ -59,8 +59,21 @@ for event in node: | |||
| node.send_output("text", pa.array([text]), {"image_id": "image_left"}) | |||
| elif "grab" in text: | |||
| text = f"Given the prompt: {text}. Output the bounding boxes for the given grabbed object" | |||
| node.send_output("text", pa.array([text]), {"image_id": "image_depth"}) | |||
| elif "left" in text: | |||
| node.send_output( | |||
| "text", pa.array([text]), {"image_id": "image_depth", "action": "grab"} | |||
| ) | |||
| elif "put " in text: | |||
| text = f"Given the prompt: {text}. Output the bounding boxes for the place to put the object" | |||
| node.send_output( | |||
| "text", | |||
| pa.array([text]), | |||
| {"image_id": "image_depth", "action": "release"}, | |||
| ) | |||
| elif "release left" in text: | |||
| node.send_output("action_release_left", pa.array([1.0])) | |||
| elif "release right" in text: | |||
| node.send_output("action_release_right", pa.array([1.0])) | |||
| elif "turn left" in text: | |||
| action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)]) | |||
| node.send_output("action", action) | |||
| time.sleep(0.25) | |||
| @@ -70,7 +83,7 @@ for event in node: | |||
| action = pa.array([0.0, 0, 0, 0, 0, np.deg2rad(160)]) | |||
| node.send_output("action", action) | |||
| node.send_output("points", pa.array([])) | |||
| elif "right" in text: | |||
| elif "turn right" in text: | |||
| action = pa.array([0.0, 0, 0, 0, 0, -np.deg2rad(160)]) | |||
| node.send_output("action", action) | |||
| time.sleep(0.25) | |||
| @@ -1,7 +1,7 @@ | |||
| use core::f32; | |||
| use dora_node_api::{ | |||
| arrow::{ | |||
| array::{AsArray, Float64Array, UInt8Array}, | |||
| array::{AsArray, Float64Array, UInt16Array, UInt8Array}, | |||
| datatypes::{Float32Type, Int64Type}, | |||
| }, | |||
| dora_core::config::DataId, | |||
| @@ -11,7 +11,7 @@ use eyre::Result; | |||
| use std::collections::HashMap; | |||
| fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> { | |||
| let (_x, _y, _z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) = | |||
| let (sum_x, sum_y, sum_z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) = | |||
| points.iter().fold( | |||
| ( | |||
| 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.0, -10.0, 10.0, -10.0, 10., -10.0, | |||
| @@ -49,11 +49,7 @@ fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> { | |||
| ) | |||
| }, | |||
| ); | |||
| let (mean_x, mean_y, mean_z) = ( | |||
| (x_max + x_min) / 2., | |||
| (y_max + y_min) / 2., | |||
| (z_max + z_min) / 2., | |||
| ); | |||
| let (mean_x, mean_y, mean_z) = ((sum_x) / n, (sum_y) / n, (sum_z) / n); | |||
| // Compute covariance and standard deviations | |||
| let cov = sum_xy / n - mean_x * mean_y; | |||
| @@ -116,7 +112,8 @@ pub fn lib_main() -> Result<()> { | |||
| } else { | |||
| vec![640, 480] | |||
| }; | |||
| let buffer: &Float64Array = data.as_any().downcast_ref().unwrap(); | |||
| let buffer: &UInt16Array = data.as_any().downcast_ref().unwrap(); | |||
| depth_frame = Some(buffer.clone()); | |||
| } | |||
| "masks" => { | |||
| @@ -137,6 +134,8 @@ pub fn lib_main() -> Result<()> { | |||
| continue; | |||
| }; | |||
| let mut z_2 = 0.0; | |||
| let mut z_1 = 0.0; | |||
| let outputs: Vec<Vec<f32>> = masks | |||
| .chunks(height as usize * width as usize) | |||
| .filter_map(|data| { | |||
| @@ -150,23 +149,36 @@ pub fn lib_main() -> Result<()> { | |||
| let v = i as f32 / width as f32; // Calculate y-coordinate (v) | |||
| if let Some(z) = z { | |||
| let z = z as f32; | |||
| let z = (z as f32) / 1000.; | |||
| // Skip points that have empty depth or is too far away | |||
| if z == 0. || z > 20.0 { | |||
| return; | |||
| } | |||
| if data[i] { | |||
| let y = (u - resolution[0] as f32) * z | |||
| / focal_length[0] as f32; | |||
| let x = (v - resolution[1] as f32) * z | |||
| / focal_length[1] as f32; | |||
| let new_x = sin_theta * z + cos_theta * x; | |||
| let new_y = -y; | |||
| let new_z = cos_theta * z - sin_theta * x; | |||
| if z_2 == 0. && z_1 == 0. { | |||
| z_1 = z; | |||
| } else if z_1 == 0. { | |||
| z_2 = z_1; | |||
| z_1 = z; | |||
| } else if (z - z_2).abs() < 0.1 && (z - z_1).abs() < 0.1 { | |||
| z_2 = z_1; | |||
| z_1 = z; | |||
| points.push((new_x, new_y, new_z)); | |||
| z_total += new_z; | |||
| n += 1.; | |||
| if data[i] { | |||
| let y = (u - resolution[0] as f32) * z | |||
| / focal_length[0] as f32; | |||
| let x = (v - resolution[1] as f32) * z | |||
| / focal_length[1] as f32; | |||
| let new_x = sin_theta * z + cos_theta * x; | |||
| let new_y = -y; | |||
| let new_z = cos_theta * z - sin_theta * x; | |||
| points.push((new_x, new_y, new_z)); | |||
| z_total += new_z; | |||
| n += 1.; | |||
| } | |||
| } else { | |||
| z_2 = z_1; | |||
| z_1 = z; | |||
| } | |||
| } | |||
| }); | |||
| @@ -215,7 +227,7 @@ pub fn lib_main() -> Result<()> { | |||
| let v = i as f32 / width as f32; // Calculate y-coordinate (v) | |||
| if let Some(z) = z { | |||
| let z = z as f32; | |||
| let z = (z as f32) / 1000.; | |||
| // Skip points that have empty depth or is too far away | |||
| if z == 0. || z > 5.0 { | |||
| return; | |||
| @@ -229,10 +229,12 @@ def main(): | |||
| past_key_values, | |||
| image_id, | |||
| ) | |||
| metadata = event["metadata"] | |||
| metadata["image_id"] = image_id if image_id is not None else "all" | |||
| node.send_output( | |||
| "text", | |||
| pa.array([response]), | |||
| {"image_id": image_id if image_id is not None else "all"}, | |||
| metadata, | |||
| ) | |||
| elif event_type == "ERROR": | |||
| @@ -133,7 +133,9 @@ def main(): | |||
| ) | |||
| if "boxes2d" in event_id: | |||
| if len(event["value"]) == 0: | |||
| node.send_output("masks", pa.array([])) | |||
| continue | |||
| if isinstance(event["value"], pa.StructArray): | |||
| boxes2d = event["value"][0].get("bbox").values.to_numpy() | |||
| labels = ( | |||
| @@ -162,7 +164,59 @@ def main(): | |||
| ): | |||
| predictor.set_image(frames[image_id]) | |||
| masks, _scores, last_pred = predictor.predict( | |||
| box=boxes2d, point_labels=labels, multimask_output=False, | |||
| box=boxes2d, | |||
| point_labels=labels, | |||
| multimask_output=False, | |||
| ) | |||
| if len(masks.shape) == 4: | |||
| masks = masks[:, 0, :, :] | |||
| last_pred = last_pred[:, 0, :, :] | |||
| else: | |||
| masks = masks[0, :, :] | |||
| last_pred = last_pred[0, :, :] | |||
| masks = masks > 0 | |||
| metadata["image_id"] = image_id | |||
| metadata["width"] = frames[image_id].width | |||
| metadata["height"] = frames[image_id].height | |||
| ## Mask to 3 channel image | |||
| match return_type: | |||
| case pa.Array: | |||
| node.send_output("masks", pa.array(masks.ravel()), metadata) | |||
| case pa.StructArray: | |||
| node.send_output( | |||
| "masks", | |||
| pa.array( | |||
| [ | |||
| { | |||
| "masks": masks.ravel(), | |||
| "labels": event["value"]["labels"], | |||
| }, | |||
| ], | |||
| ), | |||
| metadata, | |||
| ) | |||
| elif "points" in event_id: | |||
| points = event["value"].to_numpy().reshape((-1, 2)) | |||
| return_type = pa.Array | |||
| if len(frames) == 0: | |||
| continue | |||
| first_image = next(iter(frames.keys())) | |||
| image_id = event["metadata"].get("image_id", first_image) | |||
| with ( | |||
| torch.inference_mode(), | |||
| torch.autocast( | |||
| "cuda", | |||
| dtype=torch.bfloat16, | |||
| ), | |||
| ): | |||
| predictor.set_image(frames[image_id]) | |||
| labels = [i for i in range(len(points))] | |||
| masks, _scores, last_pred = predictor.predict( | |||
| points, | |||
| point_labels=labels, | |||
| multimask_output=False, | |||
| ) | |||
| if len(masks.shape) == 4: | |||