Demo of reachy doing a pick and place exercise. (tags/v0.3.10-rc3)
| @@ -199,6 +199,12 @@ pub fn pydict_to_metadata(dict: Option<Bound<'_, PyDict>>) -> Result<MetadataPar | |||
| { | |||
| let list: Vec<f64> = value.extract()?; | |||
| parameters.insert(key, Parameter::ListFloat(list)) | |||
| } else if value.is_instance_of::<PyList>() | |||
| && value.len()? > 0 | |||
| && value.get_item(0)?.is_exact_instance_of::<PyString>() | |||
| { | |||
| let list: Vec<String> = value.extract()?; | |||
| parameters.insert(key, Parameter::ListString(list)) | |||
| } else { | |||
| println!("could not convert type {value}"); | |||
| parameters.insert(key, Parameter::String(value.str()?.to_string())) | |||
| @@ -233,6 +239,9 @@ pub fn metadata_to_pydict<'a>( | |||
| Parameter::ListFloat(l) => dict | |||
| .set_item(k, l) | |||
| .context("Could not insert metadata into python dictionary")?, | |||
| Parameter::ListString(l) => dict | |||
| .set_item(k, l) | |||
| .context("Could not insert metadata into python dictionary")?, | |||
| } | |||
| } | |||
| @@ -0,0 +1,81 @@ | |||
| import json | |||
| import os | |||
| import numpy as np | |||
| import pyarrow as pa | |||
| from dora import Node | |||
| node = Node() | |||
| IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0")) | |||
def extract_bboxes(json_text) -> tuple:
    """Parse bounding boxes and labels out of a (possibly fenced) JSON string.

    The text is expected to hold a JSON array of objects, each carrying a
    ``bbox_2d`` entry and a ``label`` entry. Lines that start with a markdown
    code fence (three backticks, e.g. a leading ```json line) are dropped
    before parsing.

    Parameters:
        json_text (str): JSON text, with or without markdown fence lines.

    Returns:
        tuple: ``(bboxes, labels)`` as two NumPy arrays with one entry per
        object, or ``(None, None)`` when the input is not a string or cannot
        be parsed into that structure.
    """
    if not isinstance(json_text, str):
        return None, None

    # Drop the markdown fence lines and keep everything else untouched.
    clean_text = "\n".join(
        line
        for line in json_text.strip().splitlines()
        if not line.strip().startswith("```")
    )

    try:
        data = json.loads(clean_text)
        bboxes = [item["bbox_2d"] for item in data]
        labels = [item["label"] for item in data]
        # Array construction stays inside the try: ragged boxes raise
        # ValueError on recent NumPy and must also yield (None, None).
        return np.array(bboxes), np.array(labels)
    except (ValueError, KeyError, TypeError):
        # Invalid JSON (JSONDecodeError is a ValueError), a missing key, or a
        # payload that is not a list of objects: report "no boxes".
        return None, None
# Latest prompt received from the state machine. It stays None until the
# first "prompt" input arrives, so a "text" input that shows up earlier is
# skipped instead of raising a NameError.
prompt = None

for event in node:
    if event["type"] != "INPUT":
        continue

    if event["id"] == "prompt":
        prompt = event["value"][0].as_py()

    elif event["id"] == "text":
        text = event["value"][0].as_py()
        image_id = event["metadata"]["image_id"]

        bboxes, labels = extract_bboxes(text)
        if bboxes is None or len(bboxes) == 0:
            continue
        if prompt is None:
            # Boxes are ordered by where their label occurs in the prompt;
            # without a prompt there is nothing to order them by.
            continue

        # Map the boxes back to the full-resolution image. Dividing by the
        # ratio (instead of multiplying by int(1 / ratio)) keeps non-integer
        # factors such as 0.75 correct and avoids a zero factor for ratios
        # above 1. Integer boxes stay integer so consumers see the same dtype.
        scaled = bboxes / IMAGE_RESIZE_RATIO
        if np.issubdtype(bboxes.dtype, np.integer):
            scaled = np.rint(scaled).astype(bboxes.dtype)
        bboxes = scaled

        idx = []
        order = []
        for label in np.unique(labels):
            if label in prompt:
                # Position of the label inside the prompt, used as sort key.
                order.append(prompt.index(label))
                # First box carrying this label.
                idx.append(np.where(labels == label)[0][0])
        if len(idx) == 0:
            continue

        # Reorder the selected boxes so they follow the prompt's word order.
        idx = np.array(idx)[np.argsort(order)].ravel()
        bboxes = bboxes[idx]

        # Check for duplicated box
        if len(np.unique(bboxes, axis=0)) != len(bboxes):
            print("Duplicated box")
            continue

        node.send_output(
            "bbox",
            pa.array([{"bbox": bboxes.ravel(), "labels": labels[idx]}]),
            metadata={"encoding": "xyxy", "image_id": image_id},
        )
| @@ -0,0 +1,151 @@ | |||
| nodes: | |||
| - id: dora-microphone | |||
| build: pip install -e ../../node-hub/dora-microphone | |||
| path: dora-microphone | |||
| inputs: | |||
| tick: dora/timer/millis/2000 | |||
| outputs: | |||
| - audio | |||
| - id: sam2 | |||
| build: pip install -e ../../node-hub/dora-sam2 | |||
| path: dora-sam2 | |||
| inputs: | |||
| image_depth: reachy-camera/image_depth | |||
| boxes2d: parse_bbox/bbox | |||
| outputs: | |||
| - masks | |||
| - id: dora-vad | |||
| build: pip install -e ../../node-hub/dora-vad | |||
| path: dora-vad | |||
| inputs: | |||
| audio: dora-microphone/audio | |||
| outputs: | |||
| - audio | |||
| - id: dora-distil-whisper | |||
| build: pip install -e ../../node-hub/dora-distil-whisper | |||
| path: dora-distil-whisper | |||
| inputs: | |||
| input: dora-vad/audio | |||
| outputs: | |||
| - text | |||
| env: | |||
| TARGET_LANGUAGE: english | |||
| TRANSLATE: true | |||
| - id: reachy-mobile-base | |||
| build: pip install -e ../../node-hub/dora-reachy2 | |||
| path: dora-reachy2-mobile-base | |||
| inputs: | |||
| action_base: state_machine/action_base | |||
| outputs: | |||
| - response_base | |||
| - id: reachy-left-arm | |||
| build: pip install -e ../../node-hub/dora-reachy2 | |||
| path: dora-reachy2-left-arm | |||
| inputs: | |||
| pose: state_machine/action_l_arm | |||
| outputs: | |||
| - response_l_arm | |||
| - id: reachy-right-arm | |||
| build: pip install -e ../../node-hub/dora-reachy2 | |||
| path: dora-reachy2-right-arm | |||
| inputs: | |||
| pose: state_machine/action_r_arm | |||
| outputs: | |||
| - response_r_arm | |||
| - id: reachy-camera | |||
| build: pip install -e ../../node-hub/dora-reachy2 | |||
| path: dora-reachy2-camera | |||
| inputs: | |||
| tick: dora/timer/millis/50 | |||
| outputs: | |||
| - image_depth | |||
| - depth | |||
| - id: reachy-head | |||
| build: pip install -e ../../node-hub/dora-reachy2 | |||
| path: dora-reachy2-head | |||
| inputs: | |||
| boxes2d: parse_bbox/bbox_face | |||
| look: state_machine/look | |||
| - id: plot | |||
| build: pip install -e ../../node-hub/dora-rerun | |||
| path: dora-rerun | |||
| inputs: | |||
| # camera_left/image_right: reachy-camera/image_right | |||
| camera_torso/image: reachy-camera/image_depth | |||
| text_response: dora-qwenvl/text | |||
| text_whisper: dora-distil-whisper/text | |||
| camera_torso/boxes2d: parse_bbox/bbox | |||
| camera_left/boxes2d_face: parse_bbox/bbox_face | |||
| env: | |||
| RERUN_MEMORY_LIMIT: "5%" | |||
| - id: dora-qwenvl | |||
| build: pip install -e ../../node-hub/dora-qwen2-5-vl | |||
| path: dora-qwen2-5-vl | |||
| inputs: | |||
| image_depth: reachy-camera/image_depth | |||
| # image_left: reachy-camera/image_left | |||
| text_1: dora/timer/millis/600 | |||
| text_2: state_machine/text_vlm | |||
| outputs: | |||
| - text | |||
| env: | |||
| DEFAULT_QUESTION: grab human. | |||
| IMAGE_RESIZE_RATIO: "0.5" | |||
| # ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have | |||
| #SYSTEM_PROMPT: You're a robot. | |||
| - id: parse_bbox | |||
| path: parse_bbox_minimal.py | |||
| inputs: | |||
| text: dora-qwenvl/text | |||
| prompt: state_machine/prompt | |||
| outputs: | |||
| - bbox | |||
| - bbox_face | |||
| env: | |||
| IMAGE_RESIZE_RATIO: "0.5" | |||
| - id: box_coordinates | |||
| build: pip install -e ../../node-hub/dora-object-to-pose | |||
| path: dora-object-to-pose | |||
| inputs: | |||
| depth: reachy-camera/depth | |||
| masks: sam2/masks | |||
| outputs: | |||
| - pose | |||
| - id: keyboard | |||
| build: pip install -e ../../node-hub/dora-keyboard | |||
| path: dora-keyboard | |||
| inputs: | |||
| tick: dora/timer/millis/1000 | |||
| outputs: | |||
| - char | |||
| - id: state_machine | |||
| path: pick_place.py | |||
| inputs: | |||
| text: dora-distil-whisper/text | |||
| response_base: reachy-mobile-base/response_base | |||
| response_r_arm: reachy-right-arm/response_r_arm | |||
| response_l_arm: reachy-left-arm/response_l_arm | |||
| pose: box_coordinates/pose | |||
| outputs: | |||
| - text_vlm | |||
| - action_r_arm | |||
| - action_base | |||
| - look | |||
| - action_l_arm | |||
| - prompt | |||
| env: | |||
| ACTIVATION_WORDS: grab pick give output take catch grabs picks gives output takes catches have put | |||
| @@ -0,0 +1,401 @@ | |||
| # State Machine | |||
| import json | |||
| import os | |||
| import numpy as np | |||
| import pyarrow as pa | |||
| from dora import Node | |||
# Module-level configuration for the pick-and-place state machine.
# IMAGE_RESIZE_RATIO: float read from the environment, defaults to 1.0.
| IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0")) | |||
# Dora node handle used for every send_output / next call below.
| node = Node() | |||
# Whitespace-separated activation words. When the variable is unset the list
# is empty and handle_speech never triggers (it requires len(...) > 0).
| ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "").split() | |||
# Lower bound applied to the grasp target's z value in the main loop.
| TABLE_HEIGHT = float(os.getenv("TABLE_HEIGHT", "-0.41")) | |||
# Arm poses: each list has 8 numbers. The init poses are sent by the main loop
# with metadata encoding "jointstate".
# NOTE(review): presumably seven joint values followed by a gripper value
# (0 = close, 100 = open) - confirm against the arm nodes.
# Left arm pose sent in the IDLE step and after a failed left-arm grasp.
| l_init_pose = [ | |||
| -7.0631310641087435, | |||
| -10.432298603362307, | |||
| 24.429809104404114, | |||
| -132.15000828778648, | |||
| -1.5494749438811133, | |||
| -21.749917789205202, | |||
| 8.099312596108344, | |||
| 100, | |||
| ] | |||
# Right arm pose sent in the IDLE step and after a failed right-arm grasp.
| r_init_pose = [ | |||
| -5.60273587426976, | |||
| 10.780818397272316, | |||
| -27.868146823156042, | |||
| -126.15650363072193, | |||
| 3.961108018106834, | |||
| -35.43682799906162, | |||
| 350.9236448374495, | |||
| 100, | |||
| ] | |||
# The four release poses below are not referenced by the visible script.
# Right arm release pose, last value 0.
| r_release_closed_pose = [ | |||
| -26.1507947940993, | |||
| 12.16735021387949, | |||
| -2.2657319092611976, | |||
| -97.63648867582175, | |||
| -19.91084837404425, | |||
| 22.10184328619011, | |||
| 366.71351223614494, | |||
| 0, | |||
| ] | |||
# Same right arm pose, last value 100.
| r_release_opened_pose = [ | |||
| -26.1507947940993, | |||
| 12.16735021387949, | |||
| -2.2657319092611976, | |||
| -97.63648867582175, | |||
| -19.91084837404425, | |||
| 22.10184328619011, | |||
| 366.71351223614494, | |||
| 100, | |||
| ] | |||
# NOTE(review): the left "opened" pose ends in 0 and the left "closed" pose
# ends in 100, the opposite of the right-arm pair above - the two names look
# swapped; verify before using them.
| l_release_opened_pose = [ | |||
| -30.04330081906935, | |||
| -7.415231584691132, | |||
| 3.6972339048071468, | |||
| -97.7274736257555, | |||
| 12.996718740452982, | |||
| 30.838020649757016, | |||
| -1.5572310505704858, | |||
| 0, | |||
| ] | |||
| l_release_closed_pose = [ | |||
| -30.04330081906935, | |||
| -7.415231584691132, | |||
| 3.6972339048071468, | |||
| -97.7274736257555, | |||
| 12.996718740452982, | |||
| 30.838020649757016, | |||
| -1.5572310505704858, | |||
| 100, | |||
| ] | |||
# While True the main loop keeps returning to its IDLE step; handle_speech
# sets it to False when an activation word is heard.
| stop = True | |||
def extract_bboxes(json_text) -> tuple:
    """Parse bounding boxes and labels out of a (possibly fenced) JSON string.

    The text is expected to hold a JSON array of objects, each carrying a
    ``bbox_2d`` entry and a ``label`` entry. Lines that start with a markdown
    code fence (three backticks, e.g. a leading ```json line) are dropped
    before parsing.

    Parameters:
        json_text (str): JSON text, with or without markdown fence lines.

    Returns:
        tuple: ``(bboxes, labels)`` as two NumPy arrays with one entry per
        object, or ``(None, None)`` when the input is not a string or cannot
        be parsed into that structure.
    """
    if not isinstance(json_text, str):
        return None, None

    # Drop the markdown fence lines and keep everything else untouched.
    clean_text = "\n".join(
        line
        for line in json_text.strip().splitlines()
        if not line.strip().startswith("```")
    )

    try:
        data = json.loads(clean_text)
        bboxes = [item["bbox_2d"] for item in data]
        labels = [item["label"] for item in data]
        # Array construction stays inside the try: ragged boxes raise
        # ValueError on recent NumPy and must also yield (None, None).
        return np.array(bboxes), np.array(labels)
    except (ValueError, KeyError, TypeError):
        # Invalid JSON (JSONDecodeError is a ValueError), a missing key, or a
        # payload that is not a list of objects: report "no boxes".
        return None, None
def handle_speech(last_text):
    """Forward a spoken command to the VLM when it contains an activation word.

    The text is lower-cased and split on whitespace; if any resulting word is
    in ``ACTIVATION_WORDS`` (and that list is not empty), two outputs are
    sent: ``text_vlm`` with a bounding-box request built from the command and
    ``prompt`` with the raw command. The module-level ``stop`` flag is then
    cleared so the main loop leaves its idle step.

    Parameters:
        last_text (str): Transcribed text to inspect and forward.
    """
    global stop

    words = last_text.lower().split()
    if len(ACTIVATION_WORDS) > 0 and any(word in ACTIVATION_WORDS for word in words):
        # Use the text that was actually passed in rather than reaching into
        # the module-level cache, so the function also works for callers that
        # keep their own cache dictionary.
        node.send_output(
            "text_vlm",
            pa.array(
                [
                    f"Given the prompt: {last_text}. Output the two bounding boxes for the two objects"
                ]
            ),
            metadata={"image_id": "image_depth"},
        )
        node.send_output(
            "prompt",
            pa.array([last_text]),
            metadata={"image_id": "image_depth"},
        )
        print(f"sending: {last_text}")
        stop = False
def wait_for_event(id, timeout=None, cache=None):
    """Consume node events until the input named ``id`` arrives.

    Every INPUT event seen along the way is stored in ``cache`` under its
    input id. ``text`` inputs are special: their first element is converted
    to a Python value, stored in ``cache["text"]`` and passed to
    ``handle_speech``. Because that branch is checked first, a ``text`` input
    is never returned by this function, even when ``id == "text"``.

    Parameters:
        id (str): Input id to wait for.
        timeout: Value forwarded to ``node.next(timeout=...)``.
        cache (dict | None): Dictionary updated in place with the latest value
            per input id. A new dictionary is created when omitted (the
            previous ``{}`` default was shared between all calls).

    Returns:
        tuple: ``(value, cache)`` where ``value`` is the event value of the
        awaited input, or ``None`` when ``node.next`` returned ``None`` (in
        which case ``cache["finished"]`` is set to ``True``) or an ERROR event
        was received.
    """
    if cache is None:
        cache = {}

    while True:
        event = node.next(timeout=timeout)
        if event is None:
            cache["finished"] = True
            return None, cache

        if event["type"] == "INPUT":
            cache[event["id"]] = event["value"]
            if event["id"] == "text":
                cache[event["id"]] = event["value"][0].as_py()
                handle_speech(event["value"][0].as_py())
            elif event["id"] == id:
                return event["value"], cache
        elif event["type"] == "ERROR":
            return None, cache
def wait_for_events(ids: list[str], timeout=None, cache=None):
    """Consume node events until every input listed in ``ids`` has arrived.

    Every INPUT event seen along the way is stored in ``cache`` under its
    input id. ``text`` inputs are converted to a Python value, stored in
    ``cache["text"]`` and passed to ``handle_speech``; they are never added to
    the response, even when ``"text"`` is listed in ``ids``.

    Parameters:
        ids (list[str]): Input ids that must all be received.
        timeout: Value forwarded to ``node.next(timeout=...)``.
        cache (dict | None): Dictionary updated in place with the latest value
            per input id. A new dictionary is created when omitted (the
            previous ``{}`` default was shared between all calls).

    Returns:
        tuple: ``(response, cache)`` where ``response`` maps each awaited id
        to its latest event value, or ``None`` when ``node.next`` returned
        ``None`` (in which case ``cache["finished"]`` is set to ``True``) or
        an ERROR event was received.
    """
    if cache is None:
        cache = {}

    response = {}
    while True:
        event = node.next(timeout=timeout)
        if event is None:
            cache["finished"] = True
            return None, cache

        if event["type"] == "INPUT":
            cache[event["id"]] = event["value"]
            if event["id"] == "text":
                cache[event["id"]] = event["value"][0].as_py()
                handle_speech(event["value"][0].as_py())
            elif event["id"] in ids:
                response[event["id"]] = event["value"]
                # Done once one value per requested id has been collected.
                if len(response) == len(ids):
                    return response, cache
        elif event["type"] == "ERROR":
            return None, cache
def get_prompt():
    """Return the most recent spoken text if it contains an activation word.

    ``wait_for_event`` returns a ``(value, cache)`` tuple and never returns
    ``text`` inputs as the value; it stores them in the cache instead. A fresh
    cache is therefore passed in and the text is read back from it, so only
    text heard during this call is considered.

    Returns:
        str | None: The text when one was heard and either ``ACTIVATION_WORDS``
        is empty or the text contains at least one of those words; otherwise
        ``None``.
    """
    _, seen = wait_for_event(id="text", timeout=0.3, cache={})
    text = seen.get("text")
    if text is None:
        return None

    words = text.lower().split()
    if len(ACTIVATION_WORDS) > 0 and all(
        word not in ACTIVATION_WORDS for word in words
    ):
        return None
    return text
# Main state machine: IDLE -> (wait for a spoken command) -> GRABBING ->
# RELEASING, repeated until the cache is marked finished.
# NOTE(review): indentation is lost in this view, so block nesting below is
# inferred from statement order only.
| last_text = "" | |||
# Shared event cache; "text" is pre-seeded with a default command.
| cache = {"text": "Put the orange in the metal box"} | |||
| while True: | |||
| ### === IDLE === | |||
# Send both arms to their initial poses and wait (up to 2 s per event) for
# both arm responses.
| node.send_output( | |||
| "action_r_arm", | |||
| pa.array(r_init_pose), | |||
| metadata={"encoding": "jointstate", "duration": 1}, | |||
| ) | |||
| node.send_output( | |||
| "action_l_arm", | |||
| pa.array(l_init_pose), | |||
| metadata={"encoding": "jointstate", "duration": 1}, | |||
| ) | |||
| _, cache = wait_for_events( | |||
| ids=["response_r_arm", "response_l_arm"], timeout=2, cache=cache | |||
| ) | |||
| # handle_speech(cache["text"]) | |||
| ### === TURNING === | |||
| # Trigger action once text from whisper is received | |||
| # Move left. Overwrite this with your desired movement.. | |||
| # node.send_output("action_base", pa.array([0.0, 0.0, 0.0, 0.0, 0.0, 1.57])) | |||
| # Look straight | |||
| # node.send_output("look", pa.array([0.3, 0, -0.1])) | |||
| # You can add additional actions here | |||
| # ... | |||
| # event = wait_for_event(id="response_base")[0].as_py() | |||
| # if not event: | |||
| ## return to IDLE | |||
| # node.send_output("action_base", pa.array([0.0, 0.0, 0.0, 0.0, 0.0, -1.57])) | |||
| # event = wait_for_event(id="response_base")[0].as_py() | |||
| # if event: | |||
| # continue | |||
| # else: | |||
| # break | |||
| ### === GRABBING === | |||
| # Trigger action once base is done moving | |||
| # node.send_output( | |||
| # "text_vlm", | |||
| # pa.array([f"Given the prompt: {text}. Output bounding box for this action"]), | |||
| # metadata={"image_id": "image_depth"}, | |||
| # ) | |||
| arm_holding_object = None | |||
| # Try pose and until one is successful | |||
# wait_for_event never returns "text" inputs as a value; this call processes
# pending events (running handle_speech on any text) until node.next returns
# None or an ERROR event arrives.
| text, cache = wait_for_event(id="text", timeout=0.3, cache=cache) | |||
# No activation word heard yet: go back to IDLE.
| if stop: | |||
| continue | |||
# Grasp attempts: repeat until one arm reports success.
# NOTE(review): when wait_for_event returns None this loop just retries; if
# the event stream has ended it may never exit - confirm.
| while True: | |||
| values, cache = wait_for_event(id="pose", cache=cache) | |||
| if values is None: | |||
| continue | |||
# Poses arrive flattened, 6 numbers per object; at least two are required
# (row 0 = object to grab, row 1 = destination).
| values = values.to_numpy().reshape((-1, 6)) | |||
| if len(values) < 2: | |||
| continue | |||
| x = values[0][0] | |||
| y = values[0][1] | |||
| z = values[0][2] | |||
| dest_x = values[1][0] | |||
| dest_y = values[1][1] | |||
| dest_z = values[1][2] | |||
# Fixed offsets applied to the grasp and destination x coordinates.
| x = x + 0.01 | |||
| dest_x = dest_x - 0.05 | |||
| print("x: ", x, " y: ", y, " z: ", z) | |||
# Only a lower bound is applied here: z is raised to TABLE_HEIGHT if below it.
| ## Clip the Maximum and minim values for the height of the arm to avoid collision or weird movement. | |||
| z = np.max((z, TABLE_HEIGHT)) | |||
| node.send_output("look", pa.array([x, y, z])) | |||
# Three waypoints of 7 values each: above the object (last value 100), at the
# object (last value 0), back above it (last value 0).
| trajectory = np.array( | |||
| [ | |||
| [x, y, -0.16, 0, 0, 0, 100], | |||
| [x, y, z, 0, 0, 0, 0], | |||
| [x, y, -0.16, 0, 0, 0, 0], | |||
| ] | |||
| ).ravel() | |||
# Negative y is handled by the right arm, everything else by the left arm.
| if y < 0: | |||
| node.send_output( | |||
| "action_r_arm", | |||
| pa.array(trajectory), | |||
| metadata={"encoding": "xyzrpy", "duration": "0.5"}, | |||
| ) | |||
| event, cache = wait_for_event(id="response_r_arm", timeout=5, cache=cache) | |||
| if event is not None and event[0].as_py(): | |||
| print("Success") | |||
| arm_holding_object = "right" | |||
| break | |||
| else: | |||
| print("Failed: x: ", x, " y: ", y, " z: ", z) | |||
| node.send_output( | |||
| "action_r_arm", | |||
| pa.array(r_init_pose), | |||
| metadata={"encoding": "jointstate", "duration": "1.3"}, | |||
| ) | |||
| event, cache = wait_for_event(id="response_r_arm", cache=cache) | |||
| else: | |||
# NOTE(review): trajectory was already built above, so this offset does not
# change the motion sent here; it only affects the later use of y.
| y += 0.03 | |||
| node.send_output( | |||
| "action_l_arm", | |||
| pa.array(trajectory), | |||
| metadata={"encoding": "xyzrpy", "duration": "0.5"}, | |||
| ) | |||
| event, cache = wait_for_event(id="response_l_arm", timeout=5, cache=cache) | |||
| if event is not None and event[0].as_py(): | |||
| print("Success") | |||
| arm_holding_object = "left" | |||
| break | |||
| else: | |||
| print("Failed") | |||
| node.send_output( | |||
| "action_l_arm", | |||
| pa.array(l_init_pose), | |||
| metadata={"encoding": "jointstate", "duration": "1.3"}, | |||
| ) | |||
| event, cache = wait_for_event(id="response_l_arm", cache=cache) | |||
| ### === RELEASING === | |||
| # Trigger action once r_arm is done moving | |||
| # node.send_output("action_base", pa.array([0.0, 0.0, 0.0, 0.0, 0.0, -1.57])) | |||
| # event = wait_for_event(id="response_base")[0].as_py() | |||
| # if not event: | |||
| # print("Failed to move right") | |||
| # Trigger action to release object | |||
# Move the holding arm above the destination; the last value sent is 100.
| if arm_holding_object == "right": | |||
| node.send_output( | |||
| "action_r_arm", | |||
| pa.array( | |||
| [ | |||
| dest_x, | |||
| dest_y, | |||
| -0.16, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 100, | |||
| ], | |||
| ), | |||
| metadata={"encoding": "xyzrpy", "duration": "0.75"}, | |||
| ) | |||
| event, cache = wait_for_event(id="response_r_arm", cache=cache) | |||
| else: | |||
| node.send_output( | |||
| "action_l_arm", | |||
| pa.array( | |||
| [ | |||
| dest_x, | |||
| dest_y, | |||
| -0.16, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 100, | |||
| ] | |||
| ), | |||
| metadata={"encoding": "xyzrpy", "duration": "0.75"}, | |||
| ) | |||
| event, cache = wait_for_event(id="response_l_arm", cache=cache) | |||
# Release failed: send the same arm back to the original object position.
| if event is None or not event[0].as_py(): | |||
| print("Failed to release object") | |||
| if arm_holding_object == "right": | |||
| node.send_output( | |||
| "action_r_arm", | |||
| pa.array( | |||
| [ | |||
| x, | |||
| y, | |||
| z, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 100, | |||
| ], | |||
| ), | |||
| metadata={"encoding": "xyzrpy", "duration": "0.75"}, | |||
| ) | |||
| event, cache = wait_for_event(id="response_r_arm", cache=cache) | |||
| else: | |||
| node.send_output( | |||
| "action_l_arm", | |||
| pa.array( | |||
| [ | |||
| x, | |||
| y, | |||
| z, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 100, | |||
| ] | |||
| ), | |||
| metadata={"encoding": "xyzrpy", "duration": "0.75"}, | |||
| ) | |||
| event, cache = wait_for_event(id="response_l_arm", cache=cache) | |||
# NOTE(review): presumably pairs with the "Failed to release" check above, so
# stop is set again after a successful release - confirm against the
# original indentation.
| else: | |||
| stop = True | |||
# "finished" is set by wait_for_event / wait_for_events whenever node.next
# returned None.
| if cache.get("finished", False): | |||
| break | |||
| # Move object back to initial position | |||
| @@ -63,6 +63,7 @@ pub enum Parameter { | |||
| String(String), | |||
| ListInt(Vec<i64>), | |||
| ListFloat(Vec<f64>), | |||
| ListString(Vec<String>), | |||
| } | |||
| #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] | |||
| @@ -6,7 +6,7 @@ edition = "2021" | |||
| # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | |||
| [dependencies] | |||
| dora-node-api = "0.3.8" | |||
| dora-node-api = { workspace = true } | |||
| eyre = "0.6.8" | |||
| pyo3 = { workspace = true, features = [ | |||
| "extension-module", | |||
| @@ -10,7 +10,7 @@ use dora_node_api::{ | |||
| use eyre::Result; | |||
| use std::collections::HashMap; | |||
| fn points_to_pose(points: &[(f32, f32, f32)]) -> (f32, f32, f32, f32, f32, f32) { | |||
| fn points_to_pose(points: &[(f32, f32, f32)]) -> Vec<f32> { | |||
| let (_x, _y, _z, sum_xy, sum_x2, sum_y2, n, x_min, x_max, y_min, y_max, z_min, z_max) = | |||
| points.iter().fold( | |||
| ( | |||
| @@ -61,7 +61,7 @@ fn points_to_pose(points: &[(f32, f32, f32)]) -> (f32, f32, f32, f32, f32, f32) | |||
| let std_y = (sum_y2 / n - mean_y * mean_y).sqrt(); | |||
| let corr = cov / (std_x * std_y); | |||
| return (mean_x, mean_y, mean_z, 0., 0., corr * f32::consts::PI / 2.); | |||
| return vec![mean_x, mean_y, mean_z, 0., 0., corr * f32::consts::PI / 2.]; | |||
| } | |||
| pub fn lib_main() -> Result<()> { | |||
| @@ -74,7 +74,7 @@ pub fn lib_main() -> Result<()> { | |||
| let mut focal_length = vec![605, 605]; | |||
| let mut resolution = vec![605, 605]; | |||
| let camera_pitch = std::env::var("CAMERA_PITCH") | |||
| .unwrap_or("2.478".to_string()) | |||
| .unwrap_or("2.47".to_string()) | |||
| .parse::<f32>() | |||
| .unwrap(); | |||
| let cos_theta = camera_pitch.cos(); // np.cos(np.deg2rad(180-38)) | |||
| @@ -120,114 +120,147 @@ pub fn lib_main() -> Result<()> { | |||
| depth_frame = Some(buffer.clone()); | |||
| } | |||
| "masks" => { | |||
| if let Some(data) = data.as_primitive_opt::<Float32Type>() { | |||
| let data = data.values(); | |||
| let mut points = vec![]; | |||
| let mut z_total = 0.; | |||
| let mut n = 0.; | |||
| let masks = if let Some(data) = data.as_primitive_opt::<Float32Type>() { | |||
| let data = data | |||
| .iter() | |||
| .map(|x| if let Some(x) = x { x > 0. } else { false }) | |||
| .collect::<Vec<_>>(); | |||
| data | |||
| } else if let Some(data) = data.as_boolean_opt() { | |||
| let data = data | |||
| .iter() | |||
| .map(|x| if let Some(x) = x { x } else { false }) | |||
| .collect::<Vec<_>>(); | |||
| data | |||
| } else { | |||
| println!("Got unexpected data type: {}", data.data_type()); | |||
| continue; | |||
| }; | |||
| if let Some(depth_frame) = &depth_frame { | |||
| depth_frame.iter().enumerate().for_each(|(i, z)| { | |||
| let u = i as f32 % width as f32; // Calculate x-coordinate (u) | |||
| let v = i as f32 / width as f32; // Calculate y-coordinate (v) | |||
| let outputs: Vec<Vec<f32>> = masks | |||
| .chunks(height as usize * width as usize) | |||
| .into_iter() | |||
| .map(|data| { | |||
| let mut points = vec![]; | |||
| let mut z_total = 0.; | |||
| let mut n = 0.; | |||
| if let Some(z) = z { | |||
| let z = z as f32; | |||
| // Skip points that have empty depth or is too far away | |||
| if z == 0. || z > 5.0 { | |||
| return; | |||
| } | |||
| if data[i] > 0. { | |||
| let y = | |||
| (u - resolution[0] as f32) * z / focal_length[0] as f32; | |||
| let x = | |||
| (v - resolution[1] as f32) * z / focal_length[1] as f32; | |||
| let new_x = sin_theta * z + cos_theta * x; | |||
| let new_y = -y; | |||
| let new_z = cos_theta * z - sin_theta * x; | |||
| if let Some(depth_frame) = &depth_frame { | |||
| depth_frame.iter().enumerate().for_each(|(i, z)| { | |||
| let u = i as f32 % width as f32; // Calculate x-coordinate (u) | |||
| let v = i as f32 / width as f32; // Calculate y-coordinate (v) | |||
| if let Some(z) = z { | |||
| let z = z as f32; | |||
| // Skip points that have empty depth or is too far away | |||
| if z == 0. || z > 20.0 { | |||
| return; | |||
| } | |||
| if data[i] { | |||
| let y = (u - resolution[0] as f32) * z | |||
| / focal_length[0] as f32; | |||
| let x = (v - resolution[1] as f32) * z | |||
| / focal_length[1] as f32; | |||
| let new_x = sin_theta * z + cos_theta * x; | |||
| let new_y = -y; | |||
| let new_z = cos_theta * z - sin_theta * x; | |||
| points.push((new_x, new_y, new_z)); | |||
| z_total += new_z; | |||
| n += 1.; | |||
| points.push((new_x, new_y, new_z)); | |||
| z_total += new_z; | |||
| n += 1.; | |||
| } | |||
| } | |||
| } | |||
| }); | |||
| } else { | |||
| println!("No depth frame found"); | |||
| continue; | |||
| } | |||
| if points.is_empty() { | |||
| println!("No points in mask found"); | |||
| continue; | |||
| } | |||
| let (mean_x, mean_y, mean_z, rx, ry, rz) = points_to_pose(&points); | |||
| let mut metadata = metadata.parameters.clone(); | |||
| metadata.insert( | |||
| "encoding".to_string(), | |||
| Parameter::String("xyzrpy".to_string()), | |||
| ); | |||
| }); | |||
| } else { | |||
| println!("No depth frame found"); | |||
| return None; | |||
| } | |||
| if points.is_empty() { | |||
| println!("No points in mask found"); | |||
| return None; | |||
| } | |||
| Some(points_to_pose(&points)) | |||
| }) | |||
| .filter(|x| x.is_some()) | |||
| .map(|x| x.unwrap()) | |||
| .collect(); | |||
| let flatten_data = outputs.into_iter().flatten().collect::<Vec<_>>(); | |||
| let mut metadata = metadata.parameters.clone(); | |||
| metadata.insert( | |||
| "encoding".to_string(), | |||
| Parameter::String("xyzrpy".to_string()), | |||
| ); | |||
| println!("Got data: {:?}", flatten_data); | |||
| node.send_output( | |||
| DataId::from("pose".to_string()), | |||
| metadata, | |||
| vec![mean_x, mean_y, mean_z, rx, ry, rz].into_arrow(), | |||
| )?; | |||
| } | |||
| node.send_output( | |||
| DataId::from("pose".to_string()), | |||
| metadata, | |||
| flatten_data.into_arrow(), | |||
| )?; | |||
| } | |||
| "boxes2d" => { | |||
| if let Some(data) = data.as_primitive_opt::<Int64Type>() { | |||
| let data = data.values(); | |||
| let x_min = data[0] as f32; | |||
| let y_min = data[1] as f32; | |||
| let x_max = data[2] as f32; | |||
| let y_max = data[3] as f32; | |||
| let mut points = vec![]; | |||
| let mut z_min = 100.; | |||
| let mut z_total = 0.; | |||
| let mut n = 0.; | |||
| let values = data.values(); | |||
| let outputs: Vec<Vec<f32>> = values | |||
| .chunks(4) | |||
| .into_iter() | |||
| .map(|data| { | |||
| let x_min = data[0] as f32; | |||
| let y_min = data[1] as f32; | |||
| let x_max = data[2] as f32; | |||
| let y_max = data[3] as f32; | |||
| let mut points = vec![]; | |||
| let mut z_min = 100.; | |||
| let mut z_total = 0.; | |||
| let mut n = 0.; | |||
| if let Some(depth_frame) = &depth_frame { | |||
| depth_frame.iter().enumerate().for_each(|(i, z)| { | |||
| let u = i as f32 % width as f32; // Calculate x-coordinate (u) | |||
| let v = i as f32 / width as f32; // Calculate y-coordinate (v) | |||
| if let Some(depth_frame) = &depth_frame { | |||
| depth_frame.iter().enumerate().for_each(|(i, z)| { | |||
| let u = i as f32 % width as f32; // Calculate x-coordinate (u) | |||
| let v = i as f32 / width as f32; // Calculate y-coordinate (v) | |||
| if let Some(z) = z { | |||
| let z = z as f32; | |||
| // Skip points that have empty depth or is too far away | |||
| if z == 0. || z > 5.0 { | |||
| return; | |||
| } | |||
| if u > x_min && u < x_max && v > y_min && v < y_max { | |||
| let y = | |||
| (u - resolution[0] as f32) * z / focal_length[0] as f32; | |||
| let x = | |||
| (v - resolution[1] as f32) * z / focal_length[1] as f32; | |||
| let new_x = sin_theta * z + cos_theta * x; | |||
| let new_y = -y; | |||
| let new_z = cos_theta * z - sin_theta * x; | |||
| if new_z < z_min { | |||
| z_min = new_z; | |||
| if let Some(z) = z { | |||
| let z = z as f32; | |||
| // Skip points that have empty depth or is too far away | |||
| if z == 0. || z > 5.0 { | |||
| return; | |||
| } | |||
| if u > x_min && u < x_max && v > y_min && v < y_max { | |||
| let y = (u - resolution[0] as f32) * z | |||
| / focal_length[0] as f32; | |||
| let x = (v - resolution[1] as f32) * z | |||
| / focal_length[1] as f32; | |||
| let new_x = sin_theta * z + cos_theta * x; | |||
| let new_y = -y; | |||
| let new_z = cos_theta * z - sin_theta * x; | |||
| if new_z < z_min { | |||
| z_min = new_z; | |||
| } | |||
| points.push((new_x, new_y, new_z)); | |||
| z_total += new_z; | |||
| n += 1.; | |||
| } | |||
| } | |||
| points.push((new_x, new_y, new_z)); | |||
| z_total += new_z; | |||
| n += 1.; | |||
| } | |||
| }); | |||
| } else { | |||
| println!("No depth frame found"); | |||
| return None; | |||
| } | |||
| }); | |||
| } else { | |||
| println!("No depth frame found"); | |||
| continue; | |||
| } | |||
| if points.is_empty() { | |||
| continue; | |||
| } | |||
| let raw_mean_z = z_total / n as f32; | |||
| let threshold = (raw_mean_z + z_min) / 2.; | |||
| let points = points | |||
| .into_iter() | |||
| .filter(|(_x, _y, z)| z > &threshold) | |||
| .collect::<Vec<_>>(); | |||
| let (mean_x, mean_y, mean_z, rx, ry, rz) = points_to_pose(&points); | |||
| if points.is_empty() { | |||
| return None; | |||
| } | |||
| let raw_mean_z = z_total / n as f32; | |||
| let threshold = (raw_mean_z + z_min) / 2.; | |||
| let points = points | |||
| .into_iter() | |||
| .filter(|(_x, _y, z)| z > &threshold) | |||
| .collect::<Vec<_>>(); | |||
| Some(points_to_pose(&points)) | |||
| }) | |||
| .filter(|x| x.is_some()) | |||
| .map(|x| x.unwrap()) | |||
| .collect(); | |||
| let flatten_data = outputs.into_iter().flatten().collect::<Vec<_>>(); | |||
| let mut metadata = metadata.parameters.clone(); | |||
| metadata.insert( | |||
| "encoding".to_string(), | |||
| @@ -237,7 +270,7 @@ pub fn lib_main() -> Result<()> { | |||
| node.send_output( | |||
| DataId::from("pose".to_string()), | |||
| metadata, | |||
| vec![mean_x, mean_y, mean_z, rx, ry, rz].into_arrow(), | |||
| flatten_data.into_arrow(), | |||
| )?; | |||
| } | |||
| } | |||
| @@ -10,7 +10,7 @@ from reachy2_sdk.media.camera import CameraView | |||
| def main(): | |||
| ROBOT_IP = os.getenv("ROBOT_IP", "10.42.0.80") | |||
| for _ in range(5): | |||
| for _ in range(10): | |||
| reachy = ReachySDK(ROBOT_IP) | |||
| try: | |||
| reachy.cameras.teleop.get_frame(view=CameraView.LEFT) | |||
| @@ -78,14 +78,14 @@ def manage_gripper(reachy, gripper, grasp): | |||
| return True | |||
| if gripper == 0.0: | |||
| reachy.l_arm.gripper.close() | |||
| time.sleep(0.5) | |||
| time.sleep(0.3) | |||
| if grasp: | |||
| half_open = reachy.l_arm.gripper.get_current_opening() > 2 | |||
| if not half_open: | |||
| return False | |||
| elif gripper == 100.0: | |||
| reachy.l_arm.gripper.open() | |||
| time.sleep(0.5) | |||
| time.sleep(0.3) | |||
| return True | |||
| @@ -133,7 +133,12 @@ def main(): | |||
| ) | |||
| else: | |||
| for joint, gripper in joint_values: | |||
| reachy.l_arm.goto(joint, duration=duration, wait=wait) | |||
| reachy.l_arm.goto( | |||
| joint, | |||
| duration=duration, | |||
| wait=wait, | |||
| interpolation_mode="linear", | |||
| ) | |||
| response_gripper = manage_gripper(reachy, gripper, grasp) | |||
| if not response_gripper: | |||
| node.send_output( | |||
| @@ -151,7 +156,12 @@ def main(): | |||
| joints = value[:7].tolist() | |||
| gripper = value[7] | |||
| reachy.l_arm.goto(joints, duration=duration, wait=wait) | |||
| reachy.l_arm.goto( | |||
| joints, | |||
| duration=duration, | |||
| wait=wait, | |||
| interpolation_mode="linear", | |||
| ) | |||
| manage_gripper(reachy, gripper, grasp) | |||
| node.send_output("response_l_arm", pa.array([True])) | |||
| @@ -77,14 +77,14 @@ def manage_gripper(reachy, gripper, grasp): | |||
| return True | |||
| if gripper == 0.0: | |||
| reachy.r_arm.gripper.close() | |||
| time.sleep(0.5) | |||
| time.sleep(0.3) | |||
| if grasp: | |||
| half_open = reachy.r_arm.gripper.get_current_opening() > 2 | |||
| if not half_open: | |||
| return False | |||
| elif gripper == 100.0: | |||
| reachy.r_arm.gripper.open() | |||
| time.sleep(0.5) | |||
| time.sleep(0.3) | |||
| return True | |||
| @@ -132,7 +132,12 @@ def main(): | |||
| ) | |||
| else: | |||
| for joint, gripper in joint_values: | |||
| reachy.r_arm.goto(joint, duration=duration, wait=wait) | |||
| reachy.r_arm.goto( | |||
| joint, | |||
| duration=duration, | |||
| wait=wait, | |||
| interpolation_mode="linear", | |||
| ) | |||
| response_gripper = manage_gripper(reachy, gripper, grasp) | |||
| if not response_gripper: | |||
| node.send_output( | |||
| @@ -150,7 +155,12 @@ def main(): | |||
| joints = value[:7].tolist() | |||
| gripper = value[7] | |||
| reachy.r_arm.goto(joints, duration=duration, wait=wait) | |||
| reachy.r_arm.goto( | |||
| joints, | |||
| duration=duration, | |||
| wait=wait, | |||
| interpolation_mode="linear", | |||
| ) | |||
| manage_gripper(reachy, gripper, grasp) | |||
| node.send_output("response_r_arm", pa.array([True])) | |||
| @@ -17,7 +17,7 @@ python = ["pyo3"] | |||
| dora-node-api = { workspace = true, features = ["tracing"] } | |||
| eyre = "0.6.8" | |||
| tokio = { version = "1.24.2", features = ["rt"] } | |||
| rerun = { version = "0.21.0", features = ["web_viewer", "image"] } | |||
| rerun = { version = "0.22.0", features = ["web_viewer", "image"] } | |||
| ndarray = "0.15.6" | |||
| k = "0.32" | |||
| pyo3 = { workspace = true, features = [ | |||
| @@ -10,7 +10,7 @@ requires-python = ">=3.8" | |||
| dependencies = [ | |||
| "maturin>=1.8.2", | |||
| 'rerun_sdk==0.21.0', | |||
| 'rerun_sdk==0.22.0', | |||
| # "rerun-loader-urdf @ git+https://github.com/rerun-io/rerun-loader-python-example-urdf.git", | |||
| ] | |||
| @@ -1,7 +1,9 @@ | |||
| use dora_node_api::{ | |||
| arrow::{ | |||
| array::AsArray, | |||
| datatypes::{Float32Type, Float64Type, Int32Type, Int64Type}, | |||
| datatypes::{ | |||
| DataType, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, | |||
| }, | |||
| }, | |||
| dora_core::config::DataId, | |||
| ArrowData, Metadata, Parameter, | |||
| @@ -29,10 +31,53 @@ pub fn update_boxes2d( | |||
| .as_list_opt::<i32>() | |||
| .context("Could not deserialize bbox as list")? | |||
| .values(); | |||
| let bbox = bbox | |||
| .as_primitive_opt::<Float32Type>() | |||
| .context("Could not get bbox value as list")? | |||
| .values(); | |||
| let bbox = match bbox.data_type() { | |||
| DataType::Float16 => bbox | |||
| .as_primitive_opt::<Float16Type>() | |||
| .context("Failed to deserialize bbox")? | |||
| .values() | |||
| .iter() | |||
| .map(|x| f32::from(*x)) | |||
| .collect(), | |||
| DataType::Float32 => bbox | |||
| .as_primitive_opt::<Float32Type>() | |||
| .context("Failed to deserialize bbox")? | |||
| .values() | |||
| .to_vec(), | |||
| DataType::Float64 => bbox | |||
| .as_primitive_opt::<Float64Type>() | |||
| .context("Failed to deserialize bbox")? | |||
| .values() | |||
| .iter() | |||
| .map(|x| *x as f32) | |||
| .collect(), | |||
| DataType::Int16 => bbox | |||
| .as_primitive_opt::<Int16Type>() | |||
| .context("Failed to deserialize bbox")? | |||
| .values() | |||
| .iter() | |||
| .map(|x| *x as f32) | |||
| .collect(), | |||
| DataType::Int32 => bbox | |||
| .as_primitive_opt::<Int32Type>() | |||
| .context("Failed to deserialize bbox")? | |||
| .values() | |||
| .iter() | |||
| .map(|x| *x as f32) | |||
| .collect(), | |||
| DataType::Int64 => bbox | |||
| .as_primitive_opt::<Int64Type>() | |||
| .context("Failed to deserialize bbox")? | |||
| .values() | |||
| .iter() | |||
| .map(|x| *x as f32) | |||
| .collect(), | |||
| _ => { | |||
| return Err(eyre::eyre!( | |||
| "Could not deserialize bbox as float32, float64, int32 or int64" | |||
| )) | |||
| } | |||
| }; | |||
| if bbox.len() == 0 { | |||
| rec.log(id.as_str(), &rerun::Clear::flat()) | |||
| @@ -53,18 +98,6 @@ pub fn update_boxes2d( | |||
| .context("Could not deserialize labels as string")?; | |||
| let labels: Vec<Text> = labels.iter().map(|x| Text::from(x.unwrap())).collect(); | |||
| // Cast confidence | |||
| let conf_buffer = bbox_struct | |||
| .column_by_name("conf") | |||
| .context("Did not find conf field within bbox struct")?; | |||
| let conf = conf_buffer | |||
| .as_list_opt::<i32>() | |||
| .context("Could not deserialize conf as list")? | |||
| .values(); | |||
| let _conf = conf | |||
| .as_primitive_opt::<Float32Type>() | |||
| .context("Could not deserialize conf as string")?; | |||
| let mut centers = vec![]; | |||
| let mut sizes = vec![]; | |||
| @@ -262,7 +262,6 @@ use pyo3::{ | |||
| #[cfg(feature = "python")] | |||
| #[pyfunction] | |||
| fn py_main(_py: Python) -> eyre::Result<()> { | |||
| pyo3::prepare_freethreaded_python(); | |||
| lib_main() | |||
| } | |||
| @@ -13,6 +13,10 @@ def main(): | |||
| pa.array([]) # initialize pyarrow array | |||
| node = Node() | |||
| frames = {} | |||
| last_pred = None | |||
| labels = None | |||
| return_type = pa.Array | |||
| image_id = None | |||
| for event in node: | |||
| event_type = event["type"] | |||
| @@ -59,33 +63,143 @@ def main(): | |||
| image = Image.fromarray(frame) | |||
| frames[event_id] = image | |||
| # TODO: Fix the tracking code for SAM2. | |||
| continue | |||
| if last_pred is not None: | |||
| with ( | |||
| torch.inference_mode(), | |||
| torch.autocast( | |||
| "cuda", | |||
| dtype=torch.bfloat16, | |||
| ), | |||
| ): | |||
| predictor.set_image(frames[image_id]) | |||
| new_logits = [] | |||
| new_masks = [] | |||
| if len(last_pred.shape) < 3: | |||
| last_pred = np.expand_dims(last_pred, 0) | |||
| for mask in last_pred: | |||
| mask = np.expand_dims(mask, 0) # Make shape: 1x256x256 | |||
| masks, _, new_logit = predictor.predict( | |||
| mask_input=mask, | |||
| multimask_output=False, | |||
| ) | |||
| if len(masks.shape) == 4: | |||
| masks = masks[:, 0, :, :] | |||
| else: | |||
| masks = masks[0, :, :] | |||
| masks = masks > 0 | |||
| new_masks.append(masks) | |||
| new_logits.append(new_logit) | |||
| ## Mask to 3 channel image | |||
| last_pred = np.concatenate(new_logits, axis=0) | |||
| masks = np.concatenate(new_masks, axis=0) | |||
| match return_type: | |||
| case pa.Array: | |||
| node.send_output( | |||
| "masks", | |||
| pa.array(masks.ravel()), | |||
| metadata={ | |||
| "image_id": image_id, | |||
| "width": frames[image_id].width, | |||
| "height": frames[image_id].height, | |||
| }, | |||
| ) | |||
| case pa.StructArray: | |||
| node.send_output( | |||
| "masks", | |||
| pa.array( | |||
| [ | |||
| { | |||
| "masks": masks.ravel(), | |||
| "labels": event["value"]["labels"], | |||
| } | |||
| ] | |||
| ), | |||
| metadata={ | |||
| "image_id": image_id, | |||
| "width": frames[image_id].width, | |||
| "height": frames[image_id].height, | |||
| }, | |||
| ) | |||
| elif "boxes2d" in event_id: | |||
| boxes2d = event["value"].to_numpy() | |||
| if isinstance(event["value"], pa.StructArray): | |||
| boxes2d = event["value"][0].get("bbox").values.to_numpy() | |||
| labels = ( | |||
| event["value"][0] | |||
| .get("labels") | |||
| .values.to_numpy(zero_copy_only=False) | |||
| ) | |||
| return_type = pa.Array | |||
| else: | |||
| boxes2d = event["value"].to_numpy() | |||
| labels = None | |||
| return_type = pa.Array | |||
| metadata = event["metadata"] | |||
| encoding = metadata["encoding"] | |||
| if encoding != "xyxy": | |||
| raise RuntimeError(f"Unsupported boxes2d encoding: {encoding}") | |||
| boxes2d = boxes2d.reshape(-1, 4) | |||
| image_id = metadata["image_id"] | |||
| with torch.inference_mode(), torch.autocast( | |||
| "cuda", | |||
| dtype=torch.bfloat16, | |||
| with ( | |||
| torch.inference_mode(), | |||
| torch.autocast( | |||
| "cuda", | |||
| dtype=torch.bfloat16, | |||
| ), | |||
| ): | |||
| predictor.set_image(frames[image_id]) | |||
| masks, _, _ = predictor.predict(box=boxes2d) | |||
| masks = masks[0] | |||
| ## Mask to 3 channel image | |||
| node.send_output( | |||
| "masks", | |||
| pa.array(masks.ravel()), | |||
| metadata={ | |||
| "image_id": image_id, | |||
| "width": frames[image_id].width, | |||
| "height": frames[image_id].height, | |||
| }, | |||
| masks, _scores, last_pred = predictor.predict( | |||
| box=boxes2d, point_labels=labels, multimask_output=False | |||
| ) | |||
| if len(masks.shape) == 4: | |||
| masks = masks[:, 0, :, :] | |||
| last_pred = last_pred[:, 0, :, :] | |||
| else: | |||
| masks = masks[0, :, :] | |||
| last_pred = last_pred[0, :, :] | |||
| masks = masks > 0 | |||
| ## Mask to 3 channel image | |||
| match return_type: | |||
| case pa.Array: | |||
| node.send_output( | |||
| "masks", | |||
| pa.array(masks.ravel()), | |||
| metadata={ | |||
| "image_id": image_id, | |||
| "width": frames[image_id].width, | |||
| "height": frames[image_id].height, | |||
| }, | |||
| ) | |||
| case pa.StructArray: | |||
| node.send_output( | |||
| "masks", | |||
| pa.array( | |||
| [ | |||
| { | |||
| "masks": masks.ravel(), | |||
| "labels": event["value"]["labels"], | |||
| } | |||
| ] | |||
| ), | |||
| metadata={ | |||
| "image_id": image_id, | |||
| "width": frames[image_id].width, | |||
| "height": frames[image_id].height, | |||
| }, | |||
| ) | |||
| elif event_type == "ERROR": | |||
| print("Event Error:" + event["error"]) | |||