From 39b96bd4b71418f86cc7b9cd4738c3f2b1bcd4cc Mon Sep 17 00:00:00 2001 From: haixuanTao Date: Thu, 1 May 2025 21:13:57 +0200 Subject: [PATCH] working so100 inference code with qwenvl --- examples/so100-remote/parse_bbox.py | 69 ++++ examples/so100-remote/parse_keyboard.py | 127 ++++--- examples/so100-remote/parse_pose.py | 152 ++++++++ examples/so100-remote/parse_whisper.py | 32 ++ examples/so100-remote/qwenvl.yml | 141 ++++++++ examples/so100-remote/so100_inference.urdf | 384 +++++++++++++++++++++ examples/so100-remote/test.yml | 8 +- 7 files changed, 842 insertions(+), 71 deletions(-) create mode 100644 examples/so100-remote/parse_bbox.py create mode 100644 examples/so100-remote/parse_pose.py create mode 100644 examples/so100-remote/parse_whisper.py create mode 100644 examples/so100-remote/qwenvl.yml create mode 100644 examples/so100-remote/so100_inference.urdf diff --git a/examples/so100-remote/parse_bbox.py b/examples/so100-remote/parse_bbox.py new file mode 100644 index 00000000..6a6af454 --- /dev/null +++ b/examples/so100-remote/parse_bbox.py @@ -0,0 +1,69 @@ +"""TODO: Add docstring.""" + +import json +import os + +import numpy as np +import pyarrow as pa +from dora import Node + +node = Node() + +IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0")) + + +def extract_bboxes(json_text): + """Extract bounding boxes from a JSON string with markdown markers and return them as a NumPy array. + + Parameters + ---------- + json_text : str + JSON string containing bounding box data, including ```json markers. + + Returns + ------- + np.ndarray: NumPy array of bounding boxes. + + """ + # Ensure all lines are stripped of whitespace and markers + lines = json_text.strip().splitlines() + + # Filter out lines that are markdown markers + clean_lines = [line for line in lines if not line.strip().startswith("```")] + + # Join the lines back into a single string + clean_text = "\n".join(clean_lines) + # Parse the cleaned JSON text + try: + data = json.loads(clean_text) + + # Extract bounding boxes + bboxes = [item["bbox_2d"] for item in data] + labels = [item["label"] for item in data] + + return np.array(bboxes), np.array(labels) + except Exception as _e: # noqa + pass + return None, None + + +for event in node: + if event["type"] == "INPUT": + if len(event["value"]) == 0: + node.send_output("bbox_track", pa.array([])) + continue + + text = event["value"][0].as_py() + metadata = event["metadata"] + image_id = event["metadata"]["image_id"] + + bboxes, labels = extract_bboxes(text) + if bboxes is not None and len(bboxes) > 0: + bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO) + metadata["image_id"] = image_id + metadata["encoding"] = "xyxy" + node.send_output( + "bbox", + pa.array(bboxes.ravel()), + metadata, + ) diff --git a/examples/so100-remote/parse_keyboard.py b/examples/so100-remote/parse_keyboard.py index ed085c42..2e0d84cb 100644 --- a/examples/so100-remote/parse_keyboard.py +++ b/examples/so100-remote/parse_keyboard.py @@ -11,94 +11,85 @@ target_y = -0.02 target_x = 0.00 place_x = -0.02 -place_y = -0.1 - -top_z = -0.50 +place_y = 0.2 +place_z = -0.48 +top_z = -0.44 low_z = -0.57 roll = 1.86 pitch = 1.43 -yaw_closed = 0.8 -yaw_opened = -0.5 - -now = time.time() -time.sleep(1.5) - -node.send_output( - "action", - pa.array([target_x, target_y, top_z, roll, pitch, yaw_closed]), - metadata={"encoding": "xyzrpy"}, -) +yaw_open = 0.8 +yaw_close = -0.5 -time.sleep(0.8) -node.send_output( - "action", - pa.array([target_x, target_y, top_z, roll, pitch, yaw_closed]), - metadata={"encoding": 
"xyzrpy"}, -) +def grab(target_x, target_y, low_z, top_z, roll, pitch, yaw_open, yaw_close): + node.send_output( + "action", + pa.array([target_x, target_y, top_z, roll, pitch, yaw_open]), + metadata={"encoding": "xyzrpy"}, + ) -time.sleep(0.5) + time.sleep(0.8) -node.send_output( - "action", - pa.array([target_x, target_y, low_z, roll, pitch, yaw_closed]), - metadata={"encoding": "xyzrpy"}, -) -time.sleep(0.2) + node.send_output( + "action", + pa.array([target_x, target_y, low_z, roll, pitch, yaw_open]), + metadata={"encoding": "xyzrpy"}, + ) + time.sleep(0.2) -node.send_output( - "action", - pa.array([target_x, target_y, low_z, roll, pitch, yaw_opened]), - metadata={"encoding": "xyzrpy"}, -) + node.send_output( + "action", + pa.array([target_x, target_y, low_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) + time.sleep(1.0) -time.sleep(1.0) + node.send_output( + "action", + pa.array([target_x, target_y, top_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) -node.send_output( - "action", - pa.array([target_x, target_y, top_z, roll, pitch, yaw_opened]), - metadata={"encoding": "xyzrpy"}, -) -time.sleep(0.3) +def place(place_x, place_y, place_z, top_z, roll, pitch, yaw_open, yaw_close): + node.send_output( + "action", + pa.array([place_x, place_y, top_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) -node.send_output( - "action", - pa.array([place_x, place_y, top_z, roll, pitch, yaw_opened]), - metadata={"encoding": "xyzrpy"}, -) + time.sleep(1.0) -time.sleep(1.0) + node.send_output( + "action", + pa.array([place_x, place_y, place_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) -node.send_output( - "action", - pa.array([place_x, place_y, low_z, roll, pitch, yaw_opened]), - metadata={"encoding": "xyzrpy"}, -) + time.sleep(1.0) -time.sleep(0.2) + node.send_output( + "action", + pa.array([place_x, place_y, place_z, roll, pitch, yaw_open]), + metadata={"encoding": "xyzrpy"}, + ) + time.sleep(0.5) -node.send_output( - "action", - pa.array([place_x, place_y, low_z, roll, pitch, yaw_closed]), - metadata={"encoding": "xyzrpy"}, -) -time.sleep(1.0) + node.send_output( + "action", + pa.array([place_x, place_y, place_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) -node.send_output( - "action", - pa.array([place_x, place_y, top_z, roll, pitch, yaw_opened]), - metadata={"encoding": "xyzrpy"}, -) + time.sleep(0.5) -time.sleep(1.0) + node.send_output( + "action", + pa.array([place_x, place_y, top_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) -node.send_output( - "action", - pa.array([place_x, place_y, top_z, roll, pitch, yaw_opened]), - metadata={"encoding": "xyzrpy"}, -) diff --git a/examples/so100-remote/parse_pose.py b/examples/so100-remote/parse_pose.py new file mode 100644 index 00000000..5e3d702f --- /dev/null +++ b/examples/so100-remote/parse_pose.py @@ -0,0 +1,152 @@ +"""TODO: Add docstring.""" + +import time +import numpy as np +import pyarrow as pa +from dora import Node + +node = Node() +top_z = -0.48 +low_z = -0.57 + +roll = 1.86 +pitch = 1.43 +yaw_open = 0.8 +yaw_close = -0.5 + + +def grab(target_x, target_y, low_z, top_z, roll, pitch, yaw_open, yaw_close, last_x, last_y): + + node.send_output( + "action", + pa.array([target_x, target_y, top_z, roll, pitch, yaw_open]), + metadata={"encoding": "xyzrpy"}, + ) + + time.sleep(0.6) + + node.send_output( + "action", + pa.array([target_x, target_y, low_z, roll, pitch, yaw_open]), + metadata={"encoding": "xyzrpy"}, + ) + 
time.sleep(0.5) + + + node.send_output( + "action", + pa.array([target_x, target_y, low_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) + + time.sleep(0.5) + + node.send_output( + "action", + pa.array([target_x, target_y, top_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) + + node.send_output( + "action", + pa.array([0.05, 0.04, top_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) + + +def place(place_x, place_y, place_z, top_z, roll, pitch, yaw_open, yaw_close, last_x, last_y): + + + node.send_output( + "action", + pa.array([place_x, place_y, top_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) + + time.sleep(0.6) + + node.send_output( + "action", + pa.array([place_x, place_y, place_z, roll, pitch, yaw_close]), + metadata={"encoding": "xyzrpy"}, + ) + + time.sleep(0.5) + + + node.send_output( + "action", + pa.array([place_x, place_y, top_z, roll, pitch, yaw_open]), + metadata={"encoding": "xyzrpy"}, + ) + + time.sleep(0.7) + + + node.send_output( + "action", + pa.array([0.05, 0.04, top_z, roll, pitch, yaw_open]), + metadata={"encoding": "xyzrpy"}, + ) + + + +node.send_output( + "action", + pa.array([0.05, 0.04, top_z, roll, pitch, yaw_open]), + metadata={"encoding": "xyzrpy"}, +) + +last_x = 0 +last_y = 0 +last_z = 0 + +for event in node: + if event["type"] == "INPUT": + if event["id"] == "pose": + values = event["value"] + values = values.to_numpy() + print(values) + if len(values) == 0: + continue + x = values[0] + y = values[1] + z = values[2] + action = event["metadata"]["action"] + + + # Adjust z with the size of the gripper + z = z + 0.073 + # y = y - 0.01 + x = x - 0.01 + match action: + case "grab": + grab( + x, + y, + z, + top_z, + roll, + pitch, + yaw_open, + yaw_close, + last_x, + last_y + ) + case "release": + y = y - 0.02 + place( + x, + y, + z, + top_z, + roll, + pitch, + yaw_open, + yaw_close, + last_x, + last_y + ) + last_x = -0.05 + last_y = 0.04 + last_z = z diff --git a/examples/so100-remote/parse_whisper.py b/examples/so100-remote/parse_whisper.py new file mode 100644 index 00000000..a667760c --- /dev/null +++ b/examples/so100-remote/parse_whisper.py @@ -0,0 +1,32 @@ +"""TODO: Add docstring.""" + +import json +import os +import time + +import numpy as np +import pyarrow as pa +from dora import Node + +node = Node() + + +last_prompt = "" +for event in node: + if event["type"] == "INPUT": + if event["id"] == "text": + text = event["value"][0].as_py().lower() + + if "grab " in text: + text = f"Given the prompt: {text}. Output the bounding boxes for the given object" + node.send_output( + "text", pa.array([text]), {"image_id": "image", "action": "grab"} + ) + + elif "put " in text: + text = f"Given the prompt: {text}. 
Output the bounding boxes for the place to put the object" + node.send_output( + "text", + pa.array([text]), + {"image_id": "image", "action": "release"}, + ) diff --git a/examples/so100-remote/qwenvl.yml b/examples/so100-remote/qwenvl.yml new file mode 100644 index 00000000..efa0cd97 --- /dev/null +++ b/examples/so100-remote/qwenvl.yml @@ -0,0 +1,141 @@ +nodes: + - id: so100 + path: dora-rustypot + inputs: + tick: dora/timer/millis/33 + pose: + source: pytorch-kinematics/action + queue_size: 100 + outputs: + - pose + env: + PORT: /dev/ttyACM0 + TORQUE: true + IDS: 1 2 3 4 5 6 + + - id: camera + build: pip install -e ../../node-hub/dora-pyrealsense + path: dora-pyrealsense + inputs: + tick: dora/timer/millis/33 + outputs: + - image + - depth + + - id: pytorch-kinematics + build: pip install -e ../../node-hub/dora-pytorch-kinematics + path: dora-pytorch-kinematics + inputs: + pose: so100/pose + action: + source: parse_pose/action + queue_size: 100 + outputs: + - pose + - action + env: + URDF_PATH: so100.urdf + END_EFFECTOR_LINK: "Moving Jaw" + TRANSFORM: -0.18 0.02 -0.65 0.7 0 0 0.7 + + - id: plot + build: pip install -e ../../node-hub/dora-rerun + path: dora-rerun + inputs: + #series_so100: so100/pose + # series_pose: pytorch-kinematics/pose + jointstate_so100: so100/pose + jointstate_so100_inference: pytorch-kinematics/action + camera/image: camera/image + camera/depth: camera/depth + text_whisper: dora-distil-whisper/text + text_vlm: dora-qwenvl/text + camera/boxes2d: parse_bbox/bbox + camera/masks: sam2/masks + env: + so100_urdf: so100.urdf + so100_inference_urdf: so100_inference.urdf + so100_transform: -0.18 0.02 -0.65 0.7 0 0 0.7 + so100_inference_transform: -0.18 0.02 -0.65 0.7 0 0 0.7 + CAMERA_PITCH: -3.1415 + + - id: dora-microphone + build: pip install -e ../../node-hub/dora-microphone + path: dora-microphone + inputs: + tick: dora/timer/millis/2000 + outputs: + - audio + + - id: parse_whisper + path: parse_whisper.py + inputs: + text: dora-distil-whisper/text + outputs: + - text + + - id: dora-qwenvl + build: pip install -e ../../node-hub/dora-qwen2-5-vl + path: dora-qwen2-5-vl + inputs: + image: camera/image + text: parse_whisper/text + outputs: + - text + env: + DEFAULT_QUESTION: Output the bounding box of the suitcase. 
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: parse_bbox
+    path: parse_bbox.py
+    inputs:
+      text: dora-qwenvl/text
+    outputs:
+      - bbox
+    env:
+      IMAGE_RESIZE_RATIO: "1.0"
+
+  - id: sam2
+    build: pip install -e ../../node-hub/dora-sam2
+    path: dora-sam2
+    inputs:
+      image: camera/image
+      boxes2d: parse_bbox/bbox
+    outputs:
+      - masks
+
+  - id: box_coordinates
+    build: pip install -e ../../node-hub/dora-object-to-pose
+    path: dora-object-to-pose
+    inputs:
+      depth: camera/depth
+      masks: sam2/masks
+    outputs:
+      - pose
+    env:
+      CAMERA_PITCH: -3.1415
+
+  - id: parse_pose
+    path: parse_pose.py
+    inputs:
+      pose: box_coordinates/pose
+    outputs:
+      - action
+
+  - id: dora-vad
+    build: pip install -e ../../node-hub/dora-vad
+    path: dora-vad
+    inputs:
+      audio: dora-microphone/audio
+    outputs:
+      - audio
+
+  - id: dora-distil-whisper
+    build: pip install -e ../../node-hub/dora-distil-whisper
+    path: dora-distil-whisper
+    inputs:
+      input: dora-vad/audio
+    outputs:
+      - text
+    env:
+      TARGET_LANGUAGE: english
diff --git a/examples/so100-remote/so100_inference.urdf b/examples/so100-remote/so100_inference.urdf
new file mode 100644
index 00000000..a72a7ebb
--- /dev/null
+++ b/examples/so100-remote/so100_inference.urdf
@@ -0,0 +1,384 @@
+<!-- so100_inference.urdf: 384 lines of URDF XML (robot, link, joint, and mesh definitions).
+     The XML markup was stripped from this copy of the patch and could not be recovered. -->
\ No newline at end of file
diff --git a/examples/so100-remote/test.yml b/examples/so100-remote/test.yml
index a7bf9740..05cfb161 100644
--- a/examples/so100-remote/test.yml
+++ b/examples/so100-remote/test.yml
@@ -43,13 +43,15 @@ nodes:
     build: pip install -e ../../node-hub/dora-rerun
     path: dora-rerun
     inputs:
-      #series_so100: so100/pose
       # series_pose: pytorch-kinematics/pose
-      series_so100: pytorch-kinematics/action
-      jointstate_so100: pytorch-kinematics/action
+      series_so100_inference: pytorch-kinematics/action
+      jointstate_so100: so100/pose
+      jointstate_so100_inference: pytorch-kinematics/action
       camera/image: camera/image
       camera/depth: camera/depth
     env:
       so100_urdf: so100.urdf
+      so100_inference_urdf: so100_inference.urdf
       so100_transform: -0.18 0.02 -0.65 0.7 0 0 0.7
+      so100_inference_transform: -0.18 0.02 -0.65 0.7 0 0 0.7
       CAMERA_PITCH: -3.1415
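
For reference, every arm command in this example is a six-value pose sent with the "xyzrpy" encoding that dora-pytorch-kinematics consumes. A minimal sketch of that convention, reusing the gripper constants from parse_keyboard.py and assuming the script is declared as its own node in the running dataflow:

import time

import pyarrow as pa
from dora import Node

node = Node()

# Hover above the target with the jaw open, descend, then close the jaw.
# Each message is [x, y, z, roll, pitch, yaw]; in this example the yaw slot
# carries what parse_keyboard.py calls yaw_open (0.8) / yaw_close (-0.5).
hover = [0.00, -0.02, -0.44, 1.86, 1.43, 0.8]
reach = [0.00, -0.02, -0.57, 1.86, 1.43, 0.8]
grip = [0.00, -0.02, -0.57, 1.86, 1.43, -0.5]

for pose in (hover, reach, grip):
    node.send_output("action", pa.array(pose), metadata={"encoding": "xyzrpy"})
    time.sleep(0.8)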