Browse Source

working so100 inference code with qwenvl

remote-reachy2-improvements
haixuanTao 8 months ago
parent
commit
4ef9fe3b68
7 changed files with 842 additions and 71 deletions
  1. +69
    -0
      examples/so100-remote/parse_bbox.py
  2. +59
    -68
      examples/so100-remote/parse_keyboard.py
  3. +152
    -0
      examples/so100-remote/parse_pose.py
  4. +32
    -0
      examples/so100-remote/parse_whisper.py
  5. +141
    -0
      examples/so100-remote/qwenvl.yml
  6. +384
    -0
      examples/so100-remote/so100_inference.urdf
  7. +5
    -3
      examples/so100-remote/test.yml

+ 69
- 0
examples/so100-remote/parse_bbox.py View File

@@ -0,0 +1,69 @@
"""TODO: Add docstring."""

import json
import os

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()

IMAGE_RESIZE_RATIO = float(os.getenv("IMAGE_RESIZE_RATIO", "1.0"))


def extract_bboxes(json_text):
"""Extract bounding boxes from a JSON string with markdown markers and return them as a NumPy array.

Parameters
----------
json_text : str
JSON string containing bounding box data, including ```json markers.

Returns
-------
np.ndarray: NumPy array of bounding boxes.

"""
# Ensure all lines are stripped of whitespace and markers
lines = json_text.strip().splitlines()

# Filter out lines that are markdown markers
clean_lines = [line for line in lines if not line.strip().startswith("```")]

# Join the lines back into a single string
clean_text = "\n".join(clean_lines)
# Parse the cleaned JSON text
try:
data = json.loads(clean_text)

# Extract bounding boxes
bboxes = [item["bbox_2d"] for item in data]
labels = [item["label"] for item in data]

return np.array(bboxes), np.array(labels)
except Exception as _e: # noqa
pass
return None, None


for event in node:
if event["type"] == "INPUT":
if len(event["value"]) == 0:
node.send_output("bbox_track", pa.array([]))
continue

text = event["value"][0].as_py()
metadata = event["metadata"]
image_id = event["metadata"]["image_id"]

bboxes, labels = extract_bboxes(text)
if bboxes is not None and len(bboxes) > 0:
bboxes = bboxes * int(1 / IMAGE_RESIZE_RATIO)
metadata["image_id"] = image_id
metadata["encoding"] = "xyxy"
node.send_output(
"bbox",
pa.array(bboxes.ravel()),
metadata,
)

+ 59
- 68
examples/so100-remote/parse_keyboard.py View File

@@ -11,94 +11,85 @@ target_y = -0.02
target_x = 0.00

place_x = -0.02
place_y = -0.1
top_z = -0.50
place_y = 0.2
place_z = -0.48
top_z = -0.44
low_z = -0.57

roll = 1.86
pitch = 1.43
yaw_closed = 0.8
yaw_opened = -0.5

now = time.time()
time.sleep(1.5)

node.send_output(
"action",
pa.array([target_x, target_y, top_z, roll, pitch, yaw_closed]),
metadata={"encoding": "xyzrpy"},
)
yaw_open = 0.8
yaw_close = -0.5

time.sleep(0.8)

node.send_output(
"action",
pa.array([target_x, target_y, top_z, roll, pitch, yaw_closed]),
metadata={"encoding": "xyzrpy"},
)
def grab(target_x, target_y, low_z, top_z, roll, pitch, yaw_open, yaw_close):
node.send_output(
"action",
pa.array([target_x, target_y, top_z, roll, pitch, yaw_open]),
metadata={"encoding": "xyzrpy"},
)

time.sleep(0.5)
time.sleep(0.8)

node.send_output(
"action",
pa.array([target_x, target_y, low_z, roll, pitch, yaw_closed]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(0.2)
node.send_output(
"action",
pa.array([target_x, target_y, low_z, roll, pitch, yaw_open]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(0.2)


node.send_output(
"action",
pa.array([target_x, target_y, low_z, roll, pitch, yaw_opened]),
metadata={"encoding": "xyzrpy"},
)
node.send_output(
"action",
pa.array([target_x, target_y, low_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

time.sleep(1.0)

time.sleep(1.0)
node.send_output(
"action",
pa.array([target_x, target_y, top_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

node.send_output(
"action",
pa.array([target_x, target_y, top_z, roll, pitch, yaw_opened]),
metadata={"encoding": "xyzrpy"},
)

time.sleep(0.3)
def place(place_x, place_y, place_z, top_z, roll, pitch, yaw_open, yaw_close):

node.send_output(
"action",
pa.array([place_x, place_y, top_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

node.send_output(
"action",
pa.array([place_x, place_y, top_z, roll, pitch, yaw_opened]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(1.0)

time.sleep(1.0)
node.send_output(
"action",
pa.array([place_x, place_y, place_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

node.send_output(
"action",
pa.array([place_x, place_y, low_z, roll, pitch, yaw_opened]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(1.0)

time.sleep(0.2)
node.send_output(
"action",
pa.array([place_x, place_y, place_z, roll, pitch, yaw_open]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(0.5)

node.send_output(
"action",
pa.array([place_x, place_y, low_z, roll, pitch, yaw_closed]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(1.0)
node.send_output(
"action",
pa.array([place_x, place_y, place_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

node.send_output(
"action",
pa.array([place_x, place_y, top_z, roll, pitch, yaw_opened]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(0.5)

time.sleep(1.0)
node.send_output(
"action",
pa.array([place_x, place_y, top_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

node.send_output(
"action",
pa.array([place_x, place_y, top_z, roll, pitch, yaw_opened]),
metadata={"encoding": "xyzrpy"},
)

+ 152
- 0
examples/so100-remote/parse_pose.py View File

@@ -0,0 +1,152 @@
"""TODO: Add docstring."""

import time
import numpy as np
import pyarrow as pa
from dora import Node

node = Node()
top_z = -0.48
low_z = -0.57

roll = 1.86
pitch = 1.43
yaw_open = 0.8
yaw_close = -0.5


def grab(target_x, target_y, low_z, top_z, roll, pitch, yaw_open, yaw_close, last_x, last_y):

node.send_output(
"action",
pa.array([target_x, target_y, top_z, roll, pitch, yaw_open]),
metadata={"encoding": "xyzrpy"},
)

time.sleep(0.6)

node.send_output(
"action",
pa.array([target_x, target_y, low_z, roll, pitch, yaw_open]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(0.5)


node.send_output(
"action",
pa.array([target_x, target_y, low_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

time.sleep(0.5)

node.send_output(
"action",
pa.array([target_x, target_y, top_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

node.send_output(
"action",
pa.array([0.05, 0.04, top_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

def place(place_x, place_y, place_z, top_z, roll, pitch, yaw_open, yaw_close, last_x, last_y):


node.send_output(
"action",
pa.array([place_x, place_y, top_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

time.sleep(0.6)

node.send_output(
"action",
pa.array([place_x, place_y, place_z, roll, pitch, yaw_close]),
metadata={"encoding": "xyzrpy"},
)

time.sleep(0.5)


node.send_output(
"action",
pa.array([place_x, place_y, top_z, roll, pitch, yaw_open]),
metadata={"encoding": "xyzrpy"},
)
time.sleep(0.7)


node.send_output(
"action",
pa.array([0.05, 0.04, top_z, roll, pitch, yaw_open]),
metadata={"encoding": "xyzrpy"},
)


node.send_output(
"action",
pa.array([0.05, 0.04, top_z, roll, pitch, yaw_open]),
metadata={"encoding": "xyzrpy"},
)

last_x = 0
last_y = 0
last_z = 0

for event in node:
if event["type"] == "INPUT":
if event["id"] == "pose":
values = event["value"]
values = values.to_numpy()
print(values)
if len(values) == 0:
continue
x = values[0]
y = values[1]
z = values[2]
action = event["metadata"]["action"]

# Adjust z with the size of the gripper
z = z + 0.073
# y = y - 0.01
x = x - 0.01
match action:
case "grab":
grab(
x,
y,
z,
top_z,
roll,
pitch,
yaw_open,
yaw_close,
last_x,
last_y
)
case "release":
y = y - 0.02
place(
x,
y,
z,
top_z,
roll,
pitch,
yaw_open,
yaw_close,
last_x,
last_y
)
last_x = -0.05
last_y = 0.04
last_z = z

+ 32
- 0
examples/so100-remote/parse_whisper.py View File

@@ -0,0 +1,32 @@
"""TODO: Add docstring."""

import json
import os
import time

import numpy as np
import pyarrow as pa
from dora import Node

node = Node()


last_prompt = ""
for event in node:
if event["type"] == "INPUT":
if event["id"] == "text":
text = event["value"][0].as_py().lower()

if "grab " in text:
text = f"Given the prompt: {text}. Output the bounding boxes for the given object"
node.send_output(
"text", pa.array([text]), {"image_id": "image", "action": "grab"}
)

elif "put " in text:
text = f"Given the prompt: {text}. Output the bounding boxes for the place to put the object"
node.send_output(
"text",
pa.array([text]),
{"image_id": "image", "action": "release"},
)

+ 141
- 0
examples/so100-remote/qwenvl.yml View File

@@ -0,0 +1,141 @@
nodes:
- id: so100
path: dora-rustypot
inputs:
tick: dora/timer/millis/33
pose:
source: pytorch-kinematics/action
queue_size: 100
outputs:
- pose
env:
PORT: /dev/ttyACM0
TORQUE: true
IDS: 1 2 3 4 5 6

- id: camera
build: pip install -e ../../node-hub/dora-pyrealsense
path: dora-pyrealsense
inputs:
tick: dora/timer/millis/33
outputs:
- image
- depth

- id: pytorch-kinematics
build: pip install -e ../../node-hub/dora-pytorch-kinematics
path: dora-pytorch-kinematics
inputs:
pose: so100/pose
action:
source: parse_pose/action
queue_size: 100
outputs:
- pose
- action
env:
URDF_PATH: so100.urdf
END_EFFECTOR_LINK: "Moving Jaw"
TRANSFORM: -0.18 0.02 -0.65 0.7 0 0 0.7

- id: plot
build: pip install -e ../../node-hub/dora-rerun
path: dora-rerun
inputs:
#series_so100: so100/pose
# series_pose: pytorch-kinematics/pose
jointstate_so100: so100/pose
jointstate_so100_inference: pytorch-kinematics/action
camera/image: camera/image
camera/depth: camera/depth
text_whisper: dora-distil-whisper/text
text_vlm: dora-qwenvl/text
camera/boxes2d: parse_bbox/bbox
camera/masks: sam2/masks
env:
so100_urdf: so100.urdf
so100_inference_urdf: so100_inference.urdf
so100_transform: -0.18 0.02 -0.65 0.7 0 0 0.7
so100_inference_transform: -0.18 0.02 -0.65 0.7 0 0 0.7
CAMERA_PITCH: -3.1415

- id: dora-microphone
build: pip install -e ../../node-hub/dora-microphone
path: dora-microphone
inputs:
tick: dora/timer/millis/2000
outputs:
- audio

- id: parse_whisper
path: parse_whisper.py
inputs:
text: dora-distil-whisper/text
outputs:
- text

- id: dora-qwenvl
build: pip install -e ../../node-hub/dora-qwen2-5-vl
path: dora-qwen2-5-vl
inputs:
image: camera/image
text: parse_whisper/text
outputs:
- text
env:
DEFAULT_QUESTION: Output the bounding box of the suitcase.
IMAGE_RESIZE_RATIO: "1.0"

- id: parse_bbox
path: parse_bbox.py
inputs:
text: dora-qwenvl/text
outputs:
- bbox
env:
IMAGE_RESIZE_RATIO: "1.0"

- id: sam2
build: pip install -e ../../node-hub/dora-sam2
path: dora-sam2
inputs:
image: camera/image
boxes2d: parse_bbox/bbox
outputs:
- masks

- id: box_coordinates
build: pip install -e ../../node-hub/dora-object-to-pose
path: dora-object-to-pose
inputs:
depth: camera/depth
masks: sam2/masks
outputs:
- pose
env:
CAMERA_PITCH: -3.1415

- id: parse_pose
path: parse_pose.py
inputs:
pose: box_coordinates/pose
outputs:
- action

- id: dora-vad
build: pip install -e ../../node-hub/dora-vad
path: dora-vad
inputs:
audio: dora-microphone/audio
outputs:
- audio

- id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper
inputs:
input: dora-vad/audio
outputs:
- text
env:
TARGET_LANGUAGE: english

+ 384
- 0
examples/so100-remote/so100_inference.urdf View File

@@ -0,0 +1,384 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- This URDF was automatically created by SolidWorks to URDF Exporter! Originally created by Stephen Brawner (brawner@gmail.com)
Commit Version: 1.6.0-1-g15f4949 Build Version: 1.6.7594.29634
For more information, please see http://wiki.ros.org/sw_urdf_exporter -->
<robot
name="SO_5DOF_ARM100_8j_URDF.SLDASM">
<link
name="base_link">
<inertial>
<origin
xyz="-2.45960666746703E-07 0.0311418169687909 0.0175746661003382"
rpy="0 0 0" />
<mass
value="0.193184127927598" />
<inertia
ixx="0.000137030709467877"
ixy="2.10136126944992E-08"
ixz="4.24087422551286E-09"
iyy="0.000169089551209259"
iyz="2.26514711036514E-05"
izz="0.000145097720857224" />
</inertial>
<visual>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Base.STL" />
</geometry>
<material
name="">
<color
rgba="0.2 0.819607843137255 0.2 1" />
</material>
</visual>
<collision>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Base.STL" />
</geometry>
</collision>
</link>
<link
name="Rotation_Pitch">
<inertial>
<origin
xyz="-9.07886224712597E-05 0.0590971820568318 0.031089016892169"
rpy="0 0 0" />
<mass
value="0.119226314127197" />
<inertia
ixx="5.90408775624429E-05"
ixy="4.90800532852998E-07"
ixz="-5.90451772654387E-08"
iyy="3.21498601038881E-05"
iyz="-4.58026206663885E-06"
izz="5.86058514263952E-05" />
</inertial>
<visual>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Rotation_Pitch.STL" />
</geometry>
<material
name="">
<color
rgba="0.2 0.819607843137255 0.2 1" />
</material>
</visual>
<collision>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Rotation_Pitch.STL" />
</geometry>
</collision>
</link>
<joint
name="Rotation"
type="revolute">
<origin
xyz="0 -0.0452 0.0165"
rpy="1.5708 0 0" />
<parent
link="base_link" />
<child
link="Rotation_Pitch" />
<axis
xyz="0 -1 0" />
<!-- note for the so100 arm there is no well defined effort/velocity limits at the moment -->
</joint>
<link
name="Upper_Arm">
<inertial>
<origin
xyz="-1.7205170190925E-05 0.0701802156327694 0.00310545118155671"
rpy="0 0 0" />
<mass
value="0.162409284599177" />
<inertia
ixx="0.000167153146617081"
ixy="1.03902689187701E-06"
ixz="-1.20161820645189E-08"
iyy="7.01946992214245E-05"
iyz="2.11884806298698E-06"
izz="0.000213280241160769" />
</inertial>
<visual>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Upper_Arm.STL" />
</geometry>
<material
name="">
<color
rgba="0.2 0.819607843137255 0.2 1" />
</material>
</visual>
<collision>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Upper_Arm.STL" />
</geometry>
</collision>
</link>
<joint
name="Pitch"
type="revolute">
<origin
xyz="0 0.1025 0.0306"
rpy="0 0 0" />
<parent
link="Rotation_Pitch" />
<child
link="Upper_Arm" />
<axis
xyz="1 0 0" />
</joint>
<link
name="Lower_Arm">
<inertial>
<origin
xyz="-0.00339603710186651 0.00137796353960074 0.0768006751156044"
rpy="0 0 0" />
<mass
value="0.147967774582291" />
<inertia
ixx="0.000105333995841409"
ixy="1.73059237226499E-07"
ixz="-1.1720305455211E-05"
iyy="0.000138766654485212"
iyz="1.77429964684103E-06"
izz="5.08741652515214E-05" />
</inertial>
<visual>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Lower_Arm.STL" />
</geometry>
<material
name="">
<color
rgba="0.2 0.819607843137255 0.2 1" />
</material>
</visual>
<collision>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Lower_Arm.STL" />
</geometry>
</collision>
</link>
<joint
name="Elbow"
type="revolute">
<origin
xyz="0 0.11257 0.028"
rpy="-0.1 0 0" />
<parent
link="Upper_Arm" />
<child
link="Lower_Arm" />
<axis
xyz="1 0 0" />
</joint>
<link
name="Wrist_Pitch_Roll">
<inertial>
<origin
xyz="-0.00852653127372418 -0.0352278997897927 -2.34622481569413E-05"
rpy="0 0 0" />
<mass
value="0.066132067097723" />
<inertia
ixx="1.95717492443445E-05"
ixy="-6.62714374412293E-07"
ixz="5.20089016442066E-09"
iyy="2.38028417569933E-05"
iyz="4.09549055863776E-08"
izz="3.4540143384536E-05" />
</inertial>
<visual>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Wrist_Pitch_Roll.STL" />
</geometry>
<material
name="">
<color
rgba="0.2 0.819607843137255 0.2 1" />
</material>
</visual>
<collision>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Wrist_Pitch_Roll.STL" />
</geometry>
</collision>
</link>
<joint
name="Wrist_Pitch"
type="revolute">
<origin
xyz="0 0.0052 0.1349"
rpy="-1.5708 0 0" />
<parent
link="Lower_Arm" />
<child
link="Wrist_Pitch_Roll" />
<axis
xyz="1 0 0" />
<limit
lower="-1.8"
upper="1.8"
effort="0"
velocity="0" />
</joint>
<link
name="Fixed_Jaw">
<inertial>
<origin
xyz="0.00552376906426563 -0.0280167153359021 0.000483582592841092"
rpy="0 0 0" />
<mass
value="0.0929859131176897" />
<inertia
ixx="4.3328249304211E-05"
ixy="7.09654328670947E-06"
ixz="5.99838530879484E-07"
iyy="3.04451747368212E-05"
iyz="-1.58743247545413E-07"
izz="5.02460913506734E-05" />
</inertial>
<visual>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Fixed_Jaw.STL" />
</geometry>
<material
name="">
<color
rgba="0.2 0.819607843137255 0.2 1" />
</material>
</visual>
<!-- <collision>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Fixed_Jaw.STL" />
</geometry>
</collision> -->
<collision>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Wrist_Roll_Follower_SO101.STL" />
</geometry>
</collision>
</link>
<joint
name="Wrist_Roll"
type="revolute">
<origin
xyz="0 -0.0601 0"
rpy="0 1.5708 0" />
<parent
link="Wrist_Pitch_Roll" />
<child
link="Fixed_Jaw" />
<axis
xyz="0 1 0" />
</joint>
<link
name="Moving Jaw">
<inertial>
<origin
xyz="-0.00161744605468241 -0.0303472584046471 0.000449645961853651"
rpy="0 0 0" />
<mass
value="0.0202443794940372" />
<inertia
ixx="1.10911325081525E-05"
ixy="-5.35076503033314E-07"
ixz="-9.46105662101403E-09"
iyy="3.03576451001973E-06"
iyz="-1.71146075110632E-07"
izz="8.9916083370498E-06" />
</inertial>
<visual>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Moving Jaw.STL" />
</geometry>
<material
name="">
<color
rgba="0.2 0.819607843137255 0.2 1" />
</material>
</visual>
<collision>
<origin
xyz="0 0 0"
rpy="0 0 0" />
<geometry>
<mesh
filename="Moving_Jaw_SO101.STL" />
</geometry>
</collision>
</link>
<joint
name="Jaw"
type="revolute">
<origin
xyz="-0.0202 -0.0244 0"
rpy="3.1416 0 3.33" />
<parent
link="Fixed_Jaw" />
<child
link="Moving Jaw" />
<axis
xyz="0 0 1" />
</joint>
</robot>

+ 5
- 3
examples/so100-remote/test.yml View File

@@ -43,13 +43,15 @@ nodes:
build: pip install -e ../../node-hub/dora-rerun
path: dora-rerun
inputs:
#series_so100: so100/pose
# series_pose: pytorch-kinematics/pose
series_so100: pytorch-kinematics/action
jointstate_so100: pytorch-kinematics/action
series_so100_inference: pytorch-kinematics/action
jointstate_so100: so100/pose
jointstate_so100_inference: pytorch-kinematics/action
camera/image: camera/image
camera/depth: camera/depth
env:
so100_urdf: so100.urdf
so100_inference_urdf: so100_inference.urdf
so100_transform: -0.18 0.02 -0.65 0.7 0 0 0.7
so100_inference_transform: -0.18 0.02 -0.65 0.7 0 0 0.7
CAMERA_PITCH: -3.1415

Loading…
Cancel
Save