
Add vggt based URDF visualisation (#1044)

tags/v0.3.12-fix
Haixuan Xavier Tao · 6 months ago
commit dfc24300f8
5 changed files with 276 additions and 13 deletions
  1. examples/urdf/vggt/franka.yml (+70, -0)
  2. examples/urdf/vggt/kuka.yml (+68, -0)
  3. examples/urdf/vggt/so_arm101.yml (+69, -0)
  4. examples/urdf/vggt/z1.yml (+59, -0)
  5. node-hub/dora-vggt/dora_vggt/main.py (+10, -13)

examples/urdf/vggt/franka.yml (+70, -0)

@@ -0,0 +1,70 @@
nodes:
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      jointstate_panda: pytorch_kinematics/cmd_vel
      camera/image: dora-vggt/image
      camera/depth: dora-vggt/depth
    env:
      panda_urdf: "panda_description"
      panda_transform: .5 -0. -0.1 1. 0. 0. 0.
      CAMERA_PITCH: 1.5708

  - id: gamepad
    build: pip install -e ../../node-hub/gamepad
    path: gamepad
    outputs:
      - cmd_vel
      - raw_control
    inputs:
      tick: dora/timer/millis/10
    env:
      MAX_LINEAR_SPEED: 0.01
      MAX_ANGULAR_SPEED: 0.05

  - id: pytorch_kinematics
    build: pip install -e ../../node-hub/dora-pytorch-kinematics
    path: dora-pytorch-kinematics
    inputs:
      cmd_vel: gamepad/cmd_vel
    outputs:
      - cmd_vel
    env:
      MODEL_NAME: "panda_description"
      END_EFFECTOR_LINK: "panda_link8"
      TRANSFORM: .5 -0. -0.1 1. 0. 0. 0.
      POSITION_TOLERANCE: 0.001
      ROTATION_TOLERANCE: 0.001

  - id: camera
    build: pip install -e ../../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 4

  - id: camera2
    build: pip install -e ../../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 6

  - id: dora-vggt
    build: pip install -e ../../../node-hub/dora-vggt
    path: dora-vggt
    inputs:
      image: camera/image
      image2: camera2/image
    outputs:
      - depth
      - image
    env:
      SCALE_FACTOR: 0.9

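The `panda_transform` and `TRANSFORM` entries above pack a static base pose into seven numbers. A minimal sketch of how such a pose could be unpacked, assuming the order is translation x y z followed by a w-x-y-z quaternion (the exact convention used by dora-rerun / dora-pytorch-kinematics is not shown in this diff):

# Hypothetical helper: split a 7-number pose string like ".5 -0. -0.1 1. 0. 0. 0."
# into a translation vector and a quaternion, assuming [x y z qw qx qy qz] order.
import numpy as np

def parse_transform(text: str):
    values = [float(v) for v in text.split()]
    assert len(values) == 7, "expected 3 translation + 4 quaternion components"
    translation = np.array(values[:3])   # x, y, z (assumed to be in metres)
    quaternion = np.array(values[3:])    # qw, qx, qy, qz (assumed order)
    return translation, quaternion

t, q = parse_transform(".5 -0. -0.1 1. 0. 0. 0.")
print(t, q)  # [ 0.5 -0.  -0.1] [1. 0. 0. 0.] -> identity rotation, offset base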
examples/urdf/vggt/kuka.yml (+68, -0)

@@ -0,0 +1,68 @@
nodes:
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      jointstate_iiwa14_primitive_collision: pytorch_kinematics/cmd_vel
      camera/image: dora-vggt/image
      camera/depth: dora-vggt/depth
    env:
      iiwa14_primitive_collision_urdf: "iiwa14_description"
      iiwa14_primitive_collision_transform: .5 -0. -0.1 1. 0. 0. 0.
      CAMERA_PITCH: 1.5708

  - id: gamepad
    build: pip install -e ../../node-hub/gamepad
    path: gamepad
    outputs:
      - cmd_vel
      - raw_control
    inputs:
      tick: dora/timer/millis/10
    env:
      MAX_LINEAR_SPEED: 0.02
      MAX_ANGULAR_SPEED: 0.10

  - id: pytorch_kinematics
    build: pip install -e ../../node-hub/dora-pytorch-kinematics
    path: dora-pytorch-kinematics
    inputs:
      cmd_vel: gamepad/cmd_vel
    outputs:
      - cmd_vel
    env:
      MODEL_NAME: "iiwa14_description"
      END_EFFECTOR_LINK: "iiwa_link_7"
      TRANSFORM: .5 -0. -0.1 1. 0. 0. 0.

  - id: camera
    build: pip install -e ../../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 4

  - id: camera2
    build: pip install -e ../../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 6

  - id: dora-vggt
    build: pip install -e ../../../node-hub/dora-vggt
    path: dora-vggt
    inputs:
      image: camera/image
      image2: camera2/image
    outputs:
      - depth
      - image
    env:
      SCALE_FACTOR: 0.9

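As in the Franka example, dora-rerun consumes `dora-vggt/image` and `dora-vggt/depth` directly. A minimal sketch of a custom node consuming the same outputs; the `width` metadata field is visible in the main.py diff at the end of this commit, while `height` is an assumption here:

# Sketch of a consumer node for the dora-vggt outputs wired above.
# Assumes depth/image metadata carries "height" in addition to "width".
import numpy as np
from dora import Node

def main():
    node = Node()
    for event in node:
        if event["type"] != "INPUT":
            continue
        meta = event["metadata"]
        if "depth" in event["id"]:
            depth = event["value"].to_numpy().reshape(meta["height"], meta["width"])
            # depth is scaled by SCALE_FACTOR upstream, roughly metric
            print(f"{event['id']}: median depth {np.median(depth[depth > 0]):.3f}")
        elif "image" in event["id"]:
            img = event["value"].to_numpy().astype(np.uint8)
            img = img.reshape(meta["height"], meta["width"], 3)  # rgb8 encoding
            print(f"{event['id']}: rgb frame {img.shape}")

if __name__ == "__main__":
    main()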
examples/urdf/vggt/so_arm101.yml (+69, -0)

@@ -0,0 +1,69 @@
nodes:
  - id: plot
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      jointstate_so101_new_calib: pytorch_kinematics/cmd_vel
      camera/image: dora-vggt/image
      camera/depth: dora-vggt/depth
    env:
      so101_new_calib_urdf: "so_arm101_description"
      so101_new_calib_transform: .14 -0. 0.4 -.5 .5 .5 -.5

  - id: gamepad
    build: pip install -e ../../node-hub/gamepad
    path: gamepad
    outputs:
      - cmd_vel
      - raw_control
    inputs:
      tick: dora/timer/millis/10
    env:
      MAX_LINEAR_SPEED: 0.01
      MAX_ANGULAR_SPEED: 0.05

  - id: pytorch_kinematics
    build: pip install -e ../../node-hub/dora-pytorch-kinematics
    path: dora-pytorch-kinematics
    inputs:
      cmd_vel: gamepad/cmd_vel
    outputs:
      - cmd_vel
    env:
      MODEL_NAME: "so_arm101_description"
      END_EFFECTOR_LINK: "gripper"
      TRANSFORM: .14 -0. 0.4 -.5 .5 .5 -.5
      POSITION_TOLERANCE: 0.01
      ROTATION_TOLERANCE: 0.03

  - id: camera
    build: pip install -e ../../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 4

  - id: camera2
    build: pip install -e ../../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 6

  - id: dora-vggt
    build: pip install -e ../../../node-hub/dora-vggt
    path: dora-vggt
    inputs:
      image: camera/image
      image2: camera2/image
    outputs:
      - depth
      - image
    env:
      SCALE_FACTOR: 0.9

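These dual-camera examples hard-code `CAPTURE_PATH: 4` and `CAPTURE_PATH: 6`, which are host-specific device indices. A small sketch for checking which indices deliver frames on your machine before editing the dataflow (plain OpenCV, nothing dora-specific):

# Probe candidate camera indices so CAPTURE_PATH can be adjusted per machine.
import cv2

for index in range(10):
    cap = cv2.VideoCapture(index)
    ok, _frame = cap.read() if cap.isOpened() else (False, None)
    cap.release()
    if ok:
        print(f"index {index}: delivers frames")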
examples/urdf/vggt/z1.yml (+59, -0)

@@ -0,0 +1,59 @@
nodes:
  - id: plot
    build: pip install -e ../../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      jointstate_z1: pytorch_kinematics/cmd_vel
      camera/image: dora-vggt/image
      camera/depth: dora-vggt/depth
    env:
      z1_urdf: z1_description
      z1_transform: .5 -0.2 -0.11 1. 0. 0. 0.
      CAMERA_PITCH: 1.5708

  - id: gamepad
    build: pip install -e ../../../node-hub/gamepad
    path: gamepad
    outputs:
      - cmd_vel
      - raw_control
    inputs:
      tick: dora/timer/millis/10
    env:
      MAX_LINEAR_SPEED: 0.01
      MAX_ANGULAR_SPEED: 0.05

  - id: pytorch_kinematics
    build: pip install -e ../../../node-hub/dora-pytorch-kinematics
    path: dora-pytorch-kinematics
    inputs:
      cmd_vel: gamepad/cmd_vel
    outputs:
      - cmd_vel
    env:
      MODEL_NAME: "z1_description"
      END_EFFECTOR_LINK: "link06"
      TRANSFORM: .5 -0.2 -0.11 1. 0. 0. 0.
      POSITION_TOLERANCE: 0.001
      ROTATION_TOLERANCE: 0.001

  - id: camera
    build: pip install -e ../../../node-hub/opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 4

  - id: dora-vggt
    build: pip install -e ../../../node-hub/dora-vggt
    path: dora-vggt
    inputs:
      image: camera/image
    outputs:
      - depth
      - image
    env:
      SCALE_FACTOR: 0.88

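`SCALE_FACTOR` (0.88 here, 0.9 in the other examples) multiplies the predicted depth map in main.py, so it acts as a per-setup metric-scale correction. One plausible way to pick it, assuming you can measure the true distance to a known point in the scene, is to compare that measurement against the raw VGGT depth at the same pixel:

# Hypothetical calibration: derive SCALE_FACTOR from one known distance.
# measured_distance_m is tape-measured in the scene (assumption);
# predicted_depth_m is the raw VGGT depth at that pixel before scaling.
measured_distance_m = 0.72
predicted_depth_m = 0.80
scale_factor = measured_distance_m / predicted_depth_m
print(f"SCALE_FACTOR: {scale_factor:.2f}")  # ~0.90, in line with the values above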
node-hub/dora-vggt/dora_vggt/main.py (+10, -13)

@@ -1,8 +1,8 @@
"""TODO: Add docstring.""" """TODO: Add docstring."""

import io import io
import os import os
from collections import deque as Deque
from collections import deque



import cv2 import cv2
import numpy as np import numpy as np
@@ -14,6 +14,8 @@ from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images from vggt.utils.load_fn import load_and_preprocess_images
from vggt.utils.pose_enc import pose_encoding_to_extri_intri from vggt.utils.pose_enc import pose_encoding_to_extri_intri


SCALE_FACTOR = float(os.getenv("SCALE_FACTOR", "1"))
VGGT_NUM_IMAGES = int(os.getenv("VGGT_NUM_IMAGES", "2"))
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+) # bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+)


dtype = torch.bfloat16 dtype = torch.bfloat16
@@ -33,7 +35,7 @@ DEPTH_ENCODING = os.environ.get("DEPTH_ENCODING", "float64")
def main(): def main():
"""TODO: Add docstring.""" """TODO: Add docstring."""
node = Node() node = Node()
raw_images = Deque(maxlen=2)
raw_images = deque(maxlen=VGGT_NUM_IMAGES)


for event in node: for event in node:
if event["type"] == "INPUT": if event["type"] == "INPUT":
@@ -92,7 +94,7 @@ def main():
pose_enc = model.camera_head(aggregated_tokens_list)[-1] pose_enc = model.camera_head(aggregated_tokens_list)[-1]
# Extrinsic and intrinsic matrices, following OpenCV convention (camera from world) # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
extrinsic, intrinsic = pose_encoding_to_extri_intri( extrinsic, intrinsic = pose_encoding_to_extri_intri(
pose_enc, images.shape[-2:]
pose_enc, images.shape[-2:],
) )
intrinsic = intrinsic[-1][-1] intrinsic = intrinsic[-1][-1]
f_0 = intrinsic[0, 0] f_0 = intrinsic[0, 0]
@@ -102,20 +104,19 @@ def main():


# Predict Depth Maps # Predict Depth Maps
depth_map, depth_conf = model.depth_head( depth_map, depth_conf = model.depth_head(
aggregated_tokens_list, images, ps_idx
aggregated_tokens_list, images, ps_idx,
) )
print(depth_conf.max())
depth_map[depth_conf < 1.0] = 0.0 # Set low confidence pixels to 0 depth_map[depth_conf < 1.0] = 0.0 # Set low confidence pixels to 0
depth_map = depth_map.to(torch.float64) depth_map = depth_map.to(torch.float64)


depth_map = depth_map[-1][-1].cpu().numpy() depth_map = depth_map[-1][-1].cpu().numpy()
depth_map = SCALE_FACTOR * depth_map
# Warning: Make sure to add my_output_id and my_input_id within the dataflow. # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
if DEPTH_ENCODING == "mono16": if DEPTH_ENCODING == "mono16":
depth_map = (depth_map * 1000).astype(np.uint16) depth_map = (depth_map * 1000).astype(np.uint16)


node.send_output( node.send_output(
output_id="depth",
output_id=event["id"].replace("image", "depth"),
data=pa.array(depth_map.ravel()), data=pa.array(depth_map.ravel()),
metadata={ metadata={
"width": depth_map.shape[1], "width": depth_map.shape[1],
@@ -137,13 +138,9 @@ def main():
# reorder pixels to be in last dimension # reorder pixels to be in last dimension
image = image.transpose(1, 2, 0) image = image.transpose(1, 2, 0)


print(
f"Image shape: {image.shape}, dtype: {image.dtype} and depth map shape: {depth_map.shape}, dtype: {depth_map.dtype}"
)

# Warning: Make sure to add my_output_id and my_input_id within the dataflow. # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
node.send_output( node.send_output(
output_id="image",
output_id=event["id"],
data=pa.array(image.ravel()), data=pa.array(image.ravel()),
metadata={ metadata={
"encoding": "rgb8", "encoding": "rgb8",


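The main.py changes replace the fixed output ids with `event["id"]` and `event["id"].replace("image", "depth")`, so each camera input fans out to its own image/depth pair, and the frame buffer becomes `deque(maxlen=VGGT_NUM_IMAGES)`. A stripped-down sketch of just that buffering and routing pattern (standalone, no model; ids match the configs above):

# Minimal illustration of the buffering and output-id routing introduced here.
from collections import deque

VGGT_NUM_IMAGES = 2
raw_images = deque(maxlen=VGGT_NUM_IMAGES)  # oldest frame is dropped automatically

def on_image(event_id, frame):
    raw_images.append(frame)
    if len(raw_images) < VGGT_NUM_IMAGES:
        return None  # not enough views buffered yet for multi-view inference
    # each incoming image event produces an image output and a depth output
    # named after the input id, so several cameras can share one dora-vggt node
    return event_id, event_id.replace("image", "depth")

print(on_image("image", "frame-a"))   # None (buffer still filling)
print(on_image("image2", "frame-b"))  # ('image2', 'depth2')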