From 5d87bd1beba25a72e6edd501d32e664bf5d979e9 Mon Sep 17 00:00:00 2001 From: haixuanTao Date: Fri, 1 Nov 2024 10:34:18 +0100 Subject: [PATCH] Adding `rdt-1b` node improve pytest of rdt-1b Add main into rdt-1b add small cloud fix for rdt 1b Small rdt-1b main fix Small improvement on rdt 1b Small fixes to dora-rdt-1b main Add piper example Add environment variable for configuration vision and language parameter add python feature flag to dora-rerun Fix play inference fixing replay issue make data dir dependant on the date --- .gitmodules | 3 + examples/piper/README.md | 61 ++++ examples/piper/arms_camera.yml | 74 ++++ examples/piper/arms_only.yml | 33 ++ examples/piper/dummy_inference_2.py | 126 +++++++ examples/piper/play_dummy_inference.yml | 43 ++- examples/piper/post_process_action.py | 24 ++ examples/piper/rdt_1b.yml | 132 +++++++ examples/piper/record.py | 231 +++++++++++++ examples/piper/record.yml | 115 +++++++ node-hub/dora-rdt-1b/README.md | 3 + .../dora_rdt_1b/RoboticsDiffusionTransformer | 1 + node-hub/dora-rdt-1b/dora_rdt_1b/__init__.py | 19 + node-hub/dora-rdt-1b/dora_rdt_1b/main.py | 324 ++++++++++++++++++ node-hub/dora-rdt-1b/pyproject.toml | 36 ++ node-hub/dora-rdt-1b/tests/conftest.py | 12 + .../dora-rdt-1b/tests/test_dora_rdt_1b.py | 227 ++++++++++++ 17 files changed, 1459 insertions(+), 5 deletions(-) create mode 100644 .gitmodules create mode 100644 examples/piper/README.md create mode 100644 examples/piper/arms_camera.yml create mode 100644 examples/piper/arms_only.yml create mode 100644 examples/piper/dummy_inference_2.py create mode 100644 examples/piper/post_process_action.py create mode 100644 examples/piper/rdt_1b.yml create mode 100644 examples/piper/record.py create mode 100644 examples/piper/record.yml create mode 100644 node-hub/dora-rdt-1b/README.md create mode 160000 node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer create mode 100644 node-hub/dora-rdt-1b/dora_rdt_1b/__init__.py create mode 100644 
node-hub/dora-rdt-1b/dora_rdt_1b/main.py create mode 100644 node-hub/dora-rdt-1b/pyproject.toml create mode 100644 node-hub/dora-rdt-1b/tests/conftest.py create mode 100644 node-hub/dora-rdt-1b/tests/test_dora_rdt_1b.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..5b3e5af8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer"] + path = node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer + url = https://github.com/thu-ml/RoboticsDiffusionTransformer diff --git a/examples/piper/README.md b/examples/piper/README.md new file mode 100644 index 00000000..57b3a70e --- /dev/null +++ b/examples/piper/README.md @@ -0,0 +1,61 @@ +# Getting Started with Tracer + Piper + +## Installation (To do once) + +Make sure to: + +```bash +dora build rdt_1b.yaml + +# Make sure to install from source pyorbbecksdk + +git clone https://github.com/orbbec/pyorbbecsdk +cd pyorbbecsdk +pip3 install -r requirements.txt +mkdir build +cd build +cmake -Dpybind11_DIR=`pybind11-config --cmakedir` .. +make -j4 +make install +cd .. 
+pip3 install wheel +python3 setup.py bdist_wheel +pip3 install dist/*.whl + +export PYTHONPATH=$PYTHONPATH:$(pwd)/install/lib/ # Make sure to save this in your .bashrc + + +# Install ugv_sdk_py from source +git clone https://github.com/westonrobot/ugv_sdk +cd ugv_sdk +python setup.py build_ext --inplace + +export PYTHONPATH=$PYTHONPATH:$(pwd) # Make sure to save this in your .bashrc +``` + +### Your bashrc should contain something like this + +```bash +export PYTHONPATH=$PYTHONPATH:/home/agilex/pyorbbecsdk/install/lib/:/home/agilex/ugv_sdk +``` + +## Setup ( Every boot of the computer ) + +```bash +# Run on Agilex provided computer +source /home/agilex/cobot_magic/Piper_ros_private-ros-noetic/can_config.sh +``` + +## Run + +### For recording episode + +```bash +dora run record.yml +``` + +## For inference + +```bash +dora run rdt_1b.yml +``` diff --git a/examples/piper/arms_camera.yml b/examples/piper/arms_camera.yml new file mode 100644 index 00000000..3677683c --- /dev/null +++ b/examples/piper/arms_camera.yml @@ -0,0 +1,74 @@ +nodes: + - id: piper_left + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/20 + outputs: + - jointstate + env: + CAN_BUS: can_left + + - id: camera_left + path: /home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/50 + outputs: + - image + env: + DEVICE_INDEX: 0 + ENCODING: jpeg + + - id: camera_center + path: /home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/50 + outputs: + - image + env: + DEVICE_INDEX: 1 + ENCODING: jpeg + + - id: camera_right + path: /home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/50 + outputs: + - image + env: + DEVICE_INDEX: 2 + ENCODING: jpeg + # import opencv as cv + # [cv2.VideoCapture(i) for i in 
range(12)] + + - id: piper_right + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/20 + outputs: + - jointstate + env: + CAN_BUS: can_right + + - id: rerun + path: dora-rerun1 + inputs: + jointstate_piper_left: piper_left/jointstate + jointstate_piper_right: piper_right/jointstate + image_camera_left: camera_left/image + image_camera_center: camera_center/image + image_camera_right: camera_right/image + env: + piper_left_urdf: assets/piper_left.urdf + piper_right_urdf: assets/piper_right.urdf + piper_left_transform: 0 0.2 0 + piper_right_transform: 0 -0.2 0 diff --git a/examples/piper/arms_only.yml b/examples/piper/arms_only.yml new file mode 100644 index 00000000..da5be524 --- /dev/null +++ b/examples/piper/arms_only.yml @@ -0,0 +1,33 @@ +nodes: + - id: piper_left + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/20 + outputs: + - jointstate + env: + CAN_BUS: can_left + + - id: piper_right + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/20 + outputs: + - jointstate + env: + CAN_BUS: can_right + + - id: rerun + path: dora-rerun + inputs: + jointstate_piper_left: piper_left/jointstate + jointstate_piper_right: piper_right/jointstate + env: + piper_left_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_left.urdf + piper_right_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_right.urdf + piper_left_transform: 0 0.3 0 + piper_right_transform: 0 -0.3 0 diff --git a/examples/piper/dummy_inference_2.py b/examples/piper/dummy_inference_2.py new file mode 100644 index 00000000..a90fcfec --- /dev/null +++ b/examples/piper/dummy_inference_2.py @@ -0,0 +1,126 @@ +from dora import Node + + +import numpy as np +import h5py + +f = h5py.File("data/episode_0.hdf5", "r") + +data = f["action"][:] + + +STATE_VEC_IDX_MAPPING = { + 
# [0, 10): right arm joint positions + **{"arm_joint_{}_pos".format(i): i for i in range(10)}, + **{"right_arm_joint_{}_pos".format(i): i for i in range(10)}, + # [10, 15): right gripper joint positions + **{"gripper_joint_{}_pos".format(i): i + 10 for i in range(5)}, + **{"right_gripper_joint_{}_pos".format(i): i + 10 for i in range(5)}, + "gripper_open": 10, # alias of right_gripper_joint_0_pos + "right_gripper_open": 10, + # [15, 25): right arm joint velocities + **{"arm_joint_{}_vel".format(i): i + 15 for i in range(10)}, + **{"right_arm_joint_{}_vel".format(i): i + 15 for i in range(10)}, + # [25, 30): right gripper joint velocities + **{"gripper_joint_{}_vel".format(i): i + 25 for i in range(5)}, + **{"right_gripper_joint_{}_vel".format(i): i + 25 for i in range(5)}, + "gripper_open_vel": 25, # alias of right_gripper_joint_0_vel + "right_gripper_open_vel": 25, + # [30, 33): right end effector positions + "eef_pos_x": 30, + "right_eef_pos_x": 30, + "eef_pos_y": 31, + "right_eef_pos_y": 31, + "eef_pos_z": 32, + "right_eef_pos_z": 32, + # [33, 39): right end effector 6D pose + "eef_angle_0": 33, + "right_eef_angle_0": 33, + "eef_angle_1": 34, + "right_eef_angle_1": 34, + "eef_angle_2": 35, + "right_eef_angle_2": 35, + "eef_angle_3": 36, + "right_eef_angle_3": 36, + "eef_angle_4": 37, + "right_eef_angle_4": 37, + "eef_angle_5": 38, + "right_eef_angle_5": 38, + # [39, 42): right end effector velocities + "eef_vel_x": 39, + "right_eef_vel_x": 39, + "eef_vel_y": 40, + "right_eef_vel_y": 40, + "eef_vel_z": 41, + "right_eef_vel_z": 41, + # [42, 45): right end effector angular velocities + "eef_angular_vel_roll": 42, + "right_eef_angular_vel_roll": 42, + "eef_angular_vel_pitch": 43, + "right_eef_angular_vel_pitch": 43, + "eef_angular_vel_yaw": 44, + "right_eef_angular_vel_yaw": 44, + # [45, 50): reserved + # [50, 60): left arm joint positions + **{"left_arm_joint_{}_pos".format(i): i + 50 for i in range(10)}, + # [60, 65): left gripper joint positions + 
**{"left_gripper_joint_{}_pos".format(i): i + 60 for i in range(5)}, + "left_gripper_open": 60, # alias of left_gripper_joint_0_pos + # [65, 75): left arm joint velocities + **{"left_arm_joint_{}_vel".format(i): i + 65 for i in range(10)}, + # [75, 80): left gripper joint velocities + **{"left_gripper_joint_{}_vel".format(i): i + 75 for i in range(5)}, + "left_gripper_open_vel": 75, # alias of left_gripper_joint_0_vel + # [80, 83): left end effector positions + "left_eef_pos_x": 80, + "left_eef_pos_y": 81, + "left_eef_pos_z": 82, + # [83, 89): left end effector 6D pose + "left_eef_angle_0": 83, + "left_eef_angle_1": 84, + "left_eef_angle_2": 85, + "left_eef_angle_3": 86, + "left_eef_angle_4": 87, + "left_eef_angle_5": 88, + # [89, 92): left end effector velocities + "left_eef_vel_x": 89, + "left_eef_vel_y": 90, + "left_eef_vel_z": 91, + # [92, 95): left end effector angular velocities + "left_eef_angular_vel_roll": 92, + "left_eef_angular_vel_pitch": 93, + "left_eef_angular_vel_yaw": 94, + # [95, 100): reserved + # [100, 102): base linear velocities + "base_vel_x": 100, + "base_vel_y": 101, + # [102, 103): base angular velocities + "base_angular_vel": 102, + # [103, 128): reserved +} + +import time +import pyarrow as pa + +node = Node() +LEFT_UNI_STATE_INDICES = [ + STATE_VEC_IDX_MAPPING[f"left_arm_joint_{i}_pos"] for i in range(6) +] + [STATE_VEC_IDX_MAPPING["left_gripper_open"]] +RIGHT_UNI_STATE_INDICES = [ + STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(6) +] + [STATE_VEC_IDX_MAPPING["right_gripper_open"]] +MOBILE_BASE_UNI_STATE_INDICES = [STATE_VEC_IDX_MAPPING["base_vel_x"]] + [ + STATE_VEC_IDX_MAPPING["base_angular_vel"] +] + +for joint in data: + node.send_output( + "jointstate_left", pa.array(joint[LEFT_UNI_STATE_INDICES], type=pa.float32()) + ) + node.send_output( + "jointstate_right", pa.array(joint[RIGHT_UNI_STATE_INDICES], type=pa.float32()) + ) + node.send_output( + "mobile_base", pa.array(joint[MOBILE_BASE_UNI_STATE_INDICES], 
type=pa.float32()) + ) + time.sleep(0.05) diff --git a/examples/piper/play_dummy_inference.yml b/examples/piper/play_dummy_inference.yml index d821837c..0a48380a 100644 --- a/examples/piper/play_dummy_inference.yml +++ b/examples/piper/play_dummy_inference.yml @@ -1,13 +1,12 @@ nodes: - id: piper - path: dummy_inference.py + path: dummy_inference_2.py inputs: tick: dora/timer/millis/20 outputs: - jointstate_left - jointstate_right - env: - CAN_BUS: can_left + - mobile_base - id: rerun build: | @@ -24,10 +23,44 @@ nodes: pip install git+https://github.com/rerun-io/rerun-loader-python-example-urdf.git path: dora-rerun inputs: - jointstate_piper_left: piper/jointstate_left - jointstate_piper_right: piper/jointstate_right + jointstate_piper_left: piper_left/jointstate + jointstate_piper_right: piper_right/jointstate + jointstate_piper_left_pred: piper/jointstate_left + jointstate_piper_right_pred: piper/jointstate_right + series_piper_left: piper_left/jointstate + series_piper_right: piper_right/jointstate + series_piper_left_pred: piper/jointstate_left + series_piper_right_pred: piper/jointstate_right env: piper_left_urdf: piper_left.urdf # Make sure to download meshes from https://github.com/agilexrobotics/Piper_ros/tree/4f22c61f96b8fb3ef3f937b99b63edb697caadf0/src/piper_description/meshes and put them in the assets folder piper_right_urdf: piper_right.urdf # Make sure to download meshes from https://github.com/agilexrobotics/Piper_ros/tree/4f22c61f96b8fb3ef3f937b99b63edb697caadf0/src/piper_description/meshes and put them in the assets folder piper_left_transform: 0 0.2 0 piper_right_transform: 0 -0.2 0 + piper_left_pred_urdf: assets/piper_left_pred.urdf + piper_right_pred_urdf: assets/piper_right_pred.urdf + piper_left_pred_transform: 0 0.2 0 + piper_right_pred_transform: 0 -0.2 0 + + - id: piper_left + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/500 + action: piper/jointstate_left + 
outputs: + - jointstate + env: + CAN_BUS: can_left + + - id: piper_right + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/1000 + action: piper/jointstate_right + outputs: + - jointstate + env: + CAN_BUS: can_right diff --git a/examples/piper/post_process_action.py b/examples/piper/post_process_action.py new file mode 100644 index 00000000..0f1259b7 --- /dev/null +++ b/examples/piper/post_process_action.py @@ -0,0 +1,24 @@ +from dora import Node + +node = Node() + +import numpy as np + + +import time +import pyarrow as pa + +for event in node: + if event["type"] == "INPUT": + actions = event["value"].to_numpy().reshape((64, 14)) + + # Skip action to only keep 8 spread action + actions = actions[[0, 8, 16, 24, 32, 40, 48, 56], :] + + for action in actions: + node.send_output("jointstate_left", pa.array(action[:7], type=pa.float32())) + node.send_output( + "jointstate_right", pa.array(action[7:], type=pa.float32()) + ) + time.sleep(0.005) + print(actions) diff --git a/examples/piper/rdt_1b.yml b/examples/piper/rdt_1b.yml new file mode 100644 index 00000000..bcf7ad7b --- /dev/null +++ b/examples/piper/rdt_1b.yml @@ -0,0 +1,132 @@ +nodes: + - id: piper_left + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/500 + action: post_process_rdt_1b/jointstate_left + outputs: + - jointstate + env: + CAN_BUS: can_left + + - id: piper_right + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/1000 + action: post_process_rdt_1b/jointstate_right + outputs: + - jointstate + env: + CAN_BUS: can_right + + - id: camera_left + path: /home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/1000 + outputs: + - image + env: + DEVICE_INDEX: 0 + ENCODING: jpeg + + - id: camera_center + path: 
/home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/1000 + outputs: + - image + env: + DEVICE_INDEX: 1 + ENCODING: jpeg + + - id: camera_right + path: /home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/1000 + outputs: + - image + env: + DEVICE_INDEX: 2 + ENCODING: jpeg + # import opencv as cv + # [cv2.VideoCapture(i) for i in range(12)] + + - id: rerun + path: dora-rerun + inputs: + jointstate_piper_left: piper_left/jointstate + jointstate_piper_right: piper_right/jointstate + jointstate_piper_left_pred: post_process_rdt_1b/jointstate_left + jointstate_piper_right_pred: post_process_rdt_1b/jointstate_right + series_piper_left: piper_left/jointstate + series_piper_right: piper_right/jointstate + series_piper_left_pred: post_process_rdt_1b/jointstate_left + series_piper_right_pred: post_process_rdt_1b/jointstate_right + image_left: camera_left/image + image_center: camera_center/image + image_right: camera_right/image + env: + piper_left_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_left.urdf + piper_right_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_right.urdf + piper_left_transform: 0 0.2 0 + piper_right_transform: 0 -0.2 0 + piper_left_pred_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_left_pred.urdf + piper_right_pred_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_right_pred.urdf + piper_left_pred_transform: 0 0.2 0 + piper_right_pred_transform: 0 -0.2 0 + + - id: rdt_1b + path: dora-rdt_1b + inputs: + jointstate_left: + source: piper_left/jointstate + queue_size: 1 + jointstate_right: + source: piper_right/jointstate + queue_size: 1 + image_left: + source: camera_left/image + queue_size: 1 + image_center: + source: camera_center/image + queue_size: 1 + image_right: + source: camera_right/image + queue_size: 1 + tick: + source: 
dora/timer/secs/1 + queue_size: 1 + outputs: + - action + env: + ROBOTIC_MODEL_NAME_OR_PATH: /home/peter/Documents/work/dora/examples/piper/checkpoints/checkpoint-450 + VISION_MODEL_NAME_OR_PATH: /home/peter/.cache/huggingface/hub/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3 + LANGUAGE_EMBEDDING_PATH: lang_embed.pt + + - id: post_process_rdt_1b + path: post_process_action.py + inputs: + action: rdt_1b/action + outputs: + - jointstate_left + - jointstate_right + + - id: mobile_base + path: /home/agilex/1ms.ai/ugv_sdk/tracer_node.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/40 + # action: dummy/mobile_base + outputs: + - velocity diff --git a/examples/piper/record.py b/examples/piper/record.py new file mode 100644 index 00000000..a4fdb57a --- /dev/null +++ b/examples/piper/record.py @@ -0,0 +1,231 @@ +import h5py + +import os +import datetime + +from dora import Node +import numpy as np + +STATE_VEC_IDX_MAPPING = { + # [0, 10): right arm joint positions + **{"arm_joint_{}_pos".format(i): i for i in range(10)}, + **{"right_arm_joint_{}_pos".format(i): i for i in range(10)}, + # [10, 15): right gripper joint positions + **{"gripper_joint_{}_pos".format(i): i + 10 for i in range(5)}, + **{"right_gripper_joint_{}_pos".format(i): i + 10 for i in range(5)}, + "gripper_open": 10, # alias of right_gripper_joint_0_pos + "right_gripper_open": 10, + # [15, 25): right arm joint velocities + **{"arm_joint_{}_vel".format(i): i + 15 for i in range(10)}, + **{"right_arm_joint_{}_vel".format(i): i + 15 for i in range(10)}, + # [25, 30): right gripper joint velocities + **{"gripper_joint_{}_vel".format(i): i + 25 for i in range(5)}, + **{"right_gripper_joint_{}_vel".format(i): i + 25 for i in range(5)}, + "gripper_open_vel": 25, # alias of right_gripper_joint_0_vel + "right_gripper_open_vel": 25, + # [30, 33): right end effector positions + "eef_pos_x": 30, + "right_eef_pos_x": 30, + "eef_pos_y": 31, + 
"right_eef_pos_y": 31, + "eef_pos_z": 32, + "right_eef_pos_z": 32, + # [33, 39): right end effector 6D pose + "eef_angle_0": 33, + "right_eef_angle_0": 33, + "eef_angle_1": 34, + "right_eef_angle_1": 34, + "eef_angle_2": 35, + "right_eef_angle_2": 35, + "eef_angle_3": 36, + "right_eef_angle_3": 36, + "eef_angle_4": 37, + "right_eef_angle_4": 37, + "eef_angle_5": 38, + "right_eef_angle_5": 38, + # [39, 42): right end effector velocities + "eef_vel_x": 39, + "right_eef_vel_x": 39, + "eef_vel_y": 40, + "right_eef_vel_y": 40, + "eef_vel_z": 41, + "right_eef_vel_z": 41, + # [42, 45): right end effector angular velocities + "eef_angular_vel_roll": 42, + "right_eef_angular_vel_roll": 42, + "eef_angular_vel_pitch": 43, + "right_eef_angular_vel_pitch": 43, + "eef_angular_vel_yaw": 44, + "right_eef_angular_vel_yaw": 44, + # [45, 50): reserved + # [50, 60): left arm joint positions + **{"left_arm_joint_{}_pos".format(i): i + 50 for i in range(10)}, + # [60, 65): left gripper joint positions + **{"left_gripper_joint_{}_pos".format(i): i + 60 for i in range(5)}, + "left_gripper_open": 60, # alias of left_gripper_joint_0_pos + # [65, 75): left arm joint velocities + **{"left_arm_joint_{}_vel".format(i): i + 65 for i in range(10)}, + # [75, 80): left gripper joint velocities + **{"left_gripper_joint_{}_vel".format(i): i + 75 for i in range(5)}, + "left_gripper_open_vel": 75, # alias of left_gripper_joint_0_vel + # [80, 83): left end effector positions + "left_eef_pos_x": 80, + "left_eef_pos_y": 81, + "left_eef_pos_z": 82, + # [83, 89): left end effector 6D pose + "left_eef_angle_0": 83, + "left_eef_angle_1": 84, + "left_eef_angle_2": 85, + "left_eef_angle_3": 86, + "left_eef_angle_4": 87, + "left_eef_angle_5": 88, + # [89, 92): left end effector velocities + "left_eef_vel_x": 89, + "left_eef_vel_y": 90, + "left_eef_vel_z": 91, + # [92, 95): left end effector angular velocities + "left_eef_angular_vel_roll": 92, + "left_eef_angular_vel_pitch": 93, + "left_eef_angular_vel_yaw": 94, 
+ # [95, 100): reserved + # [100, 102): base linear velocities + "base_vel_x": 100, + "base_vel_y": 101, + # [102, 103): base angular velocities + "base_angular_vel": 102, + # [103, 128): reserved +} +STATE_VEC_LEN = 128 + + +now = datetime.datetime.now() + +DATA_DIR = now.strftime("%Y.%m.%d") +os.makedirs(DATA_DIR, exist_ok=True) + +## Make data dir if it does not exist +if not os.path.exists(DATA_DIR): + os.makedirs(DATA_DIR) + + +def save_data(data_dict, dataset_path, data_size): + with h5py.File(dataset_path + ".hdf5", "w", rdcc_nbytes=1024**2 * 2) as root: + root.attrs["sim"] = False + root.attrs["compress"] = False + + obs = root.create_group("observations") + variable_length = h5py.vlen_dtype(np.dtype("uint8")) + image = obs.create_group("images") + _ = image.create_dataset( + "cam_high", + (data_size,), + dtype=variable_length, + ) + _ = image.create_dataset( + "cam_left_wrist", + (data_size,), + dtype=variable_length, + ) + _ = image.create_dataset( + "cam_right_wrist", + (data_size,), + dtype=variable_length, + ) + + _ = obs.create_dataset("qpos", (data_size, 128)) + _ = root.create_dataset("action", (data_size, 128)) + + # data_dict write into h5py.File + for name, array in data_dict.items(): + print(name) + if "images" in name: + image[name][...] = array + else: + root[name][...] 
= array + + +data_dict = { + "/observations/qpos": [], + "/observations/images/cam_high": [], + "/observations/images/cam_left_wrist": [], + "/observations/images/cam_right_wrist": [], + "/action": [], +} + + +node = Node() + +LEAD_CAMERA = "/observations/images/cam_high" + +tmp_dict = {} + +i = 0 + +start = False +for event in node: + if event["type"] == "INPUT": + if "save" in event["id"]: + char = event["value"][0].as_py() + if char == "p": + if start == False: + continue + + save_data( + data_dict, + f"{DATA_DIR}/episode_{i}", + len(data_dict["/observations/qpos"]), + ) + + # Reset dict + data_dict = { + "/observations/qpos": [], + "/observations/images/cam_high": [], + "/observations/images/cam_left_wrist": [], + "/observations/images/cam_right_wrist": [], + "/action": [], + } + i += 1 + start = False + elif char == "s": + start = True + + elif "image" in event["id"]: + tmp_dict[event["id"]] = event["value"].to_numpy() + elif "qpos" in event["id"]: + tmp_dict[event["id"]] = event["value"].to_numpy() + elif "base_vel" in event["id"]: + tmp_dict[event["id"]] = event["value"].to_numpy() + + # Check if tmp dict is full + if len(tmp_dict) != 6: + continue + elif event["id"] == LEAD_CAMERA and start == True: + values = np.concatenate( + [ + tmp_dict["/observations/qpos_left"], + tmp_dict["/observations/qpos_right"], + tmp_dict["/observations/base_vel"], + ] + ) + UNI_STATE_INDICES = ( + [STATE_VEC_IDX_MAPPING[f"left_arm_joint_{i}_pos"] for i in range(6)] + + [STATE_VEC_IDX_MAPPING["left_gripper_open"]] + + [STATE_VEC_IDX_MAPPING[f"right_arm_joint_{i}_pos"] for i in range(6)] + + [STATE_VEC_IDX_MAPPING["right_gripper_open"]] + + [STATE_VEC_IDX_MAPPING["base_vel_x"]] + + [STATE_VEC_IDX_MAPPING["base_angular_vel"]] + ) + universal_vec = np.zeros(STATE_VEC_LEN) + universal_vec[UNI_STATE_INDICES] = values + data_dict["/observations/qpos"].append(universal_vec) + # We reproduce obs and action + data_dict["/action"].append(universal_vec) + 
data_dict["/observations/images/cam_high"].append( + tmp_dict["/observations/images/cam_high"] + ) + data_dict["/observations/images/cam_left_wrist"].append( + tmp_dict["/observations/images/cam_left_wrist"] + ) + data_dict["/observations/images/cam_right_wrist"].append( + tmp_dict["/observations/images/cam_right_wrist"] + ) diff --git a/examples/piper/record.yml b/examples/piper/record.yml new file mode 100644 index 00000000..ef3bf37d --- /dev/null +++ b/examples/piper/record.yml @@ -0,0 +1,115 @@ +nodes: + - id: piper_left + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/40 + outputs: + - jointstate + env: + CAN_BUS: can_left + TEACH_MODE: True + + - id: piper_right + path: /home/agilex/1ms.ai/piper_sdk/dora_piper.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/40 + outputs: + - jointstate + env: + CAN_BUS: can_right + TEACH_MODE: True + + - id: mobile_base + path: /home/agilex/1ms.ai/ugv_sdk/tracer_node.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/40 + outputs: + - velocity + + - id: camera_left + path: /home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/40 + outputs: + - image + env: + DEVICE_INDEX: 0 + ENCODING: jpeg + + - id: camera_center + path: /home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/40 + outputs: + - image + env: + DEVICE_INDEX: 1 + ENCODING: jpeg + + - id: camera_right + path: /home/agilex/1ms.ai/pyorbbecsdk/examples/color_viewer.py + _unstable_deploy: + machine: piper + inputs: + tick: dora/timer/millis/40 + outputs: + - image + env: + DEVICE_INDEX: 2 + ENCODING: jpeg + # import opencv as cv + # [cv2.VideoCapture(i) for i in range(12)] + + - id: rerun + path: dora-rerun + inputs: + jointstate_piper_left: piper_left/jointstate + jointstate_piper_right: 
piper_right/jointstate + series_base_vel: mobile_base/velocity + image_left: camera_left/image + image_center: camera_center/image + image_right: camera_right/image + env: + piper_left_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_left.urdf + piper_right_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_right.urdf + piper_left_transform: 0 0.2 0 + piper_right_transform: 0 -0.2 0 + piper_left_pred_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_left_pred.urdf + piper_right_pred_urdf: /home/peter/Documents/work/dora/examples/piper/assets/piper_right_pred.urdf + piper_left_pred_transform: 0 0.2 0 + piper_right_pred_transform: 0 -0.2 0 + + - id: keyboard + build: pip install dora-keyboard + path: dora-keyboard + inputs: + tick: dora/timer/millis/1000 + outputs: + - char + + - id: recorder + path: record.py + inputs: + /observations/qpos_left: + source: piper_left/jointstate + /observations/qpos_right: + source: piper_right/jointstate + /observations/base_vel: + source: mobile_base/velocity + /observations/images/cam_left_wrist: + source: camera_left/image + /observations/images/cam_high: + source: camera_center/image + /observations/images/cam_right_wrist: + source: camera_right/image + save: keyboard/char diff --git a/node-hub/dora-rdt-1b/README.md b/node-hub/dora-rdt-1b/README.md new file mode 100644 index 00000000..6f2da50f --- /dev/null +++ b/node-hub/dora-rdt-1b/README.md @@ -0,0 +1,3 @@ +# Dora RDT-1B node + +Experimental node for using a RDT-1B VLA model. 
diff --git a/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer b/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer new file mode 160000 index 00000000..b2889e65 --- /dev/null +++ b/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer @@ -0,0 +1 @@ +Subproject commit b2889e65cfe62571ced3ce88f00e7d80b41fee69 diff --git a/node-hub/dora-rdt-1b/dora_rdt_1b/__init__.py b/node-hub/dora-rdt-1b/dora_rdt_1b/__init__.py new file mode 100644 index 00000000..ed4e2191 --- /dev/null +++ b/node-hub/dora-rdt-1b/dora_rdt_1b/__init__.py @@ -0,0 +1,19 @@ +import os +import sys +from pathlib import Path + +# Define the path to the README file relative to the package directory +readme_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "README.md") + +# Read the content of the README file +try: + with open(readme_path, "r", encoding="utf-8") as f: + __doc__ = f.read() +except FileNotFoundError: + __doc__ = "README file not found." + + +# Set up the import hook + +submodule_path = Path(__file__).resolve().parent / "RoboticsDiffusionTransformer" +sys.path.insert(0, str(submodule_path)) diff --git a/node-hub/dora-rdt-1b/dora_rdt_1b/main.py b/node-hub/dora-rdt-1b/dora_rdt_1b/main.py new file mode 100644 index 00000000..f8ca7485 --- /dev/null +++ b/node-hub/dora-rdt-1b/dora_rdt_1b/main.py @@ -0,0 +1,324 @@ +# install dependencies as shown in the README here https://github.com/alik-git/RoboticsDiffusionTransformer?tab=readme-ov-file#installation +import yaml +import torch +import numpy as np +from PIL import Image +from torchvision import transforms + +from dora_rdt_1b.RoboticsDiffusionTransformer.configs.state_vec import ( + STATE_VEC_IDX_MAPPING, +) +from dora_rdt_1b.RoboticsDiffusionTransformer.models.multimodal_encoder.siglip_encoder import ( + SiglipVisionTower, +) +from dora_rdt_1b.RoboticsDiffusionTransformer.models.rdt_runner import RDTRunner +from dora_rdt_1b.RoboticsDiffusionTransformer.configs.state_vec import ( + STATE_VEC_IDX_MAPPING, 
+) +from dora import Node +import cv2 +import pyarrow as pa +import os +from pathlib import Path + +VISION_DEFAULT_PATH = "robotics-diffusion-transformer/rdt-1b" +ROBOTIC_MODEL_NAME_OR_PATH = os.getenv( + "ROBOTIC_MODEL_NAME_OR_PATH", VISION_DEFAULT_PATH +) +LANGUAGE_EMBEDDING_PATH = os.getenv("LANGUAGE_EMBEDDING", "lang_embed.pt") + +VISION_DEFAULT_PATH = "google/siglip-so400m-patch14-384" +VISION_MODEL_NAME_OR_PATH = os.getenv("VISION_MODEL_NAME_OR_PATH", VISION_DEFAULT_PATH) + + +def get_policy(): + from dora_rdt_1b.RoboticsDiffusionTransformer.models.rdt_runner import RDTRunner + + pretrained_model_name_or_path = ROBOTIC_MODEL_NAME_OR_PATH + rdt = RDTRunner.from_pretrained(pretrained_model_name_or_path) + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + rdt.to(device, dtype=dtype) + rdt.eval() + return rdt + + +def get_vision_model(): + from dora_rdt_1b.RoboticsDiffusionTransformer.models.multimodal_encoder.siglip_encoder import ( + SiglipVisionTower, + ) + + # Load vision encoder + vision_encoder = SiglipVisionTower( + vision_tower=VISION_MODEL_NAME_OR_PATH, + args=None, + ) + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + vision_encoder.to(device, dtype=dtype) + vision_encoder.eval() + image_processor = vision_encoder.image_processor + return vision_encoder, image_processor + + +def get_language_embeddings(): + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + + lang_embeddings = torch.load( + LANGUAGE_EMBEDDING_PATH, + map_location=device, + ) + + return lang_embeddings.unsqueeze( + 0 + ) # Size: (B, L_lang, D) or None, language condition tokens (variable length), dimension D is assumed to be the same as the hidden size. 
+ + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +def process_image(rgbs_lst, image_processor, vision_encoder): + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + + file_path = Path(__file__).parent + + config_path = ( + file_path / "RoboticsDiffusionTransformer/configs/base.yaml" + ) # default config + + with open(config_path, "r") as fp: + config = yaml.safe_load(fp) + + # previous_image_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/img.jpeg" + # # previous_image = None # if t = 0 + # previous_image = Image.fromarray(previous_image_path).convert("RGB") # if t > 0 + + # current_image_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/img.jpeg" + # current_image = Image.fromarray(current_image_path).convert("RGB") + + # here I suppose you only have an image from exterior (e.g., 3rd person view) and you don't have any state information + # the images shoud arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2) + # rgbs_lst = [[previous_image, None, None], [current_image, None, None]] + # if your have an right_wrist_image, then it should be + # rgbs_lst = [ + # [previous_image, previous_right_wrist_image, None], + # [current_image, current_right_wrist_image, None] + # ] + + # image pre-processing + # The background image used for padding + + image_tensor_list = [] + for step in range(config["common"]["img_history_size"]): + rgbs = rgbs_lst[step] + for rgb in rgbs: + assert rgb, "You should not have None image" + image = rgb + + if 
config["dataset"].get("image_aspect_ratio", "pad") == "pad": + background_color = tuple( + int(x * 255) for x in image_processor.image_mean + ) + image = expand2square(image, background_color) + image = image_processor.preprocess(image, return_tensors="pt")[ + "pixel_values" + ][0] + image_tensor_list.append(image) + + image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype) + # encode images + image_embeds = vision_encoder(image_tensor).detach() + return image_embeds.reshape(-1, vision_encoder.hidden_size).unsqueeze(0) + + +def get_states(proprio): + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + + # suppose you control in 7DOF joint position + STATE_INDICES = [ + STATE_VEC_IDX_MAPPING["left_arm_joint_0_pos"], + STATE_VEC_IDX_MAPPING["left_arm_joint_1_pos"], + STATE_VEC_IDX_MAPPING["left_arm_joint_2_pos"], + STATE_VEC_IDX_MAPPING["left_arm_joint_3_pos"], + STATE_VEC_IDX_MAPPING["left_arm_joint_4_pos"], + STATE_VEC_IDX_MAPPING["left_arm_joint_5_pos"], + STATE_VEC_IDX_MAPPING["left_gripper_open"], + STATE_VEC_IDX_MAPPING["right_arm_joint_0_pos"], + STATE_VEC_IDX_MAPPING["right_arm_joint_1_pos"], + STATE_VEC_IDX_MAPPING["right_arm_joint_2_pos"], + STATE_VEC_IDX_MAPPING["right_arm_joint_3_pos"], + STATE_VEC_IDX_MAPPING["right_arm_joint_4_pos"], + STATE_VEC_IDX_MAPPING["right_arm_joint_5_pos"], + STATE_VEC_IDX_MAPPING["right_gripper_open"], + ] + + file_path = Path(__file__).parent + + config_path = ( + file_path / "RoboticsDiffusionTransformer/configs/base.yaml" + ) # default config + with open(config_path, "r") as fp: + config = yaml.safe_load(fp) + + B, N = 1, 1 # batch size and state history size + states = torch.zeros( + (B, N, config["model"]["state_token_dim"]), device=device, dtype=dtype + ) + # suppose you do not have proprio + # it's kind of tricky, I strongly suggest adding proprio as input and futher fine-tuning + proprio = torch.tensor(proprio, device=device, dtype=dtype).reshape( + (1, 1, -1) + ) # B, N = 1, 1 
# batch size and state history size + + # if you have proprio, you can do like this + # format like this: [arm_joint_0_pos, arm_joint_1_pos, arm_joint_2_pos, arm_joint_3_pos, arm_joint_4_pos, arm_joint_5_pos, arm_joint_6_pos, gripper_open] + # proprio = torch.tensor([0, 1, 2, 3, 4, 5, 6, 0.5]).reshape((1, 1, -1)) + states[:, :, STATE_INDICES] = proprio + + state_elem_mask = torch.zeros( + (1, config["model"]["state_token_dim"]), device=device, dtype=torch.bool + ) + + state_elem_mask[:, STATE_INDICES] = True + states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to( + device, dtype=dtype + ) + states = states[:, -1:, :] # only use the last state + return states, state_elem_mask, STATE_INDICES + + +def main(): + + device = torch.device("cuda:0") + rdt = get_policy() + lang_embeddings = get_language_embeddings() + vision_encoder, image_processor = get_vision_model() + + ## for image + # image_embeds = process_image(rgb_lst, image_processor, vision_encoder) + ## for states + # states, state_elem_mask, STATE_INDICES = get_states(states) + node = Node() + frames = {} + joints = {} + with torch.no_grad(): + + for event in node: + event_type = event["type"] + if event_type == "INPUT": + + event_id = event["id"] + + if "image" in event_id: + storage = event["value"] + metadata = event["metadata"] + encoding = metadata["encoding"] + + if encoding == "bgr8": + channels = 3 + storage_type = np.uint8 + elif encoding == "rgb8": + channels = 3 + storage_type = np.uint8 + elif encoding in ["jpeg", "jpg", "jpe", "bmp", "webp", "png"]: + channels = 3 + storage_type = np.uint8 + else: + raise RuntimeError(f"Unsupported image encoding: {encoding}") + + if encoding == "bgr8": + width = metadata["width"] + height = metadata["height"] + frame = ( + storage.to_numpy() + .astype(storage_type) + .reshape((height, width, channels)) + ) + frame = frame[:, :, ::-1] # OpenCV image (BGR to RGB) + elif encoding == "rgb8": + width = metadata["width"] + height = 
metadata["height"] + frame = ( + storage.to_numpy() + .astype(storage_type) + .reshape((height, width, channels)) + ) + elif encoding in ["jpeg", "jpg", "jpe", "bmp", "webp", "png"]: + storage = storage.to_numpy() + frame = cv2.imdecode(storage, cv2.IMREAD_COLOR) + frame = frame[:, :, ::-1] # OpenCV image (BGR to RGB) + else: + raise RuntimeError(f"Unsupported image encoding: {encoding}") + frames[f"last_{event_id}"] = frames.get( + event_id, Image.fromarray(frame) + ) + frames[event_id] = Image.fromarray(frame) + elif "jointstate" in event_id: + joints[event_id] = event["value"].to_numpy() + + elif "tick" == event_id: + ## Wait for all images + if len(frames.keys()) < 6: + continue + if len(joints.keys()) < 2: + continue + + ## Embed images + rgbs_lst = [ + [ + frames["last_image_center"], + frames["last_image_right"], + frames["last_image_left"], + ], + [ + frames["image_center"], + frames["image_right"], + frames["image_left"], + ], + ] + image_embeds = process_image( + rgbs_lst, image_processor, vision_encoder + ) + + ## Embed states + proprio = np.concatenate( + [ + joints["jointstate_left"], + joints["jointstate_right"], + ] + ) + states, state_elem_mask, state_indices = get_states(proprio=proprio) + + actions = rdt.predict_action( + lang_tokens=lang_embeddings, + lang_attn_mask=torch.ones( + lang_embeddings.shape[:2], dtype=torch.bool, device=device + ), + img_tokens=image_embeds, + state_tokens=states, # how can I get this? + action_mask=state_elem_mask.unsqueeze(1), # how can I get this? + ctrl_freqs=torch.tensor( + [25.0], device=device + ), # would this default work? 
+ ) # (1, chunk_size, 128) + + # select the meaningful action via STATE_INDICES + action = actions[ + :, :, state_indices + ] # (1, chunk_size, len(STATE_INDICES)) = (1, chunk_size, 7+ 1) + action = action.detach().float().to("cpu").numpy() + node.send_output("action", pa.array(action.ravel())) diff --git a/node-hub/dora-rdt-1b/pyproject.toml b/node-hub/dora-rdt-1b/pyproject.toml new file mode 100644 index 00000000..fcbbee94 --- /dev/null +++ b/node-hub/dora-rdt-1b/pyproject.toml @@ -0,0 +1,36 @@ +[tool.poetry] +name = "dora-rdt-1b" +version = "0.3.6-rc0" +authors = ["Haixuan Xavier Tao "] +description = "Dora Node for RDT-1B" +readme = "README.md" + +packages = [{ include = "dora_rdt_1b" }] + +[tool.poetry.dependencies] +python = "^3.7" +dora-rs = "^0.3.6" +numpy = "< 2.0.0" +torch = "^2.4.0" +torchvision = "^0.19" +transformers = "^4.45" +qwen-vl-utils = "^0.0.2" +accelerate = "^0.33" +opencv-python = ">= 4.1.1" +modelscope = "^1.18.1" +packaging = "24.0" +wandb = "0.17.0" +diffusers = "0.27.2" +timm = "1.0.3" +sentencepiece = "0.2.0" +h5py = "3.11.0" +imgaug = "0.4.0" +# flash_attn = "^2.6.1" # Install using: pip install -U flash-attn --no-build-isolation + + +[tool.poetry.scripts] +dora-rdt-1b = "dora_rdt_1b.main:main" + +[build-system] +requires = ["poetry-core>=1.8.0"] +build-backend = "poetry.core.masonry.api" diff --git a/node-hub/dora-rdt-1b/tests/conftest.py b/node-hub/dora-rdt-1b/tests/conftest.py new file mode 100644 index 00000000..46712c46 --- /dev/null +++ b/node-hub/dora-rdt-1b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest + + +def pytest_configure(): + pytest.rdt = None + pytest.lang_embeddings = None + pytest.image_processor = None + pytest.vision_encoder = None + pytest.image_embeds = None + pytest.state_elem_mask = None + pytest.states = None + pytest.STATE_INDICES = None diff --git a/node-hub/dora-rdt-1b/tests/test_dora_rdt_1b.py b/node-hub/dora-rdt-1b/tests/test_dora_rdt_1b.py new file mode 100644 index 00000000..03fd5c55 --- +++ 
b/node-hub/dora-rdt-1b/tests/test_dora_rdt_1b.py @@ -0,0 +1,227 @@ +import pytest +import torch +import yaml +import numpy as np +from PIL import Image +from torchvision import transforms + + +def test_import_main(): + # from dora_rdt_1b.main import main + + # Check that everything is working, and catch dora Runtime Exception as we're not running in a dora dataflow. + # with pytest.raises(RuntimeError): + pass + # main() + import dora_rdt_1b + import dora_rdt_1b.RoboticsDiffusionTransformer + + +def test_download_policy(): + from dora_rdt_1b.RoboticsDiffusionTransformer.models.rdt_runner import RDTRunner + + pretrained_model_name_or_path = "robotics-diffusion-transformer/rdt-1b" + rdt = RDTRunner.from_pretrained(pretrained_model_name_or_path) + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + rdt.to(device, dtype=dtype) + rdt.eval() + pytest.rdt = rdt + + +def test_download_vision_model(): + from dora_rdt_1b.RoboticsDiffusionTransformer.models.multimodal_encoder.siglip_encoder import ( + SiglipVisionTower, + ) + + # Load vision encoder + vision_encoder = SiglipVisionTower( + vision_tower="google/siglip-so400m-patch14-384", args=None + ) + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + vision_encoder.to(device, dtype=dtype) + vision_encoder.eval() + image_processor = vision_encoder.image_processor + pytest.vision_encoder = vision_encoder + pytest.image_processor = image_processor + + +def test_download_language_embeddings(): + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + lang_embeddings = torch.load( + "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/outs/handover_pan.pt", + map_location=device, + ) + pytest.lang_embeddings = lang_embeddings["embeddings"] + + +def test_load_dummy_image(): + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + config_path = 
"/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/configs/base.yaml" # default config + with open(config_path, "r") as fp: + config = yaml.safe_load(fp) + + # Load pretrained model (in HF style) + image_processor = pytest.image_processor + vision_encoder = pytest.vision_encoder + + previous_image_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/img.jpeg" + # previous_image = None # if t = 0 + previous_image = Image.open(previous_image_path).convert("RGB") # if t > 0 + + current_image_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/img.jpeg" + current_image = Image.open(current_image_path).convert("RGB") + + # here I suppose you only have an image from exterior (e.g., 3rd person view) and you don't have any state information + # the images shoud arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2) + rgbs_lst = [[previous_image, None, None], [current_image, None, None]] + # if your have an right_wrist_image, then it should be + # rgbs_lst = [ + # [previous_image, previous_right_wrist_image, None], + # [current_image, current_right_wrist_image, None] + # ] + + # image pre-processing + # The background image used for padding + background_color = np.array( + [int(x * 255) for x in image_processor.image_mean], dtype=np.uint8 + ).reshape(1, 1, 3) + background_image = ( + np.ones( + (image_processor.size["height"], image_processor.size["width"], 3), + dtype=np.uint8, + ) + * background_color + ) + + image_tensor_list = [] + for step in range(config["common"]["img_history_size"]): + rgbs = rgbs_lst[step % len(rgbs_lst)] + for rgb in rgbs: + if rgb is None: + # Replace it with the background image + image = Image.fromarray(background_image) + else: + image = rgb + + if config["dataset"].get("auto_adjust_image_brightness", False): + pixel_values = list(image.getdata()) + average_brightness = 
sum(sum(pixel) for pixel in pixel_values) / ( + len(pixel_values) * 255.0 * 3 + ) + if average_brightness <= 0.15: + image = transforms.ColorJitter(brightness=(1.75, 1.75))(image) + + if config["dataset"].get("image_aspect_ratio", "pad") == "pad": + + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new( + pil_img.mode, (width, width), background_color + ) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new( + pil_img.mode, (height, height), background_color + ) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square( + image, tuple(int(x * 255) for x in image_processor.image_mean) + ) + image = image_processor.preprocess(image, return_tensors="pt")[ + "pixel_values" + ][0] + image_tensor_list.append(image) + + image_tensor = torch.stack(image_tensor_list, dim=0).to(device, dtype=dtype) + # encode images + image_embeds = vision_encoder(image_tensor).detach() + pytest.image_embeds = image_embeds.reshape( + -1, vision_encoder.hidden_size + ).unsqueeze(0) + + +def test_dummy_states(): + device = torch.device("cuda:0") + dtype = torch.bfloat16 # recommanded + config_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/configs/base.yaml" # default config + with open(config_path, "r") as fp: + config = yaml.safe_load(fp) + + # suppose you do not have proprio + # it's kind of tricky, I strongly suggest adding proprio as input and futher fine-tuning + B, N = 1, 1 # batch size and state history size + states = torch.zeros( + (B, N, config["model"]["state_token_dim"]), device=device, dtype=dtype + ) + + # if you have proprio, you can do like this + # format like this: [arm_joint_0_pos, arm_joint_1_pos, arm_joint_2_pos, arm_joint_3_pos, arm_joint_4_pos, arm_joint_5_pos, arm_joint_6_pos, gripper_open] + # proprio = torch.tensor([0, 1, 2, 3, 4, 5, 6, 
0.5]).reshape((1, 1, -1)) + # states[:, :, STATE_INDICES] = proprio + + state_elem_mask = torch.zeros( + (B, config["model"]["state_token_dim"]), device=device, dtype=torch.bool + ) + from dora_rdt_1b.RoboticsDiffusionTransformer.configs.state_vec import ( + STATE_VEC_IDX_MAPPING, + ) + + # suppose you control in 7DOF joint position + STATE_INDICES = [ + STATE_VEC_IDX_MAPPING["arm_joint_0_pos"], + STATE_VEC_IDX_MAPPING["arm_joint_1_pos"], + STATE_VEC_IDX_MAPPING["arm_joint_2_pos"], + STATE_VEC_IDX_MAPPING["arm_joint_3_pos"], + STATE_VEC_IDX_MAPPING["arm_joint_4_pos"], + STATE_VEC_IDX_MAPPING["arm_joint_5_pos"], + STATE_VEC_IDX_MAPPING["arm_joint_6_pos"], + STATE_VEC_IDX_MAPPING["gripper_open"], + ] + + state_elem_mask[:, STATE_INDICES] = True + states, state_elem_mask = states.to(device, dtype=dtype), state_elem_mask.to( + device, dtype=dtype + ) + states = states[:, -1:, :] # only use the last state + pytest.states = states + pytest.state_elem_mask = state_elem_mask + pytest.STATE_INDICES = STATE_INDICES + + +def test_dummy_input(request): + + rdt = pytest.rdt + lang_embeddings = pytest.lang_embeddings + image_embeds = pytest.image_embeds + state_elem_mask = pytest.state_elem_mask + states = pytest.states + STATE_INDICES = pytest.STATE_INDICES + + device = torch.device("cuda:0") + + actions = rdt.predict_action( + lang_tokens=lang_embeddings, + lang_attn_mask=torch.ones( + lang_embeddings.shape[:2], dtype=torch.bool, device=device + ), + img_tokens=image_embeds, + state_tokens=states, # how can I get this? + action_mask=state_elem_mask.unsqueeze(1), # how can I get this? + ctrl_freqs=torch.tensor([25.0], device=device), # would this default work? + ) # (1, chunk_size, 128) + + # select the meaning action via STATE_INDICES + action = actions[ + :, :, STATE_INDICES + ] # (1, chunk_size, len(STATE_INDICES)) = (1, chunk_size, 7+ 1) + print(action)