
Fix typos within RDT 1B

tags/0.3.8-rc · haixuantao · 1 year ago · commit 8d1bd7afc2
3 changed files with 14 additions and 14 deletions
  1. examples/piper/convert.py (+2 -2)
  2. node-hub/dora-rdt-1b/dora_rdt_1b/main.py (+6 -6)
  3. node-hub/dora-rdt-1b/tests/test_dora_rdt_1b.py (+6 -6)

examples/piper/convert.py (+2 -2)

@@ -4,7 +4,7 @@ from scipy.spatial.transform import Rotation as R
 
 def convert_quaternion_to_euler(quat):
     """
-    Convert Quarternion (xyzw) to Euler angles (rpy)
+    Convert Quaternion (xyzw) to Euler angles (rpy)
     """
     # Normalize
     quat = quat / np.linalg.norm(quat)
@@ -15,7 +15,7 @@ def convert_quaternion_to_euler(quat):
 
 def convert_euler_to_quaternion(euler):
     """
-    Convert Euler angles (rpy) to Quarternion (xyzw)
+    Convert Euler angles (rpy) to Quaternion (xyzw)
    """
     quat = R.from_euler("xyz", euler).as_quat()
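
The two corrected docstrings are inverses of each other; as a quick sanity check, here is a minimal round-trip sketch of the pair (the function bodies are reconstructed around the lines visible in this diff, not copied verbatim from convert.py):

    import numpy as np
    from scipy.spatial.transform import Rotation as R

    def convert_quaternion_to_euler(quat):
        """
        Convert Quaternion (xyzw) to Euler angles (rpy)
        """
        # Normalize first, as convert.py does
        quat = quat / np.linalg.norm(quat)
        return R.from_quat(quat).as_euler("xyz")

    def convert_euler_to_quaternion(euler):
        """
        Convert Euler angles (rpy) to Quaternion (xyzw)
        """
        return R.from_euler("xyz", euler).as_quat()

    # Round trip should recover the input away from gimbal lock
    rpy = np.array([0.1, -0.5, 0.3])
    assert np.allclose(convert_quaternion_to_euler(convert_euler_to_quaternion(rpy)), rpy)

scipy's Rotation already uses the scalar-last (xyzw) convention named in the docstrings, so no component reordering is needed.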



node-hub/dora-rdt-1b/dora_rdt_1b/main.py (+6 -6)

@@ -38,7 +38,7 @@ def get_policy():
     pretrained_model_name_or_path = ROBOTIC_MODEL_NAME_OR_PATH
     rdt = RDTRunner.from_pretrained(pretrained_model_name_or_path)
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     rdt.to(device, dtype=dtype)
     rdt.eval()
     return rdt
@@ -55,7 +55,7 @@ def get_vision_model():
         args=None,
     )
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     vision_encoder.to(device, dtype=dtype)
     vision_encoder.eval()
     image_processor = vision_encoder.image_processor
@@ -91,7 +91,7 @@ def expand2square(pil_img, background_color):
 
 def process_image(rgbs_lst, image_processor, vision_encoder):
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
 
     # previous_image_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/img.jpeg"
     # # previous_image = None # if t = 0
@@ -101,7 +101,7 @@ def process_image(rgbs_lst, image_processor, vision_encoder):
     # current_image = Image.fromarray(current_image_path).convert("RGB")
 
     # here I suppose you only have an image from exterior (e.g., 3rd person view) and you don't have any state information
-    # the images shoud arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2)
+    # the images should arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2)
     # rgbs_lst = [[previous_image, None, None], [current_image, None, None]]
     # if your have an right_wrist_image, then it should be
     # rgbs_lst = [
@@ -137,7 +137,7 @@ def process_image(rgbs_lst, image_processor, vision_encoder):
 
 def get_states(proprio):
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
 
     # suppose you control in 7DOF joint position
     STATE_INDICES = [
@@ -162,7 +162,7 @@ def get_states(proprio):
         (B, N, config["model"]["state_token_dim"]), device=device, dtype=dtype
     )
     # suppose you do not have proprio
-    # it's kind of tricky, I strongly suggest adding proprio as input and futher fine-tuning
+    # it's kind of tricky, I strongly suggest adding proprio as input and further fine-tuning
     proprio = torch.tensor(proprio, device=device, dtype=dtype).reshape(
         (1, 1, -1)
     ) # B, N = 1, 1 # batch size and state history size
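
The ordering comment fixed above is effectively the input contract of process_image; a minimal sketch of a valid rgbs_lst (the image file names are hypothetical placeholders):

    from PIL import Image

    # One [exterior_image, right_wrist_image, left_wrist_image] triple per
    # history step, oldest first; camera views you don't have stay None.
    previous_image = Image.open("frame_t0.jpeg").convert("RGB")  # hypothetical path
    current_image = Image.open("frame_t1.jpeg").convert("RGB")   # hypothetical path
    rgbs_lst = [
        [previous_image, None, None],  # t-1: exterior view only
        [current_image, None, None],   # t:   exterior view only
    ]

With image_history_size = 2 this yields the 2 x 3 slot layout the comment describes.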


node-hub/dora-rdt-1b/tests/test_dora_rdt_1b.py (+6 -6)

@@ -22,7 +22,7 @@ def test_download_policy():
     pretrained_model_name_or_path = "robotics-diffusion-transformer/rdt-1b"
     rdt = RDTRunner.from_pretrained(pretrained_model_name_or_path)
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     rdt.to(device, dtype=dtype)
     rdt.eval()
     pytest.rdt = rdt
@@ -38,7 +38,7 @@ def test_download_vision_model():
         vision_tower="google/siglip-so400m-patch14-384", args=None
     )
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     vision_encoder.to(device, dtype=dtype)
     vision_encoder.eval()
     image_processor = vision_encoder.image_processor
@@ -57,7 +57,7 @@ def test_download_language_embeddings():
 
 def test_load_dummy_image():
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     config_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/configs/base.yaml" # default config
     with open(config_path, "r", encoding="utf-8") as fp:
         config = yaml.safe_load(fp)
@@ -74,7 +74,7 @@ def test_load_dummy_image():
     current_image = Image.open(current_image_path).convert("RGB")
 
     # here I suppose you only have an image from exterior (e.g., 3rd person view) and you don't have any state information
-    # the images shoud arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2)
+    # the images should arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2)
     rgbs_lst = [[previous_image, None, None], [current_image, None, None]]
     # if your have an right_wrist_image, then it should be
     # rgbs_lst = [
@@ -150,13 +150,13 @@ def test_load_dummy_image():
 
 def test_dummy_states():
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     config_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/configs/base.yaml" # default config
     with open(config_path, "r", encoding="utf-8") as fp:
         config = yaml.safe_load(fp)
 
     # suppose you do not have proprio
-    # it's kind of tricky, I strongly suggest adding proprio as input and futher fine-tuning
+    # it's kind of tricky, I strongly suggest adding proprio as input and further fine-tuning
     B, N = 1, 1 # batch size and state history size
     states = torch.zeros(
         (B, N, config["model"]["state_token_dim"]), device=device, dtype=dtype
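
Both corrected comments in this test point at the same zero-proprio fallback; a standalone sketch of that construction (state_token_dim is assumed here rather than read from configs/base.yaml):

    import torch

    B, N = 1, 1  # batch size and state history size
    state_token_dim = 128  # assumed; the test reads config["model"]["state_token_dim"]
    device = torch.device("cuda:0")
    dtype = torch.bfloat16  # recommended

    # All-zero placeholder states; as the fixed comment warns, passing real
    # proprio and further fine-tuning is strongly preferred over this fallback.
    states = torch.zeros((B, N, state_token_dim), device=device, dtype=dtype)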

