@@ -4,7 +4,7 @@ from scipy.spatial.transform import Rotation as R
 def convert_quaternion_to_euler(quat):
     """
-    Convert Quarternion (xyzw) to Euler angles (rpy)
+    Convert Quaternion (xyzw) to Euler angles (rpy)
     """
     # Normalize
     quat = quat / np.linalg.norm(quat)
@@ -15,7 +15,7 @@ def convert_quaternion_to_euler(quat):
 def convert_euler_to_quaternion(euler):
     """
-    Convert Euler angles (rpy) to Quarternion (xyzw)
+    Convert Euler angles (rpy) to Quaternion (xyzw)
     """
     quat = R.from_euler("xyz", euler).as_quat()
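For reference, the two converters are inverses of each other up to quaternion sign. A minimal round-trip sketch, assuming the xyzw quaternion layout and the intrinsic "xyz" Euler convention used in convert_euler_to_quaternion above; the sample quaternion (roughly a 45-degree yaw) is made up for illustration:

import numpy as np
from scipy.spatial.transform import Rotation as R

quat = np.array([0.0, 0.0, 0.3826834, 0.9238795])  # placeholder: ~45 deg yaw, xyzw order
euler = R.from_quat(quat / np.linalg.norm(quat)).as_euler("xyz")  # quaternion -> roll/pitch/yaw
quat_back = R.from_euler("xyz", euler).as_quat()                  # roll/pitch/yaw -> quaternion
np.testing.assert_allclose(quat, quat_back, atol=1e-6)            # round trip recovers the input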
@@ -38,7 +38,7 @@ def get_policy():
     pretrained_model_name_or_path = ROBOTIC_MODEL_NAME_OR_PATH
     rdt = RDTRunner.from_pretrained(pretrained_model_name_or_path)
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     rdt.to(device, dtype=dtype)
     rdt.eval()
     return rdt
@@ -55,7 +55,7 @@ def get_vision_model():
         args=None,
     )
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     vision_encoder.to(device, dtype=dtype)
     vision_encoder.eval()
     image_processor = vision_encoder.image_processor
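As a usage note, the image_processor returned here prepares single frames for the vision encoder. A minimal sketch, assuming the standard Hugging Face image-processor interface and the expand2square padding helper defined in this file (see the next hunk); the file path is a placeholder:

from PIL import Image

frame = Image.open("exterior_view.jpeg").convert("RGB")  # placeholder path
background = tuple(int(x * 255) for x in image_processor.image_mean)  # pad color from the processor's mean
frame = expand2square(frame, background)  # square-pad first so resizing keeps the aspect ratio
pixel_values = image_processor.preprocess(frame, return_tensors="pt")["pixel_values"]
pixel_values = pixel_values.to(device, dtype=dtype)  # same cuda:0 / bfloat16 placement as above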
@@ -91,7 +91,7 @@ def expand2square(pil_img, background_color):
 def process_image(rgbs_lst, image_processor, vision_encoder):
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     # previous_image_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/img.jpeg"
     # # previous_image = None # if t = 0
@@ -101,7 +101,7 @@ def process_image(rgbs_lst, image_processor, vision_encoder):
     # current_image = Image.fromarray(current_image_path).convert("RGB")
     # here I suppose you only have an image from exterior (e.g., 3rd person view) and you don't have any state information
-    # the images shoud arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2)
+    # the images should arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2)
     # rgbs_lst = [[previous_image, None, None], [current_image, None, None]]
     # if your have an right_wrist_image, then it should be
     # rgbs_lst = [
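To make the layout described in those comments concrete, here is a hypothetical construction of rgbs_lst with an image history of 2, an exterior camera for both steps, and a right-wrist camera only for the current step; file names are placeholders and cameras you do not have stay None:

from PIL import Image

previous_exterior = Image.open("exterior_t0.jpeg").convert("RGB")  # placeholder paths
current_exterior = Image.open("exterior_t1.jpeg").convert("RGB")
current_right_wrist = Image.open("right_wrist_t1.jpeg").convert("RGB")

# one inner list per history step: [exterior_image, right_wrist_image, left_wrist_image]
rgbs_lst = [
    [previous_exterior, None, None],                # t - 1
    [current_exterior, current_right_wrist, None],  # t
]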
@@ -137,7 +137,7 @@ def process_image(rgbs_lst, image_processor, vision_encoder):
 def get_states(proprio):
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     # suppose you control in 7DOF joint position
     STATE_INDICES = [
@@ -162,7 +162,7 @@ def get_states(proprio):
         (B, N, config["model"]["state_token_dim"]), device=device, dtype=dtype
     )
     # suppose you do not have proprio
-    # it's kind of tricky, I strongly suggest adding proprio as input and futher fine-tuning
+    # it's kind of tricky, I strongly suggest adding proprio as input and further fine-tuning
     proprio = torch.tensor(proprio, device=device, dtype=dtype).reshape(
         (1, 1, -1)
     ) # B, N = 1, 1 # batch size and state history size
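Since the comment above discourages running without proprioception, here is a sketch of the proprio path under the same assumptions (7-DoF joint positions, B = N = 1, bfloat16 on cuda:0). The joint values, the state-vector size, and the scatter into the unified state vector via STATE_INDICES are illustrative placeholders, not taken verbatim from the file:

import torch

device = torch.device("cuda:0")
dtype = torch.bfloat16

state_token_dim = 128           # placeholder for config["model"]["state_token_dim"]
STATE_INDICES = list(range(7))  # placeholder for the real joint-position indices

joint_positions = [0.0] * 7  # placeholder 7-DoF reading
proprio = torch.tensor(joint_positions, device=device, dtype=dtype).reshape((1, 1, -1))  # (B, N, 7)

# write the reading into the unified state vector and mark those dimensions as observed
states = torch.zeros((1, 1, state_token_dim), device=device, dtype=dtype)
states[:, :, STATE_INDICES] = proprio
state_elem_mask = torch.zeros((1, state_token_dim), device=device, dtype=dtype)
state_elem_mask[:, STATE_INDICES] = 1.0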
@@ -22,7 +22,7 @@ def test_download_policy():
     pretrained_model_name_or_path = "robotics-diffusion-transformer/rdt-1b"
     rdt = RDTRunner.from_pretrained(pretrained_model_name_or_path)
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     rdt.to(device, dtype=dtype)
     rdt.eval()
     pytest.rdt = rdt
@@ -38,7 +38,7 @@ def test_download_vision_model():
         vision_tower="google/siglip-so400m-patch14-384", args=None
     )
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     vision_encoder.to(device, dtype=dtype)
     vision_encoder.eval()
     image_processor = vision_encoder.image_processor
@@ -57,7 +57,7 @@ def test_download_language_embeddings():
 def test_load_dummy_image():
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     config_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/configs/base.yaml" # default config
     with open(config_path, "r", encoding="utf-8") as fp:
         config = yaml.safe_load(fp)
@@ -74,7 +74,7 @@ def test_load_dummy_image():
     current_image = Image.open(current_image_path).convert("RGB")
     # here I suppose you only have an image from exterior (e.g., 3rd person view) and you don't have any state information
-    # the images shoud arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2)
+    # the images should arrange in sequence [exterior_image, right_wrist_image, left_wrist_image] * image_history_size (e.g., 2)
     rgbs_lst = [[previous_image, None, None], [current_image, None, None]]
     # if your have an right_wrist_image, then it should be
     # rgbs_lst = [
@@ -150,13 +150,13 @@ def test_load_dummy_image():
 def test_dummy_states():
     device = torch.device("cuda:0")
-    dtype = torch.bfloat16 # recommanded
+    dtype = torch.bfloat16 # recommended
     config_path = "/mnt/hpfs/1ms.ai/dora/node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer/configs/base.yaml" # default config
     with open(config_path, "r", encoding="utf-8") as fp:
         config = yaml.safe_load(fp)
     # suppose you do not have proprio
-    # it's kind of tricky, I strongly suggest adding proprio as input and futher fine-tuning
+    # it's kind of tricky, I strongly suggest adding proprio as input and further fine-tuning
     B, N = 1, 1 # batch size and state history size
     states = torch.zeros(
         (B, N, config["model"]["state_token_dim"]), device=device, dtype=dtype