|
|
@@ -11,11 +11,13 @@ import torch
 
 from dora import Node
 from PIL import Image
 from vggt.models.vggt import VGGT
+from vggt.utils.geometry import unproject_depth_map_to_point_map
 from vggt.utils.load_fn import load_and_preprocess_images
 from vggt.utils.pose_enc import pose_encoding_to_extri_intri
 
+CAMERA_HEIGHT_Y = os.getenv("CAMERA_HEIGHT_Y", "0.115")
 # bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+)
 dtype = torch.bfloat16
 
 # Check if cuda is available and set the device accordingly
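Note on the dtype above: bfloat16 requires Compute Capability 8.0+ (Ampere or newer). A minimal sketch of making that check explicit with torch.cuda.get_device_capability; the fallback choices are an assumption, not part of this node, which hardcodes bfloat16:

    import torch

    # Sketch only: bfloat16 on Compute Capability 8.0+ (Ampere and newer),
    # float16 on older CUDA GPUs, float32 on CPU.
    if torch.cuda.is_available():
        device = "cuda"
        major, _minor = torch.cuda.get_device_capability()
        dtype = torch.bfloat16 if major >= 8 else torch.float16
    else:
        device = "cpu"
        dtype = torch.float32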
|
|
@@ -27,7 +29,6 @@ model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
 model.eval()
 
 DEPTH_ENCODING = os.environ.get("DEPTH_ENCODING", "float64")
-# Import vecdeque
 
 
 def main():
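Both settings above come from the environment, and os.getenv / os.environ.get always return strings, so CAMERA_HEIGHT_Y is the string "0.115" here. A small sketch of parsing and validating both up front; the float conversion and the whitelist are assumptions, not in the node:

    import os

    # Camera height in meters; parse the env string once so it can be
    # used numerically later.
    CAMERA_HEIGHT_Y = float(os.getenv("CAMERA_HEIGHT_Y", "0.115"))

    # Depth output encoding: float64 meters, or mono16 millimeters.
    DEPTH_ENCODING = os.environ.get("DEPTH_ENCODING", "float64")
    if DEPTH_ENCODING not in ("float64", "mono16"):
        raise ValueError(f"Unsupported DEPTH_ENCODING: {DEPTH_ENCODING!r}")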
|
|
@@ -94,28 +95,62 @@ def main():
             extrinsic, intrinsic = pose_encoding_to_extri_intri(
                 pose_enc, images.shape[-2:]
             )
+            intrinsic = intrinsic[-1][-1]
+            f_0 = intrinsic[0, 0]
+            f_1 = intrinsic[1, 1]
+            r_0 = intrinsic[0, 2]
+            r_1 = intrinsic[1, 2]
+
+            print(f"Extrinsic: {extrinsic}")
+            print(f"Intrinsic: {intrinsic}")
 
             # Predict Depth Maps
             depth_map, depth_conf = model.depth_head(
                 aggregated_tokens_list, images, ps_idx
             )
-            depth_map[depth_conf < 1.0] = 0.0  # Set low confidence pixels to 0
+            print(depth_conf.max())
+            depth_map[depth_conf < 0.6] = 0.0  # Set low confidence pixels to 0
+
+            # Construct 3D Points from Depth Maps and Cameras
+            # which usually leads to more accurate 3D points than point map branch
+            point_map_by_unprojection = unproject_depth_map_to_point_map(
+                depth_map.squeeze(0), extrinsic.squeeze(0), intrinsic.squeeze(0)
+            )
+
+            # Get the last quartile of the 2nd axis
+            z_value = point_map_by_unprojection[0, :, :, 2]  # S, H, W, 3
+            scale_factor = 0.51
+
+            print(
+                f"Event Id: {event['id']} Scale factor: {scale_factor}, with height: {CAMERA_HEIGHT_Y} and max depth: {point_map_by_unprojection[0, :, :, 1].max()}"
+            )
+            print(
+                f" 0. all min and max depth values: {point_map_by_unprojection[0, :, :, 0].min()} / {point_map_by_unprojection[0, :, :, 0].max()}"
+            )
+            print(
+                f" 1. all min and max depth values: {point_map_by_unprojection[0, :, :, 1].min()} / {point_map_by_unprojection[0, :, :, 1].max()}"
+            )
+            print(
+                f" 2. all min and max depth values: {point_map_by_unprojection[0, :, :, 2].min()} / {point_map_by_unprojection[0, :, :, 2].max()}"
+            )
+
+            print(
+                f"Depth map before scaling: min and max: {depth_map.min()} / {depth_map.max()}"
+            )
+            depth_map = (
+                depth_map * scale_factor
+            )  # Scale depth map to the desired depth
+            print(
+                f"Depth map after scaling min and max in meters: {depth_map.min()} / {depth_map.max()}. Depth map shape: {depth_map.shape}"
+            )
             depth_map = depth_map.to(torch.float64)
 
-            intrinsic = intrinsic[-1][-1]
-            f_0 = intrinsic[0, 0]
-            f_1 = intrinsic[1, 1]
-            r_0 = intrinsic[0, 2]
-            r_1 = intrinsic[1, 2]
             depth_map = depth_map[-1][-1].cpu().numpy()
 
             # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
             if DEPTH_ENCODING == "mono16":
                 depth_map = (depth_map * 1000).astype(np.uint16)
 
             node.send_output(
-                output_id="depth",
+                output_id=event["id"].replace("image", "depth"),
                 data=pa.array(depth_map.ravel()),
                 metadata={
                     "width": depth_map.shape[1],
|
|
@@ -137,13 +172,9 @@ def main():
             # reorder pixels to be in last dimension
             image = image.transpose(1, 2, 0)
-            print(
-                f"Image shape: {image.shape}, dtype: {image.dtype} and depth map shape: {depth_map.shape}, dtype: {depth_map.dtype}"
-            )
-
 
             # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
             node.send_output(
-                output_id="image",
+                output_id=event["id"],
                 data=pa.array(image.ravel()),
                 metadata={
                     "encoding": "rgb8",
|
|
|