|
|
@@ -11,11 +11,13 @@ import torch
 
 from dora import Node
 from PIL import Image
 from vggt.models.vggt import VGGT
+from vggt.utils.geometry import unproject_depth_map_to_point_map
 from vggt.utils.load_fn import load_and_preprocess_images
 from vggt.utils.pose_enc import pose_encoding_to_extri_intri
 
+CAMERA_HEIGHT_Y = os.getenv("CAMERA_HEIGHT_Y", "0.115")
 # bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+)
 dtype = torch.bfloat16
 
 # Check if cuda is available and set the device accordingly
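Note on the dtype above: bfloat16 requires Compute Capability 8.0+ (Ampere or newer). A minimal sketch of making that check explicit with torch.cuda.get_device_capability; the fallback choices are an assumption, not part of this node, which hardcodes bfloat16:

    import torch

    # Sketch only: bfloat16 on Compute Capability 8.0+ (Ampere and newer),
    # float16 on older CUDA GPUs, float32 on CPU.
    if torch.cuda.is_available():
        device = "cuda"
        major, _minor = torch.cuda.get_device_capability()
        dtype = torch.bfloat16 if major >= 8 else torch.float16
    else:
        device = "cpu"
        dtype = torch.float32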
|
|
@@ -27,7 +29,6 @@ model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
 model.eval()
 
 DEPTH_ENCODING = os.environ.get("DEPTH_ENCODING", "float64")
-# Import vecdeque
 
 
 def main():
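Both settings above come from the environment, and os.getenv / os.environ.get always return strings, so CAMERA_HEIGHT_Y is the string "0.115" here. A small sketch of parsing and validating both up front; the float conversion and the whitelist are assumptions, not in the node:

    import os

    # Camera height in meters; parse the env string once so it can be
    # used numerically later.
    CAMERA_HEIGHT_Y = float(os.getenv("CAMERA_HEIGHT_Y", "0.115"))

    # Depth output encoding: float64 meters, or mono16 millimeters.
    DEPTH_ENCODING = os.environ.get("DEPTH_ENCODING", "float64")
    if DEPTH_ENCODING not in ("float64", "mono16"):
        raise ValueError(f"Unsupported DEPTH_ENCODING: {DEPTH_ENCODING!r}")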
|
|
@@ -94,28 +95,62 @@ def main():
             extrinsic, intrinsic = pose_encoding_to_extri_intri(
                 pose_enc, images.shape[-2:]
             )
+            intrinsic = intrinsic[-1][-1]
+            f_0 = intrinsic[0, 0]
+            f_1 = intrinsic[1, 1]
+            r_0 = intrinsic[0, 2]
+            r_1 = intrinsic[1, 2]
+
+            print(f"Extrinsic: {extrinsic}")
+            print(f"Intrinsic: {intrinsic}")
 
             # Predict Depth Maps
             depth_map, depth_conf = model.depth_head(
                 aggregated_tokens_list, images, ps_idx
             )
-            depth_map[depth_conf < 1.0] = 0.0  # Set low confidence pixels to 0
+            print(depth_conf.max())
+            depth_map[depth_conf < 0.6] = 0.0  # Set low confidence pixels to 0
+
+            # Construct 3D Points from Depth Maps and Cameras
+            # which usually leads to more accurate 3D points than point map branch
+            point_map_by_unprojection = unproject_depth_map_to_point_map(
+                depth_map.squeeze(0), extrinsic.squeeze(0), intrinsic.squeeze(0)
+            )
+
+            # Get the last quartile of the 2nd axis
+            z_value = point_map_by_unprojection[0, :, :, 2]  # S, H, W, 3
+            scale_factor = 0.51
+
+            print(
+                f"Event Id: {event['id']} Scale factor: {scale_factor}, with height: {CAMERA_HEIGHT_Y} and max depth: {point_map_by_unprojection[0, :, :, 1].max()}"
+            )
+            print(
+                f" 0. all min and max depth values: {point_map_by_unprojection[0, :, :, 0].min()} / {point_map_by_unprojection[0, :, :, 0].max()}"
+            )
+            print(
+                f" 1. all min and max depth values: {point_map_by_unprojection[0, :, :, 1].min()} / {point_map_by_unprojection[0, :, :, 1].max()}"
+            )
+            print(
+                f" 2. all min and max depth values: {point_map_by_unprojection[0, :, :, 2].min()} / {point_map_by_unprojection[0, :, :, 2].max()}"
+            )
+
+            print(
+                f"Depth map before scaling: min and max: {depth_map.min()} / {depth_map.max()}"
+            )
+            depth_map = (
+                depth_map * scale_factor
+            )  # Scale depth map to the desired depth
+            print(
+                f"Depth map after scaling min and max in meters: {depth_map.min()} / {depth_map.max()}. Depth map shape: {depth_map.shape}"
+            )
             depth_map = depth_map.to(torch.float64)
 
-            intrinsic = intrinsic[-1][-1]
-            f_0 = intrinsic[0, 0]
-            f_1 = intrinsic[1, 1]
-            r_0 = intrinsic[0, 2]
-            r_1 = intrinsic[1, 2]
             depth_map = depth_map[-1][-1].cpu().numpy()
 
             # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
             if DEPTH_ENCODING == "mono16":
                 depth_map = (depth_map * 1000).astype(np.uint16)
 
             node.send_output(
-                output_id="depth",
+                output_id=event["id"].replace("image", "depth"),
                 data=pa.array(depth_map.ravel()),
                 metadata={
                     "width": depth_map.shape[1],
|
|
@@ -137,13 +172,9 @@ def main():
             # reorder pixels to be in last dimension
             image = image.transpose(1, 2, 0)
-            print(
-                f"Image shape: {image.shape}, dtype: {image.dtype} and depth map shape: {depth_map.shape}, dtype: {depth_map.dtype}"
-            )
-
 
             # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
             node.send_output(
-                output_id="image",
+                output_id=event["id"],
                 data=pa.array(image.ravel()),
                 metadata={
                     "encoding": "rgb8",
|
|
|