Adding monochrome encoding to avif (#1037)

This makes it possible to use rav1e to encode a monochrome depth image to
be stored or shared as a standalone AVIF file, which makes it easier to
work with depth images outside of dora-rs.
tags/v0.3.12
Haixuan Xavier Tao committed 6 months ago
commit e2d154dae0
7 changed files with 133 additions and 24 deletions
  1. Cargo.lock (+2, -2)
  2. examples/vggt/depth-to-avif.yaml (+54, -0)
  3. examples/vggt/depth.dora-session.yaml (+0, -8)
  4. examples/vggt/image_saver.py (+34, -0)
  5. node-hub/dora-rav1e/Cargo.toml (+1, -1)
  6. node-hub/dora-rav1e/src/lib.rs (+23, -2)
  7. node-hub/dora-vggt/dora_vggt/main.py (+19, -11)
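
Before the per-file diffs, a note on the convention this commit introduces: dora-vggt emits depth in meters, and the new mono16 path stores it as millimeters in unsigned 16-bit integers before rav1e compresses it. A minimal sketch of that round trip (illustration only, not code from this commit):

import numpy as np

# mono16 stores metric depth as millimeters in uint16, so the round trip
# loses at most 1 mm and saturates at about 65.5 m.
depth_m = np.array([[0.5, 1.234], [3.14159, 10.0]], dtype=np.float32)
depth_mm = (depth_m * 1000).astype(np.uint16)    # quantize, as dora-vggt now does
restored = depth_mm.astype(np.float32) / 1000.0  # what a decoder would undo
assert np.abs(restored - depth_m).max() < 1e-3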

Cargo.lock (+2, -2)

@@ -1165,9 +1165,9 @@ dependencies = [

[[package]]
name = "avif-serialize"
-version = "0.8.3"
+version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98922d6a4cfbcb08820c69d8eeccc05bb1f29bfa06b4f5b1dbfe9a868bd7608e"
+checksum = "19135c0c7a60bfee564dbe44ab5ce0557c6bf3884e5291a50be76a15640c4fbd"
dependencies = [
 "arrayvec",
]


examples/vggt/depth-to-avif.yaml (+54, -0)

@@ -0,0 +1,54 @@
nodes:
  - id: camera
    build: pip install opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 1

  - id: dora-vggt
    build: pip install -e ../../node-hub/dora-vggt
    path: dora-vggt
    inputs:
      image: camera/image
    outputs:
      - depth
      - image
    env:
      DEPTH_ENCODING: mono16

  - id: rav1e-depth
    path: dora-rav1e
    build: cargo build -p dora-rav1e --release
    inputs:
      depth: dora-vggt/depth
    outputs:
      - depth
    env:
      ENCODING: avif

  - id: rav1e-image
    path: dora-rav1e
    build: cargo build -p dora-rav1e --release
    inputs:
      image: dora-vggt/image
    outputs:
      - image
    env:
      ENCODING: avif

  - id: bench
    path: image_saver.py
    inputs:
      camera_depth: rav1e-image/image
      vggt_depth: rav1e-depth/depth

  - id: plot
    build: pip install dora-rerun
    path: dora-rerun
    inputs:
      camera/image: dora-vggt/image
      camera/depth: dora-vggt/depth
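
Once this dataflow has run, the bench node leaves standalone .avif files in out/ that ordinary image tooling can open, which is the point of this change. A sketch for inspecting one outside dora-rs, assuming the third-party pillow-avif-plugin package and a file name produced by image_saver.py:

import numpy as np
import pillow_avif  # noqa: F401  (assumed third-party plugin; registers an AVIF decoder with Pillow)
from PIL import Image

img = Image.open("out/vggt_depth_0.avif")  # hypothetical output file
depth = np.asarray(img)  # caution: some decoders downconvert to 8 bits; check img.mode
print(img.mode, depth.shape, depth.dtype, depth.max())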

examples/vggt/depth.dora-session.yaml (+0, -8)

@@ -1,8 +0,0 @@
build_id: 2b402c1e-e52e-45e9-86e5-236b33a77369
session_id: 275de19c-e605-4865-bc5f-2f15916bade9
git_sources: {}
local_build:
node_working_dirs:
camera: /Users/xaviertao/Documents/work/dora/examples/vggt
dora-vggt: /Users/xaviertao/Documents/work/dora/examples/vggt
plot: /Users/xaviertao/Documents/work/dora/examples/vggt

examples/vggt/image_saver.py (+34, -0)

@@ -0,0 +1,34 @@
from dora import Node

node = Node()

index_dict = {}
i = 0

LEAD_TOPIC = "vggt_depth"

for event in node:
    if event["type"] == "INPUT":
        if LEAD_TOPIC in event["id"]:
            storage = event["value"]
            metadata = event["metadata"]
            encoding = metadata["encoding"]
            width = metadata["width"]
            height = metadata["height"]

            # Save to file
            filename = f"out/{event['id']}_{i}.{encoding}"
            with open(filename, "wb") as f:
                f.write(storage.to_numpy())
            for key, value in index_dict.items():
                filename = f"out/{key}_{i}.{value['metadata']['encoding']}"
                with open(filename, "wb") as f:
                    f.write(value["value"])
            i += 1
        else:
            # Store the event in the index dictionary
            index_dict[event["id"]] = {
                "type": event["type"],
                "value": event["value"].to_numpy(),
                "metadata": event["metadata"],
            }
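
Two practical notes on this saver, sketched below rather than part of the commit: it writes into out/ without creating the directory first, and its file names follow the out/{input_id}_{index}.{encoding} pattern, so saved camera and depth frames can be paired by index afterwards:

import os
from pathlib import Path

os.makedirs("out", exist_ok=True)  # image_saver.py assumes this directory exists

# Pair saved frames by index via the out/{input_id}_{index}.{encoding} naming.
for depth_file in sorted(Path("out").glob("vggt_depth_*.avif")):
    index = depth_file.stem.rsplit("_", 1)[-1]
    camera_file = Path("out") / f"camera_depth_{index}.avif"
    print(index, depth_file.name, camera_file.exists())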

node-hub/dora-rav1e/Cargo.toml (+1, -1)

@@ -25,7 +25,7 @@ pyo3 = { workspace = true, features = [
  "eyre",
  "generate-import-lib",
], optional = true }
-avif-serialize = "0.8.3"
+avif-serialize = "0.8.4"


[lib]


node-hub/dora-rav1e/src/lib.rs (+23, -2)

@@ -336,7 +336,7 @@ pub fn lib_main() -> Result<()> {
            if let Some(buffer) = data.as_primitive_opt::<UInt16Type>() {
                let mut buffer = buffer.values().to_vec();
                if std::env::var("FILL_ZEROS")
-                    .map(|s| s != "false")
+                    .map(|s| s.to_lowercase() != "false")
                    .unwrap_or(true)
                {
                    fill_zeros_toward_center_y_plane_in_place(&mut buffer, width, height);
@@ -370,7 +370,28 @@ pub fn lib_main() -> Result<()> {
                let data = pkt.data;
                match output_encoding.as_str() {
                    "avif" => {
-                        warn!("avif encoding not supported for mono16");
+                        metadata.parameters.insert(
+                            "encoding".to_string(),
+                            Parameter::String("avif".to_string()),
+                        );
+
+                        let data = avif_serialize::Aviffy::new()
+                            .full_color_range(false)
+                            .set_seq_profile(0)
+                            .set_monochrome(true)
+                            .to_vec(
+                                &data,
+                                None,
+                                enc.width as u32,
+                                enc.height as u32,
+                                enc.bit_depth as u8,
+                            );
+
+                        let arrow = data.into_arrow();
+
+                        node.send_output(id, metadata.parameters.clone(), arrow)
+                            .context("could not send output")
+                            .unwrap();
                    }
                    _ => {
                        metadata.parameters.insert(
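
avif-serialize contributes only the container here: the rav1e packet is raw compressed AV1 data, and Aviffy wraps it in an ISO-BMFF file whose first box is ftyp, normally carrying the avif major brand. A hypothetical spot check on the emitted bytes, from Python:

def looks_like_avif(buf: bytes) -> bool:
    # ISO-BMFF layout: 4-byte box size, then "ftyp", then the major brand.
    return len(buf) >= 12 and buf[4:8] == b"ftyp" and buf[8:12] == b"avif"

with open("out/vggt_depth_0.avif", "rb") as f:  # hypothetical output file
    print(looks_like_avif(f.read(12)))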


node-hub/dora-vggt/dora_vggt/main.py (+19, -11)

@@ -1,6 +1,7 @@
"""TODO: Add docstring."""

import io
+import os
from collections import deque as Deque

import cv2
@@ -17,11 +18,15 @@ from vggt.utils.pose_enc import pose_encoding_to_extri_intri

dtype = torch.bfloat16

+# Check if cuda is available and set the device accordingly
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
-model = VGGT.from_pretrained("facebook/VGGT-1B").to("cuda")
+model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
model.eval()

+DEPTH_ENCODING = os.environ.get("DEPTH_ENCODING", "float64")
# Import vecdeque


@@ -32,7 +37,6 @@ def main():

    for event in node:
        if event["type"] == "INPUT":
-
            if "image" in event["id"]:
                storage = event["value"]
                metadata = event["metadata"]
@@ -80,7 +84,7 @@ def main():
                raw_images.append(buffer)

                with torch.no_grad():
-                    images = load_and_preprocess_images(raw_images).to("cuda")
+                    images = load_and_preprocess_images(raw_images).to(device)

                    images = images[None]  # add batch dimension
                    aggregated_tokens_list, ps_idx = model.aggregator(images)
@@ -107,20 +111,24 @@ def main():
                depth_map = depth_map[-1][-1].cpu().numpy()

                # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
+                if DEPTH_ENCODING == "mono16":
+                    depth_map = (depth_map * 1000).astype(np.uint16)
+
                node.send_output(
                    output_id="depth",
                    data=pa.array(depth_map.ravel()),
                    metadata={
                        "width": depth_map.shape[1],
                        "height": depth_map.shape[0],
-                        "focal": [
-                            int(f_0),
-                            int(f_1),
-                        ],
-                        "resolution": [
-                            int(r_0),
-                            int(r_1),
-                        ],
+                        "encoding": DEPTH_ENCODING,
+                        "focal": [
+                            int(f_0),
+                            int(f_1),
+                        ],
+                        "resolution": [
+                            int(r_0),
+                            int(r_1),
+                        ],
                    },
                )
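
Because the depth output is sent as a flat Arrow array, a downstream node rebuilds the 2-D map from the metadata attached above. A hypothetical consumer sketch, assuming the event shape used elsewhere in this commit:

import numpy as np

def decode_depth(event):
    meta = event["metadata"]
    depth = event["value"].to_numpy()
    if meta["encoding"] == "mono16":
        # undo the millimeter quantization applied before sending
        depth = depth.astype(np.float32) / 1000.0
    return depth.reshape(meta["height"], meta["width"])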


