Adding monochrome encoding to avif (#1037)

This makes it possible to use rav1e to encode a monochrome depth image to
be stored or shared as a standalone AVIF file, which makes it easier to
work with depth images outside of dora-rs.
tags/v0.3.12
Haixuan Xavier Tao committed 6 months ago
commit e2d154dae0
7 changed files with 133 additions and 24 deletions
  1. Cargo.lock (+2, -2)
  2. examples/vggt/depth-to-avif.yaml (+54, -0)
  3. examples/vggt/depth.dora-session.yaml (+0, -8)
  4. examples/vggt/image_saver.py (+34, -0)
  5. node-hub/dora-rav1e/Cargo.toml (+1, -1)
  6. node-hub/dora-rav1e/src/lib.rs (+23, -2)
  7. node-hub/dora-vggt/dora_vggt/main.py (+19, -11)
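
Before the per-file diffs, a note on the convention this commit introduces: dora-vggt emits depth in meters, and the new mono16 path stores it as millimeters in unsigned 16-bit integers before rav1e compresses it. A minimal sketch of that round trip (illustration only, not code from this commit):

import numpy as np

# mono16 stores metric depth as millimeters in uint16, so the round trip
# loses at most 1 mm and saturates at about 65.5 m.
depth_m = np.array([[0.5, 1.234], [3.14159, 10.0]], dtype=np.float32)
depth_mm = (depth_m * 1000).astype(np.uint16)    # quantize, as dora-vggt now does
restored = depth_mm.astype(np.float32) / 1000.0  # what a decoder would undo
assert np.abs(restored - depth_m).max() < 1e-3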

Cargo.lock (+2, -2)

@@ -1165,9 +1165,9 @@ dependencies = [

[[package]]
name = "avif-serialize"
-version = "0.8.3"
+version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98922d6a4cfbcb08820c69d8eeccc05bb1f29bfa06b4f5b1dbfe9a868bd7608e"
+checksum = "19135c0c7a60bfee564dbe44ab5ce0557c6bf3884e5291a50be76a15640c4fbd"
dependencies = [
 "arrayvec",
]


examples/vggt/depth-to-avif.yaml (+54, -0)

@@ -0,0 +1,54 @@
nodes:
  - id: camera
    build: pip install opencv-video-capture
    path: opencv-video-capture
    inputs:
      tick: dora/timer/millis/100
    outputs:
      - image
    env:
      CAPTURE_PATH: 1

  - id: dora-vggt
    build: pip install -e ../../node-hub/dora-vggt
    path: dora-vggt
    inputs:
      image: camera/image
    outputs:
      - depth
      - image
    env:
      DEPTH_ENCODING: mono16

  - id: rav1e-depth
    path: dora-rav1e
    build: cargo build -p dora-rav1e --release
    inputs:
      depth: dora-vggt/depth
    outputs:
      - depth
    env:
      ENCODING: avif

  - id: rav1e-image
    path: dora-rav1e
    build: cargo build -p dora-rav1e --release
    inputs:
      image: dora-vggt/image
    outputs:
      - image
    env:
      ENCODING: avif

  - id: bench
    path: image_saver.py
    inputs:
      camera_depth: rav1e-image/image
      vggt_depth: rav1e-depth/depth

  - id: plot
    build: pip install dora-rerun
    path: dora-rerun
    inputs:
      camera/image: dora-vggt/image
      camera/depth: dora-vggt/depth
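
Once this dataflow has run, the bench node leaves standalone .avif files in out/ that ordinary image tooling can open, which is the point of this change. A sketch for inspecting one outside dora-rs, assuming the third-party pillow-avif-plugin package and a file name produced by image_saver.py:

import numpy as np
import pillow_avif  # noqa: F401  (assumed third-party plugin; registers an AVIF decoder with Pillow)
from PIL import Image

img = Image.open("out/vggt_depth_0.avif")  # hypothetical output file
depth = np.asarray(img)  # caution: some decoders downconvert to 8 bits; check img.mode
print(img.mode, depth.shape, depth.dtype, depth.max())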

examples/vggt/depth.dora-session.yaml (+0, -8)

@@ -1,8 +0,0 @@
build_id: 2b402c1e-e52e-45e9-86e5-236b33a77369
session_id: 275de19c-e605-4865-bc5f-2f15916bade9
git_sources: {}
local_build:
node_working_dirs:
camera: /Users/xaviertao/Documents/work/dora/examples/vggt
dora-vggt: /Users/xaviertao/Documents/work/dora/examples/vggt
plot: /Users/xaviertao/Documents/work/dora/examples/vggt

examples/vggt/image_saver.py (+34, -0)

@@ -0,0 +1,34 @@
from dora import Node

node = Node()

index_dict = {}
i = 0

LEAD_TOPIC = "vggt_depth"

for event in node:
    if event["type"] == "INPUT":
        if LEAD_TOPIC in event["id"]:
            storage = event["value"]
            metadata = event["metadata"]
            encoding = metadata["encoding"]
            width = metadata["width"]
            height = metadata["height"]

            # Save to file
            filename = f"out/{event['id']}_{i}.{encoding}"
            with open(filename, "wb") as f:
                f.write(storage.to_numpy())
            for key, value in index_dict.items():
                filename = f"out/{key}_{i}.{value['metadata']['encoding']}"
                with open(filename, "wb") as f:
                    f.write(value["value"])
            i += 1
        else:
            # Store the event in the index dictionary
            index_dict[event["id"]] = {
                "type": event["type"],
                "value": event["value"].to_numpy(),
                "metadata": event["metadata"],
            }
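
Two practical notes on this saver, sketched below rather than part of the commit: it writes into out/ without creating the directory first, and its file names follow the out/{input_id}_{index}.{encoding} pattern, so saved camera and depth frames can be paired by index afterwards:

import os
from pathlib import Path

os.makedirs("out", exist_ok=True)  # image_saver.py assumes this directory exists

# Pair saved frames by index via the out/{input_id}_{index}.{encoding} naming.
for depth_file in sorted(Path("out").glob("vggt_depth_*.avif")):
    index = depth_file.stem.rsplit("_", 1)[-1]
    camera_file = Path("out") / f"camera_depth_{index}.avif"
    print(index, depth_file.name, camera_file.exists())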

node-hub/dora-rav1e/Cargo.toml (+1, -1)

@@ -25,7 +25,7 @@ pyo3 = { workspace = true, features = [
  "eyre",
  "generate-import-lib",
], optional = true }
-avif-serialize = "0.8.3"
+avif-serialize = "0.8.4"


[lib]


node-hub/dora-rav1e/src/lib.rs (+23, -2)

@@ -336,7 +336,7 @@ pub fn lib_main() -> Result<()> {
            if let Some(buffer) = data.as_primitive_opt::<UInt16Type>() {
                let mut buffer = buffer.values().to_vec();
                if std::env::var("FILL_ZEROS")
-                    .map(|s| s != "false")
+                    .map(|s| s.to_lowercase() != "false")
                    .unwrap_or(true)
                {
                    fill_zeros_toward_center_y_plane_in_place(&mut buffer, width, height);
@@ -370,7 +370,28 @@ pub fn lib_main() -> Result<()> {
                let data = pkt.data;
                match output_encoding.as_str() {
                    "avif" => {
-                        warn!("avif encoding not supported for mono16");
+                        metadata.parameters.insert(
+                            "encoding".to_string(),
+                            Parameter::String("avif".to_string()),
+                        );
+
+                        let data = avif_serialize::Aviffy::new()
+                            .full_color_range(false)
+                            .set_seq_profile(0)
+                            .set_monochrome(true)
+                            .to_vec(
+                                &data,
+                                None,
+                                enc.width as u32,
+                                enc.height as u32,
+                                enc.bit_depth as u8,
+                            );
+
+                        let arrow = data.into_arrow();
+
+                        node.send_output(id, metadata.parameters.clone(), arrow)
+                            .context("could not send output")
+                            .unwrap();
                    }
                    _ => {
                        metadata.parameters.insert(
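
avif-serialize contributes only the container here: the rav1e packet is raw compressed AV1 data, and Aviffy wraps it in an ISO-BMFF file whose first box is ftyp, normally carrying the avif major brand. A hypothetical spot check on the emitted bytes, from Python:

def looks_like_avif(buf: bytes) -> bool:
    # ISO-BMFF layout: 4-byte box size, then "ftyp", then the major brand.
    return len(buf) >= 12 and buf[4:8] == b"ftyp" and buf[8:12] == b"avif"

with open("out/vggt_depth_0.avif", "rb") as f:  # hypothetical output file
    print(looks_like_avif(f.read(12)))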


node-hub/dora-vggt/dora_vggt/main.py (+19, -11)

@@ -1,6 +1,7 @@
"""TODO: Add docstring."""

import io
+import os
from collections import deque as Deque

import cv2
@@ -17,11 +18,15 @@ from vggt.utils.pose_enc import pose_encoding_to_extri_intri

dtype = torch.bfloat16

+# Check if cuda is available and set the device accordingly
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
-model = VGGT.from_pretrained("facebook/VGGT-1B").to("cuda")
+model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
model.eval()

+DEPTH_ENCODING = os.environ.get("DEPTH_ENCODING", "float64")
# Import vecdeque


@@ -32,7 +37,6 @@ def main():

    for event in node:
        if event["type"] == "INPUT":
-
            if "image" in event["id"]:
                storage = event["value"]
                metadata = event["metadata"]
@@ -80,7 +84,7 @@ def main():
                raw_images.append(buffer)

                with torch.no_grad():
-                    images = load_and_preprocess_images(raw_images).to("cuda")
+                    images = load_and_preprocess_images(raw_images).to(device)

                    images = images[None]  # add batch dimension
                    aggregated_tokens_list, ps_idx = model.aggregator(images)
@@ -107,20 +111,24 @@ def main():
                depth_map = depth_map[-1][-1].cpu().numpy()

                # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
+                if DEPTH_ENCODING == "mono16":
+                    depth_map = (depth_map * 1000).astype(np.uint16)
+
                node.send_output(
                    output_id="depth",
                    data=pa.array(depth_map.ravel()),
                    metadata={
                        "width": depth_map.shape[1],
                        "height": depth_map.shape[0],
-                        "focal": [
-                            int(f_0),
-                            int(f_1),
-                        ],
-                        "resolution": [
-                            int(r_0),
-                            int(r_1),
-                        ],
+                        "encoding": DEPTH_ENCODING,
+                        "focal": [
+                            int(f_0),
+                            int(f_1),
+                        ],
+                        "resolution": [
+                            int(r_0),
+                            int(r_1),
+                        ],
                    },
                )
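
Because the depth output is sent as a flat Arrow array, a downstream node rebuilds the 2-D map from the metadata attached above. A hypothetical consumer sketch, assuming the event shape used elsewhere in this commit:

import numpy as np

def decode_depth(event):
    meta = event["metadata"]
    depth = event["value"].to_numpy()
    if meta["encoding"] == "mono16":
        # undo the millimeter quantization applied before sending
        depth = depth.astype(np.float32) / 1000.0
    return depth.reshape(meta["height"], meta["width"])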


