@@ -1165,9 +1165,9 @@ dependencies = [

 [[package]]
 name = "avif-serialize"
-version = "0.8.3"
+version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98922d6a4cfbcb08820c69d8eeccc05bb1f29bfa06b4f5b1dbfe9a868bd7608e"
+checksum = "19135c0c7a60bfee564dbe44ab5ce0557c6bf3884e5291a50be76a15640c4fbd"
 dependencies = [
  "arrayvec",
 ]
@@ -0,0 +1,54 @@
+nodes:
+  - id: camera
+    build: pip install opencv-video-capture
+    path: opencv-video-capture
+    inputs:
+      tick: dora/timer/millis/100
+    outputs:
+      - image
+    env:
+      CAPTURE_PATH: 1
+
+  - id: dora-vggt
+    build: pip install -e ../../node-hub/dora-vggt
+    path: dora-vggt
+    inputs:
+      image: camera/image
+    outputs:
+      - depth
+      - image
+    env:
+      DEPTH_ENCODING: mono16
+
+  - id: rav1e-depth
+    path: dora-rav1e
+    build: cargo build -p dora-rav1e --release
+    inputs:
+      depth: dora-vggt/depth
+    outputs:
+      - depth
+    env:
+      ENCODING: avif
+
+  - id: rav1e-image
+    path: dora-rav1e
+    build: cargo build -p dora-rav1e --release
+    inputs:
+      image: dora-vggt/image
+    outputs:
+      - image
+    env:
+      ENCODING: avif
+
+  - id: bench
+    path: image_saver.py
+    inputs:
+      camera_depth: rav1e-image/image
+      vggt_depth: rav1e-depth/depth
+
+  - id: plot
+    build: pip install dora-rerun
+    path: dora-rerun
+    inputs:
+      camera/image: dora-vggt/image
+      camera/depth: dora-vggt/depth
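
Note (editorial, not part of the diff): each rav1e output above carries a complete serialized AVIF file per frame, not a raw pixel buffer. A minimal sketch of a downstream consumer, using the same dora Python API as image_saver.py below; the assumption that the encoder nodes forward the upstream width/height parameters is mine, hence the defensive .get() lookups:

from dora import Node

node = Node()
for event in node:
    if event["type"] == "INPUT":
        meta = event["metadata"]
        if meta.get("encoding") == "avif":
            # The payload is a serialized AVIF file, not raw pixels.
            avif_bytes = event["value"].to_numpy().tobytes()
            print(event["id"], len(avif_bytes), "bytes,",
                  meta.get("width"), "x", meta.get("height"))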
@@ -1,8 +0,0 @@
-build_id: 2b402c1e-e52e-45e9-86e5-236b33a77369
-session_id: 275de19c-e605-4865-bc5f-2f15916bade9
-git_sources: {}
-local_build:
-  node_working_dirs:
-    camera: /Users/xaviertao/Documents/work/dora/examples/vggt
-    dora-vggt: /Users/xaviertao/Documents/work/dora/examples/vggt
-    plot: /Users/xaviertao/Documents/work/dora/examples/vggt
@@ -0,0 +1,36 @@
+"""Save each aligned set of encoded frames to the out/ directory."""
+import os
+
+from dora import Node
+
+node = Node()
+
+index_dict = {}
+i = 0
+LEAD_TOPIC = "vggt_depth"
+
+os.makedirs("out", exist_ok=True)  # ensure the output directory exists
+
+for event in node:
+    if event["type"] == "INPUT":
+        if LEAD_TOPIC in event["id"]:
+            storage = event["value"]
+            metadata = event["metadata"]
+            encoding = metadata["encoding"]
+            # Save the lead frame to file.
+            filename = f"out/{event['id']}_{i}.{encoding}"
+            with open(filename, "wb") as f:
+                f.write(storage.to_numpy().tobytes())
+            # Flush the latest buffered frame of every other input alongside it.
+            for key, value in index_dict.items():
+                filename = f"out/{key}_{i}.{value['metadata']['encoding']}"
+                with open(filename, "wb") as f:
+                    f.write(value["value"].tobytes())
+            i += 1
+        else:
+            # Buffer the latest value of each non-lead input.
+            index_dict[event["id"]] = {
+                "type": event["type"],
+                "value": event["value"].to_numpy(),
+                "metadata": event["metadata"],
+            }
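
image_saver.py implements a latest-value join: every non-lead input is buffered, and each arrival of the lead topic (vggt_depth) flushes one aligned snapshot to disk. The same pattern factored into a reusable generator; the helper name is illustrative, not part of this PR:

def latest_value_join(events, lead_id):
    """Yield (lead_event, snapshot) each time the lead topic fires."""
    latest = {}
    for event in events:
        if event["type"] != "INPUT":
            continue
        if lead_id in event["id"]:
            yield event, dict(latest)  # shallow copy of the buffered inputs
        else:
            latest[event["id"]] = event  # keep only the newest frame per input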
@@ -25,7 +25,7 @@ pyo3 = { workspace = true, features = [
     "eyre",
     "generate-import-lib",
 ], optional = true }

-avif-serialize = "0.8.3"
+avif-serialize = "0.8.4"

 [lib]
@@ -336,7 +336,7 @@ pub fn lib_main() -> Result<()> {
             if let Some(buffer) = data.as_primitive_opt::<UInt16Type>() {
                 let mut buffer = buffer.values().to_vec();
                 if std::env::var("FILL_ZEROS")
-                    .map(|s| s != "false")
+                    .map(|s| s.to_lowercase() != "false")
                     .unwrap_or(true)
                 {
                     fill_zeros_toward_center_y_plane_in_place(&mut buffer, width, height);
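
The Rust helper fill_zeros_toward_center_y_plane_in_place is not shown in this diff, so its exact behavior is unknown here. Going only by the name, a rough Python guess at the idea (zero depth samples replaced by carrying the nearest valid row value toward the vertical center) might look like the sketch below; treat it as an illustration of the concept, not the actual implementation:

import numpy as np

def fill_zeros_toward_center_y(depth):
    # Hypothetical semantics inferred from the function name only.
    out = depth.copy()
    h = out.shape[0]
    center = h // 2
    for y in range(1, center):              # top half: carry values downward
        mask = out[y] == 0
        out[y][mask] = out[y - 1][mask]
    for y in range(h - 2, center - 1, -1):  # bottom half: carry values upward
        mask = out[y] == 0
        out[y][mask] = out[y + 1][mask]
    return out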
@@ -370,7 +370,28 @@ pub fn lib_main() -> Result<()> {
                     let data = pkt.data;
                     match output_encoding.as_str() {
                         "avif" => {
-                            warn!("avif encoding not supported for mono16");
+                            metadata.parameters.insert(
+                                "encoding".to_string(),
+                                Parameter::String("avif".to_string()),
+                            );
+                            // Wrap the encoded AV1 payload in a monochrome AVIF container.
+                            let data = avif_serialize::Aviffy::new()
+                                .full_color_range(false)
+                                .set_seq_profile(0)
+                                .set_monochrome(true)
+                                .to_vec(
+                                    &data,
+                                    None,
+                                    enc.width as u32,
+                                    enc.height as u32,
+                                    enc.bit_depth as u8,
+                                );
+                            let arrow = data.into_arrow();
+                            node.send_output(id, metadata.parameters.clone(), arrow)
+                                .context("could not send output")
+                                .unwrap();
                         }
                         _ => {
                             metadata.parameters.insert(
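
With this change the rav1e node wraps mono16 AV1 payloads in an AVIF container instead of warning. AVIF is ISOBMFF underneath, so buffers saved by the bench node can be sanity-checked from Python without a decoder by inspecting the ftyp box and its major brand (avif for stills, avis for sequences); the filename below just follows image_saver.py's naming pattern:

def looks_like_avif(buf: bytes) -> bool:
    # ISOBMFF layout: 4-byte box size, box type "ftyp", then the major brand.
    return len(buf) >= 12 and buf[4:8] == b"ftyp" and buf[8:12] in (b"avif", b"avis")

with open("out/vggt_depth_0.avif", "rb") as f:
    print(looks_like_avif(f.read()))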
@@ -1,6 +1,7 @@
 """TODO: Add docstring."""

 import io
+import os
 from collections import deque as Deque

 import cv2
@@ -17,11 +18,15 @@ from vggt.utils.pose_enc import pose_encoding_to_extri_intri
 dtype = torch.bfloat16

 # Check if cuda is available and set the device accordingly
 device = "cuda" if torch.cuda.is_available() else "cpu"

-model = VGGT.from_pretrained("facebook/VGGT-1B").to("cuda")
+# Initialize the model and load the pretrained weights.
+# This will automatically download the model weights the first time it's run, which may take a while.
+model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
 model.eval()

+DEPTH_ENCODING = os.environ.get("DEPTH_ENCODING", "float64")
+
 # Import vecdeque
@@ -32,7 +37,6 @@ def main():
     for event in node:
         if event["type"] == "INPUT":
             if "image" in event["id"]:
                 storage = event["value"]
                 metadata = event["metadata"]
@@ -80,7 +84,7 @@
                     raw_images.append(buffer)

                 with torch.no_grad():
-                    images = load_and_preprocess_images(raw_images).to("cuda")
+                    images = load_and_preprocess_images(raw_images).to(device)
                     images = images[None]  # add batch dimension

                     aggregated_tokens_list, ps_idx = model.aggregator(images)
@@ -107,20 +111,24 @@
                 depth_map = depth_map[-1][-1].cpu().numpy()

                 # Warning: Make sure to add my_output_id and my_input_id within the dataflow.
+                if DEPTH_ENCODING == "mono16":
+                    # Quantize depth from meters to millimeters in a 16-bit plane.
+                    depth_map = (depth_map * 1000).astype(np.uint16)
                 node.send_output(
                     output_id="depth",
                     data=pa.array(depth_map.ravel()),
                     metadata={
                         "width": depth_map.shape[1],
                         "height": depth_map.shape[0],
-                        "focal": [
-                            int(f_0),
-                            int(f_1),
-                        ],
-                        "resolution": [
-                            int(r_0),
-                            int(r_1),
-                        ],
+                        "encoding": DEPTH_ENCODING,
+                        "focal": [
+                            int(f_0),
+                            int(f_1),
+                        ],
+                        "resolution": [
+                            int(r_0),
+                            int(r_1),
+                        ],
                     },
                 )
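
The mono16 path stores depth as millimeters in a uint16 plane, so precision is 1 mm and the representable range tops out at 65.535 m; depths beyond that overflow the plain .astype(np.uint16) cast above rather than saturate. A quick numpy round-trip makes the trade-off concrete:

import numpy as np

depth_m = np.array([[0.0125, 1.5, 65.0]])  # meters, as produced by the model
mm = (depth_m * 1000).astype(np.uint16)    # same quantization as above
restored = mm.astype(np.float64) / 1000.0
print(mm)        # [[   12  1500 65000]]
print(restored)  # [[ 0.012  1.5   65.   ]] -- millimeter precision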