"""TODO: Add docstring.""" |
|
|
|
|
|
|
|
|
|
|
|
import logging
import os
from collections import OrderedDict
from pathlib import Path

import cv2
import numpy as np
import pyarrow as pa
import torch
from dora import Node
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

current_dir = Path(__file__).parent.absolute()
# Local checkout of the Magma repo, expected as a sibling "Magma" submodule.
magma_dir = current_dir.parent / "Magma" / "magma"


def load_magma_models():
    """Load the Magma model and processor.

    Resolution order: local checkpoint under the Magma submodule, then the
    submodule root, then the microsoft/Magma-8B weights on HuggingFace Hub.
    The location can be overridden via the MODEL_NAME_OR_PATH environment variable.
    """
    DEFAULT_PATH = str(magma_dir.parent / "checkpoints" / "Magma-8B")
    if not os.path.exists(DEFAULT_PATH):
        DEFAULT_PATH = str(magma_dir.parent)
        if not os.path.exists(DEFAULT_PATH):
            logger.warning("Magma submodule not found, falling back to HuggingFace version")
            DEFAULT_PATH = "microsoft/Magma-8B"

    MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
    logger.info(f"Loading Magma model from: {MODEL_NAME_OR_PATH}")

    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME_OR_PATH,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        processor = AutoProcessor.from_pretrained(MODEL_NAME_OR_PATH, trust_remote_code=True)
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise

    return model, processor, MODEL_NAME_OR_PATH


# Loaded once at import time so the model is ready before the first event arrives.
model, processor, MODEL_NAME_OR_PATH = load_magma_models()
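
# Example checkpoint override (hypothetical path and script name):
#   MODEL_NAME_OR_PATH=/models/Magma-8B python magma_node.py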


def generate(image, task_description, template=None, num_marks=10, speed=8, steps=8):
    """Ask Magma how to move the numeric marks in `image` to accomplish `task_description`.

    Returns the decoded model response, or an error string if generation fails.
    """
    if template is None:
        template = (
            "<image>\nThe image is split into 256x256 grids and is labeled with numeric marks {}.\n"
            "The robot is doing: {}. To finish the task, how to move the numerical marks in the image "
            "with speed {} for the next {} steps?\n"
        )

    mark_ids = [i + 1 for i in range(num_marks)]
    conv_user = template.format(mark_ids, task_description, speed, steps)

    # Some Magma configs expect the image token wrapped in explicit start/end markers.
    if hasattr(model.config, "mm_use_image_start_end") and model.config.mm_use_image_start_end:
        conv_user = conv_user.replace("<image>", "<image_start><image><image_end>")
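
    # With the defaults (num_marks=10, speed=8, steps=8) and the hypothetical task
    # "pick up the block", conv_user renders as:
    #   <image>
    #   The image is split into 256x256 grids and is labeled with numeric marks
    #   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10].
    #   The robot is doing: pick up the block. To finish the task, how to move the
    #   numerical marks in the image with speed 8 for the next 8 steps?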

    convs = [
        {"role": "system", "content": "You are an agent that can see, talk, and act."},
        {"role": "user", "content": conv_user},
    ]

    prompt = processor.tokenizer.apply_chat_template(
        convs,
        tokenize=False,
        add_generation_prompt=True,
    )

    try:
        inputs = processor(images=image, texts=prompt, return_tensors="pt")
        # Add the batch dimension the model expects.
        inputs["pixel_values"] = inputs["pixel_values"].unsqueeze(0)
        inputs["image_sizes"] = inputs["image_sizes"].unsqueeze(0)
        inputs = inputs.to(model.device)

        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                temperature=0.3,
                do_sample=True,
                num_beams=1,
                max_new_tokens=1024,
                use_cache=True,
            )
        response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
        return response
    except Exception as e:
        logger.error(f"Error in generate: {e}")
        return f"Error: {e}"


def main():
    """Run the dora event loop: cache incoming frames and answer text queries against them."""
    node = Node()
    # OrderedDict so the oldest cached frame can be evicted via popitem(last=False).
    frames = OrderedDict()

    for event in node:
        event_type = event["type"]

        if event_type == "INPUT":
            event_id = event["id"]

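            # Image events are expected to carry encoding, width, and height metadata;
            # both raw (bgr8/rgb8) and compressed (jpeg, png, ...) payloads are handled.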
if "image" in event_id: |
|
|
|
|
|
storage = event["value"] |
|
|
|
|
|
metadata = event["metadata"] |
|
|
|
|
|
encoding = metadata["encoding"] |
|
|
|
|
|
width = metadata["width"] |
|
|
|
|
|
height = metadata["height"] |
|
|
|
|
|
|
|
|
|
|
|
                try:
                    if encoding == "bgr8":
                        frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
                        frame = frame[:, :, ::-1]  # Convert BGR to RGB
                    elif encoding == "rgb8":
                        frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
                    elif encoding in ["jpeg", "jpg", "jpe", "bmp", "webp", "png"]:
                        storage = storage.to_numpy()
                        frame = cv2.imdecode(storage, cv2.IMREAD_COLOR)
                        if frame is None:
                            raise ValueError(f"Failed to decode image with encoding {encoding}")
                        frame = frame[:, :, ::-1]  # cv2 decodes to BGR; convert to RGB
                    else:
                        raise ValueError(f"Unsupported image encoding: {encoding}")

                    image = Image.fromarray(frame)
                    frames[event_id] = image

                    # Cleanup: evict the oldest frame once more than 10 are cached.
                    if len(frames) > 10:
                        frames.popitem(last=False)
                except Exception as e:
                    logger.error(f"Error processing image {event_id}: {e}")

            # Handle text inputs: look up the referenced frame and query the model.
            elif "text" in event_id:
                if len(event["value"]) > 0:
                    task_description = event["value"][0].as_py()
                    image_id = event["metadata"].get("image_id", None)

                    if image_id is None or image_id not in frames:
                        logger.error(f"Image ID {image_id} not found in frames")
                        continue

                    image = frames[image_id]
                    response = generate(image, task_description)
                    node.send_output(
                        "text",
                        pa.array([response]),
                        {"image_id": image_id},
                    )
                else:
                    continue

        elif event_type == "ERROR":
            logger.error(f"Event Error: {event['error']}")


if __name__ == "__main__":
    main()
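
# A minimal dora dataflow entry for wiring this node might look like the sketch
# below (node ids, input sources, and the script filename are assumptions;
# adjust them to your graph):
#
#   - id: magma
#     path: ./magma_node.py
#     inputs:
#       image: camera/image
#       text: prompt/text
#     outputs:
#       - text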