Browse Source

feat: updated dora-magma node

tags/v0.3.11-rc1
Munish Mummadi 10 months ago
parent
commit
65dfbb302a
8 changed files with 1521 additions and 0 deletions
  1. +3
    -0
      .gitmodules
  2. +7
    -0
      node-hub/dora-magma/README.md
  3. +1
    -0
      node-hub/dora-magma/dora_magma/Magma
  4. +19
    -0
      node-hub/dora-magma/dora_magma/__init__.py
  5. +161
    -0
      node-hub/dora-magma/dora_magma/main.py
  6. +46
    -0
      node-hub/dora-magma/pyproject.toml
  7. +11
    -0
      node-hub/dora-magma/tests/test_magma_node.py
  8. +1273
    -0
      node-hub/dora-magma/uv.lock

+ 3
- 0
.gitmodules View File

@@ -1,3 +1,6 @@
[submodule "node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer"] [submodule "node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer"]
path = node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer path = node-hub/dora-rdt-1b/dora_rdt_1b/RoboticsDiffusionTransformer
url = https://github.com/thu-ml/RoboticsDiffusionTransformer url = https://github.com/thu-ml/RoboticsDiffusionTransformer
[submodule "node-hub/dora-magma/dora_magma/Magma"]
path = node-hub/dora-magma/dora_magma/Magma
url = https://github.com/microsoft/Magma

+ 7
- 0
node-hub/dora-magma/README.md View File

@@ -0,0 +1,7 @@
# Dora Magma Node

This Dora node integrates Microsoft's Magma model for multimodal trajectory planning and visualization within Dora.


# Additional Documentation
- Magma: https://github.com/microsoft/Magma

+ 1
- 0
node-hub/dora-magma/dora_magma/Magma

@@ -0,0 +1 @@
Subproject commit adc6d8272e57ed9666e1f70584425e72525185fa

+ 19
- 0
node-hub/dora-magma/dora_magma/__init__.py View File

@@ -0,0 +1,19 @@
"""TODO: Add docstring."""

import os
import sys
from pathlib import Path

# README.md is expected one directory above this package directory.
readme_path = str(Path(__file__).parent.parent / "README.md")

# Use the README text as the package docstring, with a fixed placeholder
# when the file is not present.
try:
    __doc__ = Path(readme_path).read_text(encoding="utf-8")
except FileNotFoundError:
    __doc__ = "README file not found."


# Put the bundled Magma checkout at the front of sys.path
# (presumably so its modules can be imported by name -- confirm against usage).
submodule_path = Path(__file__).resolve().parent / "Magma"
sys.path.insert(0, str(submodule_path))

+ 161
- 0
node-hub/dora-magma/dora_magma/main.py View File

@@ -0,0 +1,161 @@
"""TODO: Add docstring."""

import os
from pathlib import Path
import cv2
import numpy as np
import pyarrow as pa
import torch
from dora import Node
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
import logging

# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Absolute directory of this file, and the path where the `magma` package of
# the Magma checkout is looked for.
# NOTE(review): `current_dir.parent` is the directory *above* the package, so
# this points at a `Magma` directory beside `dora_magma/`, whereas .gitmodules
# registers the submodule at `dora_magma/Magma` -- confirm which is intended.
current_dir = Path(__file__).parent.absolute()
magma_dir = current_dir.parent / "Magma" / "magma"

def load_magma_models():
    """Load the Magma model and its processor.

    The default source is chosen in this order: a local
    ``checkpoints/Magma-8B`` directory next to ``magma_dir``, then the parent
    directory of ``magma_dir``, and finally the ``microsoft/Magma-8B``
    identifier. The ``MODEL_NAME_OR_PATH`` environment variable, when set,
    overrides whichever default was chosen.

    Returns:
        A ``(model, processor, name_or_path)`` tuple, where ``name_or_path``
        is the string that was passed to ``from_pretrained``.

    Raises:
        Exception: any error raised while loading is logged and re-raised.

    """
    magma_root = magma_dir.parent
    local_checkpoint = str(magma_root / "checkpoints" / "Magma-8B")

    if os.path.exists(local_checkpoint):
        DEFAULT_PATH = local_checkpoint
    elif os.path.exists(str(magma_root)):
        DEFAULT_PATH = str(magma_root)
    else:
        logger.warning("Warning: Magma submodule not found, falling back to HuggingFace version")
        DEFAULT_PATH = "microsoft/Magma-8B"

    # An explicit environment setting wins over the discovered default.
    source = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
    logger.info(f"Loading Magma model from: {source}")

    try:
        loaded_model = AutoModelForCausalLM.from_pretrained(
            source,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        loaded_processor = AutoProcessor.from_pretrained(source, trust_remote_code=True)
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise

    return loaded_model, loaded_processor, source

# Load the model and processor once at import time; `generate` reads these
# module-level globals.
model, processor, MODEL_NAME_OR_PATH = load_magma_models()

def generate(image, task_description, template=None, num_marks=10, speed=8, steps=8):
    """Ask the Magma model how to move the numbered marks for a given task.

    Args:
        image: Image passed to the processor as ``images``.
        task_description: Text inserted into the prompt as the robot's task.
        template: Optional ``str.format`` template taking four positional
            fields (mark ids, task, speed, steps). A built-in prompt is used
            when this is ``None``.
        num_marks: Number of marks; ids ``1..num_marks`` are listed in the prompt.
        speed: Speed value inserted into the prompt.
        steps: Number of steps inserted into the prompt.

    Returns:
        The first decoded sequence from the model, or the string
        ``"Error: <exception>"`` if preprocessing, generation or decoding fails
        (the failure is also logged).

    """
    if template is None:
        template = (
            "<image>\nThe image is split into 256x256 grids and is labeled with numeric marks {}.\n"
            "The robot is doing: {}. To finish the task, how to move the numerical marks in the image "
            "with speed {} for the next {} steps?\n"
        )

    marks = list(range(1, num_marks + 1))
    user_message = template.format(marks, task_description, speed, steps)

    # Some model configs expect the image placeholder wrapped in start/end tags.
    if getattr(model.config, 'mm_use_image_start_end', False):
        user_message = user_message.replace("<image>", "<image_start><image><image_end>")

    conversation = [
        {"role": "system", "content": "You are an agent that can see, talk, and act."},
        {"role": "user", "content": user_message},
    ]
    prompt = processor.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    generation_kwargs = {
        "temperature": 0.3,
        "do_sample": True,
        "num_beams": 1,
        "max_new_tokens": 1024,
        "use_cache": True,
    }

    try:
        inputs = processor(images=image, texts=prompt, return_tensors="pt")
        # Add a leading dimension to the image tensors
        # (presumably the batch axis the model expects -- TODO confirm).
        for key in ('pixel_values', 'image_sizes'):
            inputs[key] = inputs[key].unsqueeze(0)
        inputs = inputs.to(model.device)

        with torch.inference_mode():
            output_ids = model.generate(**inputs, **generation_kwargs)

        # NOTE(review): all of output_ids is decoded; whether that includes the
        # prompt tokens depends on the model's generate() -- confirm.
        return processor.batch_decode(output_ids, skip_special_tokens=True)[0]
    except Exception as e:
        logger.error(f"Error in generate: {e}")
        return f"Error: {e}"

def main():
    """Run the node loop: cache incoming images and answer text prompts.

    For every ``INPUT`` event whose id contains ``"image"``, the payload is
    decoded to an RGB PIL image (raw ``bgr8``/``rgb8`` buffers or an encoded
    image readable by ``cv2.imdecode``) and cached under the event id. At most
    ``max_frames`` images are kept; the least recently updated id is evicted
    first.

    For every ``INPUT`` event whose id contains ``"text"``, the first element
    of the value is used as the task description, and the ``image_id`` metadata
    entry selects the cached image. The result of ``generate`` is sent on the
    ``"text"`` output together with the ``image_id``.

    ``ERROR`` events are logged. Failures while decoding an image are logged
    and the event is skipped.
    """
    node = Node()
    # Decoded images keyed by input id. Plain dicts preserve insertion order,
    # so the first key is always the oldest entry.
    frames = {}
    max_frames = 10
    for event in node:
        event_type = event["type"]
        if event_type == "INPUT":
            event_id = event["id"]
            if "image" in event_id:
                storage = event["value"]
                metadata = event["metadata"]
                encoding = metadata["encoding"]
                width = metadata["width"]
                height = metadata["height"]
                try:
                    if encoding == "bgr8":
                        frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
                        frame = frame[:, :, ::-1]  # Convert BGR to RGB
                    elif encoding == "rgb8":
                        frame = storage.to_numpy().astype(np.uint8).reshape((height, width, 3))
                    elif encoding in ["jpeg", "jpg", "jpe", "bmp", "webp", "png"]:
                        storage = storage.to_numpy()
                        frame = cv2.imdecode(storage, cv2.IMREAD_COLOR)
                        if frame is None:
                            raise ValueError(f"Failed to decode image with encoding {encoding}")
                        frame = frame[:, :, ::-1]  # Convert BGR to RGB
                    else:
                        raise ValueError(f"Unsupported image encoding: {encoding}")
                    image = Image.fromarray(frame)
                    # Remove any previous entry first so a refreshed id moves to
                    # the end of the dict and is not treated as the oldest.
                    frames.pop(event_id, None)
                    frames[event_id] = image
                    # Cleanup old frames. dict.popitem() accepts no arguments
                    # (``last=False`` is OrderedDict-only), so evict the oldest
                    # key explicitly via insertion order.
                    while len(frames) > max_frames:
                        del frames[next(iter(frames))]
                except Exception as e:
                    logger.error(f"Error processing image {event_id}: {e}")
            # Handle text inputs
            elif "text" in event_id:
                if len(event["value"]) > 0:
                    task_description = event["value"][0].as_py()
                    image_id = event["metadata"].get("image_id", None)
                    if image_id is None or image_id not in frames:
                        logger.error(f"Image ID {image_id} not found in frames")
                        continue
                    image = frames[image_id]
                    response = generate(image, task_description)
                    node.send_output(
                        "text",
                        pa.array([response]),
                        {"image_id": image_id}
                    )
                else:
                    # Empty text payload: nothing to ask the model.
                    continue
        elif event_type == "ERROR":
            logger.error(f"Event Error: {event['error']}")

if __name__ == "__main__":
main()

+ 46
- 0
node-hub/dora-magma/pyproject.toml View File

@@ -0,0 +1,46 @@
[build-system]
requires = ["poetry-core>=1.8.0"]
build-backend = "poetry.core.masonry.api"

[project]
name = "dora-magma"
version = "0.1.0"
description = "Dora node for Microsoft Magma model"
requires-python = ">=3.10"
license = {text = "MIT"}
readme = "README.md"
authors = [
{name = "Munish Mummadi", email = "moneymindedmunish1@gmail.com"}
]
dependencies = [
"dora-rs >= 0.3.9",
"numpy >= 2.2.3",
"torch >= 2.4.0",
"torchvision >= 0.19",
"transformers >= 4.45",
"opencv-python >= 4.1.1",
"accelerate>=1.5.1",
"psutil>=7.0.0",
"open-clip-torch>=2.31.0",
]

[dependency-groups]
dev = [
"pytest>=8.1.1",
"ruff>=0.9.1",
"pytest-cov>=4.0.0",
]

[project.scripts]
dora-magma = "dora_magma.main:main"

[tool.ruff]
exclude = ["dora_magma/Magma"]

[tool.black]
# Black's option is named `extend-exclude`; the dotted key `extend.exclude`
# defines a nested `extend` table instead, so the exclusion was not applied.
extend-exclude = "dora_magma/Magma"

[tool.ruff.lint]
extend-select = [
"D", # pydocstyle
]

+ 11
- 0
node-hub/dora-magma/tests/test_magma_node.py View File

@@ -0,0 +1,11 @@
"""TODO: Add docstring."""

import pytest


def test_import_main():
    """Check that the node module imports and that ``main`` fails outside a dataflow.

    Importing ``dora_magma.main`` is part of the check. Calling ``main`` is
    expected to raise ``RuntimeError`` because the test is not run inside a
    dora dataflow.
    """
    import dora_magma.main as magma_main

    # Not running in a dora dataflow, so starting the node must raise.
    with pytest.raises(RuntimeError):
        magma_main.main()

+ 1273
- 0
node-hub/dora-magma/uv.lock
File diff suppressed because it is too large
View File


Loading…
Cancel
Save