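"""Dora node wrapping microsoft/Phi-4-multimodal-instruct.

Listens for image and audio URLs on its inputs, runs the model, and sends the
generated text (an image description, or a transcript plus French translation)
on its outputs.
"""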

import io
from urllib.request import urlopen

import pyarrow as pa
import requests
import soundfile as sf
import torch  # not referenced directly, but the model runs on CUDA tensors
from dora import Node
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Load the model and processor
MODEL_PATH = "microsoft/Phi-4-multimodal-instruct"

processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    # flash_attention_2 requires the flash-attn package and a supported GPU;
    # pass "eager" here instead if flash-attn is not installed
    _attn_implementation="flash_attention_2",
).cuda()

generation_config = GenerationConfig.from_pretrained(MODEL_PATH)

# Define prompt structure
USER_PROMPT = "<|user|>"
ASSISTANT_PROMPT = "<|assistant|>"
PROMPT_SUFFIX = "<|end|>"
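
# An assembled single-turn prompt therefore looks like (image case):
#   <|user|><|image_1|>What is shown in this image?<|end|><|assistant|>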


def process_image(image_url):
    """Process an image through the model and return the response."""
    prompt = f"{USER_PROMPT}<|image_1|>What is shown in this image?{PROMPT_SUFFIX}{ASSISTANT_PROMPT}"

    # Download and open the image
    image = Image.open(requests.get(image_url, stream=True).raw)

    # Process input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0")

    # Generate response
    generate_ids = model.generate(**inputs, max_new_tokens=1000, generation_config=generation_config)
    # Keep only the newly generated tokens, dropping the echoed prompt
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]

    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return response
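
# Example (hypothetical URL):
#   caption = process_image("https://example.com/photo.jpg")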


def process_audio(audio_url):
    """Process an audio file through the model and return the transcript plus a French translation."""
    # The reply should come back as "<transcript> <sep> <translation>", per this instruction
    speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator."
    prompt = f"{USER_PROMPT}<|audio_1|>{speech_prompt}{PROMPT_SUFFIX}{ASSISTANT_PROMPT}"

    # Download and read the audio file
    audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))

    # Process input
    inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to("cuda:0")

    # Generate response
    generate_ids = model.generate(**inputs, max_new_tokens=1000, generation_config=generation_config)
    # Keep only the newly generated tokens, dropping the echoed prompt
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]

    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return response


def main():
    node = Node()

    for event in node:
        if event["type"] == "INPUT":
            input_id = event["id"]
            value = event["value"]

            print(f"Received event: {input_id}, value: {value}")

            # Image URL input: dora delivers values as PyArrow arrays, so take
            # the first element to get the URL string (assumes the sender packs
            # a single URL per message)
            if input_id == "image_input":
                image_response = process_image(value[0].as_py())
                node.send_output(output_id="image_output", data=pa.array([image_response]))

            # Audio URL input
            elif input_id == "audio_input":
                audio_response = process_audio(value[0].as_py())
                node.send_output(output_id="audio_output", data=pa.array([audio_response]))


if __name__ == "__main__":
    main()
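
# A minimal dora dataflow entry that could run this node (an illustrative
# sketch; the node id, script filename, and upstream "source" node are
# assumptions, not part of this file):
#
#   nodes:
#     - id: phi4
#       path: phi4_node.py
#       inputs:
#         image_input: source/image_url
#         audio_input: source/audio_url
#       outputs:
#         - image_output
#         - audio_output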