
I have added the Phi-4 multimodal model node inside the node-hub directory.

#814
7SOMAY haixuantao 10 months ago
parent commit 4aa047ed80
6 changed files with 187 additions and 0 deletions
1. node-hub/dora-phi4/README.md (+40, -0)
2. node-hub/dora-phi4/dora_phi4/__init__.py (+11, -0)
3. node-hub/dora-phi4/dora_phi4/__main__.py (+5, -0)
4. node-hub/dora-phi4/dora_phi4/main.py (+90, -0)
5. node-hub/dora-phi4/pyproject.toml (+32, -0)
6. node-hub/dora-phi4/tests/test_dora_phi4.py (+9, -0)

node-hub/dora-phi4/README.md (+40, -0)

@@ -0,0 +1,40 @@
# dora-phi4

## Getting started

- Install it with uv:

```bash
uv venv -p 3.11 --seed
uv pip install -e .
```
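
After installing, the node can be launched through the `dora-phi4` console script declared in `pyproject.toml`, or as a module via `__main__.py`. Note that `main()` expects a dora dataflow context, so a standalone launch will raise a RuntimeError (this is what the test suite checks):

```bash
# Run via the console script from pyproject.toml...
dora-phi4
# ...or equivalently as a Python module
python -m dora_phi4
```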

## Contribution Guide

- Format with [ruff](https://docs.astral.sh/ruff/):

```bash
uv pip install ruff
uv run ruff check . --fix
```

- Lint with ruff:

```bash
uv run ruff check .
```

- Test with [pytest](https://github.com/pytest-dev/pytest):

```bash
uv pip install pytest
uv run pytest . # Test
```

## YAML Specification
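
This section was left empty in the commit. From `main.py`, the node consumes `image_input` and `audio_input` (each a pyarrow array holding a URL string) and emits `image_output` and `audio_output`. A minimal sketch of a dataflow entry follows; the upstream node names are assumptions for illustration:

```yaml
nodes:
  - id: dora-phi4
    path: dora-phi4
    inputs:
      # upstream node names are hypothetical
      image_input: image_source/image_url
      audio_input: audio_source/audio_url
    outputs:
      - image_output   # model's description of the image
      - audio_output   # transcript + French translation
```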

## Examples
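
A hedged sketch of a companion node that feeds `dora-phi4` an image URL and prints back the description. The node IDs, input mapping, and URL are illustrative assumptions, not part of this commit:

```python
from dora import Node
import pyarrow as pa

node = Node()  # assumed to be wired to dora-phi4 in the dataflow YAML

# Send an image URL (hypothetical) for dora-phi4 to describe.
node.send_output(output_id="image_url", data=pa.array(["https://example.com/photo.jpg"]))

# Wait for the description produced by dora-phi4's `image_output`.
for event in node:
    if event["type"] == "INPUT" and event["id"] == "image_description":
        print(event["value"][0].as_py())
        break
```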

## License

dora-phi4's code is released under the MIT License.

node-hub/dora-phi4/dora_phi4/__init__.py (+11, -0)

@@ -0,0 +1,11 @@
import os

# Define the path to the README file relative to the package directory
readme_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "README.md")

# Read the content of the README file
try:
    with open(readme_path, "r", encoding="utf-8") as f:
        __doc__ = f.read()
except FileNotFoundError:
    __doc__ = "README file not found."

node-hub/dora-phi4/dora_phi4/__main__.py (+5, -0)

@@ -0,0 +1,5 @@
from .main import main


if __name__ == "__main__":
    main()

node-hub/dora-phi4/dora_phi4/main.py (+90, -0)

@@ -0,0 +1,90 @@
from dora import Node
import pyarrow as pa
import requests
import torch
import io
from PIL import Image
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from urllib.request import urlopen

# Load the model and processor
MODEL_PATH = "microsoft/Phi-4-multimodal-instruct"

processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="cuda",  # device_map already places the model on the GPU, so no extra .cuda() call is needed
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="flash_attention_2",
)

generation_config = GenerationConfig.from_pretrained(MODEL_PATH)

# Define prompt structure
USER_PROMPT = "<|user|>"
ASSISTANT_PROMPT = "<|assistant|>"
PROMPT_SUFFIX = "<|end|>"


def process_image(image_url):
    """Processes an image through the model and returns the response."""
    prompt = f"{USER_PROMPT}<|image_1|>What is shown in this image?{PROMPT_SUFFIX}{ASSISTANT_PROMPT}"

    # Download and open image
    image = Image.open(requests.get(image_url, stream=True).raw)

    # Process input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0")

    # Generate response, keeping only the newly generated tokens
    generate_ids = model.generate(**inputs, max_new_tokens=1000, generation_config=generation_config)
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return response


def process_audio(audio_url):
    """Processes an audio file through the model and returns the transcript + translation."""
    speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator."
    prompt = f"{USER_PROMPT}<|audio_1|>{speech_prompt}{PROMPT_SUFFIX}{ASSISTANT_PROMPT}"

    # Download and read audio file
    audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))

    # Process input
    inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to("cuda:0")

    # Generate response, keeping only the newly generated tokens
    generate_ids = model.generate(**inputs, max_new_tokens=1000, generation_config=generation_config)
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return response


def main():
    node = Node()

    for event in node:
        if event["type"] == "INPUT":
            input_id = event["id"]
            value = event["value"]

            print(f"Received event: {input_id}, value: {value}")

            # Image URL: event values arrive as pyarrow arrays, so take the first element
            if input_id == "image_input":
                image_response = process_image(value[0].as_py())
                node.send_output(output_id="image_output", data=pa.array([image_response]))

            # Audio URL
            elif input_id == "audio_input":
                audio_response = process_audio(value[0].as_py())
                node.send_output(output_id="audio_output", data=pa.array([audio_response]))


if __name__ == "__main__":
    main()

node-hub/dora-phi4/pyproject.toml (+32, -0)

@@ -0,0 +1,32 @@
[project]
name = "dora-phi4"
version = "0.0.0"
authors = [{ name = "Somay", email = "ssomay2002@gmail.com" }]
description = "DORA node for Phi-4 multimodal model"
license = { text = "MIT" }
readme = "README.md"
requires-python = ">=3.8"

dependencies = [
    "dora-rs >=0.3.9",
    "torch==2.6.0",
    "torchvision==0.21.0",
    "flash_attn==2.7.4.post1",
    "transformers==4.48.2",
    "accelerate==1.3.0",
    "soundfile==0.13.1",
    "pillow==11.1.0",
    "scipy==1.15.2",
    "backoff==2.2.1",
    "peft==0.13.2",
    "requests",
]

[tool.setuptools]
packages = ["dora_phi4"]

[dependency-groups]
dev = ["pytest >=8.1.1", "ruff >=0.9.1"]

[project.scripts]
dora-phi4 = "dora_phi4.main:main"

node-hub/dora-phi4/tests/test_dora_phi4.py (+9, -0)

@@ -0,0 +1,9 @@
import pytest


def test_import_main():
    from dora_phi4.main import main

    # Check that everything imports correctly, and catch the dora RuntimeError raised because we're not running inside a dora dataflow.
    with pytest.raises(RuntimeError):
        main()
