@@ -1,13 +1,7 @@
-import io
-from urllib.request import urlopen
 
 import pyarrow as pa
-import requests
-import soundfile as sf
 import torch
-from accelerate import infer_auto_device_map
 from dora import Node
-from PIL import Image
 from transformers import (
     AutoModelForCausalLM,
     AutoProcessor,
@@ -18,9 +12,10 @@ from transformers import (
 if torch.cuda.is_available():
     device = "cuda"
     torch_dtype = torch.float16  # Use float16 for efficiency
-elif torch.backends.mps.is_available():
-    device = "mps"
-    torch_dtype = torch.float16  # Reduce memory usage for MPS
+# TODO: Uncomment this once phi4 support mps backend.
+# elif torch.backends.mps.is_available():
+#     device = "mps"
+#     torch_dtype = torch.float16  # Reduce memory usage for MPS
 else:
     device = "cpu"
     torch_dtype = torch.bfloat16  # CPU uses bfloat16 for efficiency
@@ -29,7 +24,9 @@ else:
 # Load the model and processor
 MODEL_PATH = "microsoft/Phi-4-multimodal-instruct"
 
-processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(
+    MODEL_PATH, trust_remote_code=True, use_fast=True
+)
 
 # Define model config
 MODEL_CONFIG = {
@@ -42,66 +39,20 @@ MODEL_CONFIG = {
 }
 
 # Infer device map without full initialization
-device_map = infer_auto_device_map(AutoModelForCausalLM.from_pretrained(MODEL_PATH, **MODEL_CONFIG))
+device_map = infer_auto_device_map(
+    AutoModelForCausalLM.from_pretrained(MODEL_PATH, **MODEL_CONFIG)
+)
 
 # Load the model directly with the inferred device map
-model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **MODEL_CONFIG, device_map=device_map)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_PATH, **MODEL_CONFIG, device_map=device_map
+)
 
 generation_config = GenerationConfig.from_pretrained(MODEL_PATH)
 
 # Define prompt structure
-USER_PROMPT = "<|user|>"
-ASSISTANT_PROMPT = "<|assistant|>"
-PROMPT_SUFFIX = "<|end|>"
-
-
-def process_image(image_url):
-    """Processes an image through the model and returns the response."""
-    prompt = f"{USER_PROMPT}<|image_1|>What is shown in this image?{PROMPT_SUFFIX}{ASSISTANT_PROMPT}"
-
-    # Download and open image
-    image = Image.open(requests.get(image_url, stream=True).raw)
-
-    # Process input
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
-
-    # Generate response
-    with torch.no_grad():
-        generate_ids = model.generate(
-            **inputs, max_new_tokens=512, generation_config=generation_config,
-        )
-    generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
-
-    response = processor.batch_decode(
-        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False,
-    )[0]
-    return response
-
-
-def process_audio(audio_url):
-    """Processes an audio file through the model and returns the transcript + translation."""
-    speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator."
-    prompt = f"{USER_PROMPT}<|audio_1|>{speech_prompt}{PROMPT_SUFFIX}{ASSISTANT_PROMPT}"
-
-    # Download and read audio file
-    audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))
-
-    # Process input
-    inputs = processor(
-        text=prompt, audios=[(audio, samplerate)], return_tensors="pt",
-    ).to(model.device)
-
-    # Generate response
-    with torch.no_grad():
-        generate_ids = model.generate(
-            **inputs, max_new_tokens=512, generation_config=generation_config,
-        )
-    generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
-
-    response = processor.batch_decode(
-        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False,
-    )[0]
-    return response
+user_prompt = "<|user|>"
+assistant_prompt = "<|assistant|>"
+prompt_suffix = "<|end|>"
 
 
 def main():
@@ -110,23 +61,30 @@ def main():
     for event in node:
         if event["type"] == "INPUT":
            input_id = event["id"]
-            value = event["value"]
-
-            print(f"Received event: {input_id}, value: {value}")
-
-            # Check if it's an image URL
-            if input_id == "image_input":
-                image_response = process_image(value.as_py())  # Convert from PyArrow
-                node.send_output(
-                    output_id="image_output", data=pa.array([image_response]),
-                )
-
-            # Check if it's an audio URL
-            elif input_id == "audio_input":
-                audio_response = process_audio(value.as_py())  # Convert from PyArrow
-                node.send_output(
-                    output_id="audio_output", data=pa.array([audio_response]),
-                )
+            if input_id == "text":
+                text = event["value"][0].as_py()
+                prompt = f"{user_prompt}{text}{prompt_suffix}{assistant_prompt}"
+
+                # Process input
+                inputs = processor(
+                    text=prompt,
+                    return_tensors="pt",
+                ).to(model.device)
+                # Generate response
+                with torch.no_grad():
+                    generate_ids = model.generate(
+                        **inputs,
+                        max_new_tokens=512,
+                        generation_config=generation_config,
+                    )
+                generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+
+                response = processor.batch_decode(
+                    generate_ids,
+                    skip_special_tokens=True,
+                    clean_up_tokenization_spaces=False,
+                )[0]
+                node.send_output("text", pa.array([response]))
 
 
 if __name__ == "__main__":
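
After this change the node handles a single `text` input and replies on a `text` output. Below is a minimal sketch, not part of the patch, of a peer dora node that could exercise that interface; the `response` input name and the dataflow wiring between the two nodes are assumptions for illustration.

# Hypothetical peer node for exercising the phi4 node's new text interface.
# Assumes the dataflow routes this node's `text` output to the phi4 node's
# `text` input and the phi4 node's `text` output back here as `response`.
import pyarrow as pa
from dora import Node


def main():
    node = Node()

    # Send one prompt to the phi4 node.
    node.send_output("text", pa.array(["What is dora-rs?"]))

    # Print the generated reply when it arrives.
    for event in node:
        if event["type"] == "INPUT" and event["id"] == "response":
            print(event["value"][0].as_py())
            break


if __name__ == "__main__":
    main()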