Browse Source

Improve audio capabilities of phi4

tags/v0.3.11-rc1
haixuanTao 10 months ago
parent
commit
2be6c50448
5 changed files with 101 additions and 19 deletions
  1. +57
    -0
      examples/translation/phi4-dev.yml
  2. +25
    -1
      node-hub/dora-phi4/dora_phi4/main.py
  3. +13
    -12
      node-hub/dora-phi4/pyproject.toml
  4. +4
    -4
      node-hub/dora-pyaudio/dora_pyaudio/main.py
  5. +2
    -2
      node-hub/dora-vad/dora_vad/main.py

+ 57
- 0
examples/translation/phi4-dev.yml View File

@@ -0,0 +1,57 @@
# Dora dataflow: live speech translation demo.
# mic -> VAD -> phi4 (audio-led translation) -> rerun (display) + kokoro TTS -> speaker.
# NOTE(review): indentation reconstructed from the flattened diff view using the
# standard dora node schema (id/build/path/inputs/outputs/env) — structure is
# unambiguous, but confirm against the committed file.
nodes:
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    inputs:
      tick: dora/timer/millis/2000
    outputs:
      - audio

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio

  - id: dora-phi4
    build: |
      pip install flash-attn --no-build-isolation
      pip install -e ../../node-hub/dora-phi4
    path: dora-phi4
    inputs:
      audio: dora-vad/audio
      text: sender/data
    outputs:
      - text
    env:
      # Inference is triggered when the "audio" input arrives (see dora-phi4 main.py).
      LEAD_MODALITY: audio

  - id: sender
    build: pip install -e ../../node-hub/pyarrow-sender
    path: pyarrow-sender
    outputs:
      - data
    env:
      DATA: "Translate this chinese audio to english"

  - id: dora-rerun
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      original_text: dora-phi4/text

  - id: dora-kokoro-tts
    build: pip install -e ../../node-hub/dora-kokoro-tts
    path: dora-kokoro-tts
    inputs:
      text: dora-phi4/text
    outputs:
      - audio

  - id: dora-pyaudio
    build: pip install -e ../../node-hub/dora-pyaudio
    path: dora-pyaudio
    inputs:
      audio: dora-kokoro-tts/audio

+ 25
- 1
node-hub/dora-phi4/dora_phi4/main.py View File

@@ -1,5 +1,7 @@
"""TODO: Add docstring."""

import os

import cv2
import numpy as np
import pyarrow as pa
@@ -61,6 +63,23 @@ user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"

# Which input id triggers model inference when it arrives: "audio" or "text".
# Compared against the incoming event id inside main(); defaults to "text".
LEAD_MODALITY = os.getenv("LEAD_MODALITY", "text")

# Responses to suppress before emitting the "text" output: main() drops any
# model response that exactly matches one of these strings.
# NOTE(review): these look like stock phrases the model hallucinates on
# silent/noise-only audio — matching is exact-string only, so paraphrased
# hallucinations still pass through; confirm against runtime logs.
BAD_SENTENCES = [
"The stock market closed down by 0.1%.",
"The stock market closed down by 0.1 percent.",
"The market is closed on Mondays and Tuesdays.",
"The first time I saw the movie, I was very impressed.",
"The first time I saw the sea, I was very young.",
"The first time I saw the sea was when I was a child.",
"The sound of the wind is so loud.",
"The first time I saw the sea.",
"The first time I saw the sea was in the movie.",
"The first time I saw the movie.",
"I don't know what to do.",
"I don't know.",
]


def main():
"""TODO: Add docstring."""
@@ -70,6 +89,7 @@ def main():
image_id = None
image = None
audios = None
text = ""
for event in node:
if event["type"] == "INPUT":
input_id = event["id"]
@@ -118,6 +138,8 @@ def main():
audios = [(audio, sample_rate)]
elif input_id == "text":
text = event["value"][0].as_py()

if LEAD_MODALITY == input_id:
if len(frames) > 1:
raise ValueError("Multiple images are not supported yet!")
elif len(frames) == 1:
@@ -153,7 +175,9 @@ def main():
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0]
node.send_output("text", pa.array([response]))

if response not in BAD_SENTENCES:
node.send_output("text", pa.array([response]))


if __name__ == "__main__":


+ 13
- 12
node-hub/dora-phi4/pyproject.toml View File

@@ -8,18 +8,19 @@ readme = "README.md"
requires-python = ">=3.10"

# Runtime dependencies for dora-phi4.
# The scraped diff fused the pre- and post-commit arrays into one (duplicated
# entries, and a missing comma after the old "requests" line made the TOML
# invalid); this is the post-commit final state: the original pinned list plus
# opencv-python (main.py imports cv2) and requests with a proper trailing comma.
dependencies = [
  "dora-rs>=0.3.9",
  "torch==2.6.0",
  "torchvision==0.21.0",
  "transformers==4.48.2",
  "accelerate==1.3.0",
  "soundfile==0.13.1",
  "pillow==11.1.0",
  "scipy==1.15.2",
  "backoff==2.2.1",
  "peft==0.13.2",
  "bitsandbytes>=0.42.0",
  "opencv-python",
  "requests",
]

[tool.setuptools]


+ 4
- 4
node-hub/dora-pyaudio/dora_pyaudio/main.py View File

@@ -48,16 +48,16 @@ def main():
if event["id"] == "audio":
audio = event["value"].to_numpy()
sr = event["metadata"].get("sample_rate", SAMPLE_RATE)
stream = play_audio(audio[0 : sr // 10], sr, stream)
i = sr // 10
stream = play_audio(audio[0:sr], sr, stream)
i = sr

else:
audio = np.array([])
i = 0
elif event["type"] == "ERROR":
if i < len(audio):
stream = play_audio(audio[i : i + sr // 10], sr, stream)
i += sr // 10
stream = play_audio(audio[i : i + sr], sr, stream)
i += sr

if stream is not None:
stream.stop_stream()


+ 2
- 2
node-hub/dora-vad/dora_vad/main.py View File

@@ -51,10 +51,10 @@ def main():
)
continue
audio = audio[0 : speech_timestamps[-1]["end"]]
node.send_output("audio", pa.array(audio))
node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr})
last_audios = [audio[speech_timestamps[-1]["end"] :]]

# If there is no sound for too long return the audio
elif len(last_audios) > MAX_AUDIO_DURATION_S:
node.send_output("audio", pa.array(audio))
node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr})
last_audios = []

Loading…
Cancel
Save