Browse Source

Improve audio capabilities of phi4

tags/v0.3.11-rc1
haixuanTao 10 months ago
parent
commit
2be6c50448
5 changed files with 101 additions and 19 deletions
  1. +57
    -0
      examples/translation/phi4-dev.yml
  2. +25
    -1
      node-hub/dora-phi4/dora_phi4/main.py
  3. +13
    -12
      node-hub/dora-phi4/pyproject.toml
  4. +4
    -4
      node-hub/dora-pyaudio/dora_pyaudio/main.py
  5. +2
    -2
      node-hub/dora-vad/dora_vad/main.py

+ 57
- 0
examples/translation/phi4-dev.yml View File

@@ -0,0 +1,57 @@
# Dora dataflow: live speech translation demo.
# mic -> VAD -> phi4 (audio-led translation) -> rerun (display) + kokoro TTS -> speaker.
# NOTE(review): indentation reconstructed from the flattened diff view using the
# standard dora node schema (id/build/path/inputs/outputs/env) — structure is
# unambiguous, but confirm against the committed file.
nodes:
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    inputs:
      tick: dora/timer/millis/2000
    outputs:
      - audio

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio

  - id: dora-phi4
    build: |
      pip install flash-attn --no-build-isolation
      pip install -e ../../node-hub/dora-phi4
    path: dora-phi4
    inputs:
      audio: dora-vad/audio
      text: sender/data
    outputs:
      - text
    env:
      # Inference is triggered when the "audio" input arrives (see dora-phi4 main.py).
      LEAD_MODALITY: audio

  - id: sender
    build: pip install -e ../../node-hub/pyarrow-sender
    path: pyarrow-sender
    outputs:
      - data
    env:
      DATA: "Translate this chinese audio to english"

  - id: dora-rerun
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      original_text: dora-phi4/text

  - id: dora-kokoro-tts
    build: pip install -e ../../node-hub/dora-kokoro-tts
    path: dora-kokoro-tts
    inputs:
      text: dora-phi4/text
    outputs:
      - audio

  - id: dora-pyaudio
    build: pip install -e ../../node-hub/dora-pyaudio
    path: dora-pyaudio
    inputs:
      audio: dora-kokoro-tts/audio

+ 25
- 1
node-hub/dora-phi4/dora_phi4/main.py View File

@@ -1,5 +1,7 @@
"""TODO: Add docstring."""

import os

import cv2
import numpy as np
import pyarrow as pa
@@ -61,6 +63,23 @@ user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"

# Which input id triggers model inference when it arrives: "audio" or "text".
# Compared against the incoming event id inside main(); defaults to "text".
LEAD_MODALITY = os.getenv("LEAD_MODALITY", "text")

# Responses to suppress before emitting the "text" output: main() drops any
# model response that exactly matches one of these strings.
# NOTE(review): these look like stock phrases the model hallucinates on
# silent/noise-only audio — matching is exact-string only, so paraphrased
# hallucinations still pass through; confirm against runtime logs.
BAD_SENTENCES = [
"The stock market closed down by 0.1%.",
"The stock market closed down by 0.1 percent.",
"The market is closed on Mondays and Tuesdays.",
"The first time I saw the movie, I was very impressed.",
"The first time I saw the sea, I was very young.",
"The first time I saw the sea was when I was a child.",
"The sound of the wind is so loud.",
"The first time I saw the sea.",
"The first time I saw the sea was in the movie.",
"The first time I saw the movie.",
"I don't know what to do.",
"I don't know.",
]


def main():
"""TODO: Add docstring."""
@@ -70,6 +89,7 @@ def main():
image_id = None
image = None
audios = None
text = ""
for event in node:
if event["type"] == "INPUT":
input_id = event["id"]
@@ -118,6 +138,8 @@ def main():
audios = [(audio, sample_rate)]
elif input_id == "text":
text = event["value"][0].as_py()

if LEAD_MODALITY == input_id:
if len(frames) > 1:
raise ValueError("Multiple images are not supported yet!")
elif len(frames) == 1:
@@ -153,7 +175,9 @@ def main():
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0]
node.send_output("text", pa.array([response]))

if response not in BAD_SENTENCES:
node.send_output("text", pa.array([response]))


if __name__ == "__main__":


+ 13
- 12
node-hub/dora-phi4/pyproject.toml View File

@@ -8,18 +8,19 @@ readme = "README.md"
requires-python = ">=3.10"

# Runtime dependencies for dora-phi4.
# The scraped diff fused the pre- and post-commit arrays into one (duplicated
# entries, and a missing comma after the old "requests" line made the TOML
# invalid); this is the post-commit final state: the original pinned list plus
# opencv-python (main.py imports cv2) and requests with a proper trailing comma.
dependencies = [
  "dora-rs>=0.3.9",
  "torch==2.6.0",
  "torchvision==0.21.0",
  "transformers==4.48.2",
  "accelerate==1.3.0",
  "soundfile==0.13.1",
  "pillow==11.1.0",
  "scipy==1.15.2",
  "backoff==2.2.1",
  "peft==0.13.2",
  "bitsandbytes>=0.42.0",
  "opencv-python",
  "requests",
]

[tool.setuptools]


+ 4
- 4
node-hub/dora-pyaudio/dora_pyaudio/main.py View File

@@ -48,16 +48,16 @@ def main():
if event["id"] == "audio":
audio = event["value"].to_numpy()
sr = event["metadata"].get("sample_rate", SAMPLE_RATE)
stream = play_audio(audio[0 : sr // 10], sr, stream)
i = sr // 10
stream = play_audio(audio[0:sr], sr, stream)
i = sr

else:
audio = np.array([])
i = 0
elif event["type"] == "ERROR":
if i < len(audio):
stream = play_audio(audio[i : i + sr // 10], sr, stream)
i += sr // 10
stream = play_audio(audio[i : i + sr], sr, stream)
i += sr

if stream is not None:
stream.stop_stream()


+ 2
- 2
node-hub/dora-vad/dora_vad/main.py View File

@@ -51,10 +51,10 @@ def main():
)
continue
audio = audio[0 : speech_timestamps[-1]["end"]]
node.send_output("audio", pa.array(audio))
node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr})
last_audios = [audio[speech_timestamps[-1]["end"] :]]

# If there is no sound for too long return the audio
elif len(last_audios) > MAX_AUDIO_DURATION_S:
node.send_output("audio", pa.array(audio))
node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr})
last_audios = []

Loading…
Cancel
Save