diff --git a/examples/translation/phi4-dev.yml b/examples/translation/phi4-dev.yml
new file mode 100644
index 00000000..5d20819f
--- /dev/null
+++ b/examples/translation/phi4-dev.yml
@@ -0,0 +1,57 @@
+nodes:
+  - id: dora-microphone
+    build: pip install -e ../../node-hub/dora-microphone
+    path: dora-microphone
+    inputs:
+      tick: dora/timer/millis/2000
+    outputs:
+      - audio
+
+  - id: dora-vad
+    build: pip install -e ../../node-hub/dora-vad
+    path: dora-vad
+    inputs:
+      audio: dora-microphone/audio
+    outputs:
+      - audio
+
+  - id: dora-phi4
+    build: |
+      pip install flash-attn --no-build-isolation
+      pip install -e ../../node-hub/dora-phi4
+    path: dora-phi4
+    inputs:
+      audio: dora-vad/audio
+      text: sender/data
+    outputs:
+      - text
+    env:
+      LEAD_MODALITY: audio
+
+  - id: sender
+    build: pip install -e ../../node-hub/pyarrow-sender
+    path: pyarrow-sender
+    outputs:
+      - data
+    env:
+      DATA: "Translate this Chinese audio to English"
+
+  - id: dora-rerun
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    inputs:
+      original_text: dora-phi4/text
+
+  - id: dora-kokoro-tts
+    build: pip install -e ../../node-hub/dora-kokoro-tts
+    path: dora-kokoro-tts
+    inputs:
+      text: dora-phi4/text
+    outputs:
+      - audio
+
+  - id: dora-pyaudio
+    build: pip install -e ../../node-hub/dora-pyaudio
+    path: dora-pyaudio
+    inputs:
+      audio: dora-kokoro-tts/audio
diff --git a/node-hub/dora-phi4/dora_phi4/main.py b/node-hub/dora-phi4/dora_phi4/main.py
index 85f602f3..80b9ed71 100644
--- a/node-hub/dora-phi4/dora_phi4/main.py
+++ b/node-hub/dora-phi4/dora_phi4/main.py
@@ -1,5 +1,7 @@
 """TODO: Add docstring."""
 
+import os
+
 import cv2
 import numpy as np
 import pyarrow as pa
@@ -61,6 +63,23 @@
 user_prompt = "<|user|>"
 assistant_prompt = "<|assistant|>"
 prompt_suffix = "<|end|>"
+LEAD_MODALITY = os.getenv("LEAD_MODALITY", "text")
+
+BAD_SENTENCES = [
+    "The stock market closed down by 0.1%.",
+    "The stock market closed down by 0.1 percent.",
+    "The market is closed on Mondays and Tuesdays.",
+    "The first time I saw the movie, I was very impressed.",
+    "The first time I saw the sea, I was very young.",
+    "The first time I saw the sea was when I was a child.",
+    "The sound of the wind is so loud.",
+    "The first time I saw the sea.",
+    "The first time I saw the sea was in the movie.",
+    "The first time I saw the movie.",
+    "I don't know what to do.",
+    "I don't know.",
+]
+
 
 def main():
     """TODO: Add docstring."""
@@ -70,6 +89,7 @@ def main():
     image_id = None
     image = None
     audios = None
+    text = ""
     for event in node:
         if event["type"] == "INPUT":
             input_id = event["id"]
@@ -118,6 +138,8 @@ def main():
                 audios = [(audio, sample_rate)]
             elif input_id == "text":
                 text = event["value"][0].as_py()
+
+            if LEAD_MODALITY == input_id:
                 if len(frames) > 1:
                     raise ValueError("Multiple images are not supported yet!")
                 elif len(frames) == 1:
@@ -153,7 +175,9 @@ def main():
                     skip_special_tokens=True,
                     clean_up_tokenization_spaces=False,
                 )[0]
-                node.send_output("text", pa.array([response]))
+
+                if response not in BAD_SENTENCES:
+                    node.send_output("text", pa.array([response]))
 
 
 if __name__ == "__main__":
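The `LEAD_MODALITY` gate above buffers every incoming modality but only runs generation when the configured lead input arrives. Below is a minimal standalone sketch of that pattern, not the node itself: the event dicts are simulated stand-ins for dora events, and `run_inference` and `handle_events` are hypothetical helpers standing in for the node's processor/`model.generate` path.

```python
import os

# Default mirrors the diff: text leads unless the env var overrides it.
LEAD_MODALITY = os.getenv("LEAD_MODALITY", "text")


def run_inference(text, audios):
    """Hypothetical stand-in for the node's processor/model.generate call."""
    return f"response to {text!r} over {len(audios)} audio clip(s)"


def handle_events(events):
    """Buffer each modality; only the lead modality triggers generation."""
    text, audios, responses = "", [], []
    for event in events:
        input_id = event["id"]
        if input_id == "audio":
            audios = [event["value"]]  # keep only the most recent clip
        elif input_id == "text":
            text = event["value"]
        if input_id == LEAD_MODALITY:  # same gate as the diff's new `if`
            responses.append(run_inference(text, audios))
    return responses


# With LEAD_MODALITY=audio (as phi4-dev.yml sets it), the prompt from the
# sender node is buffered and generation fires once per VAD-segmented clip.
print(handle_events([
    {"id": "text", "value": "Translate this Chinese audio to English"},
    {"id": "audio", "value": [0.0] * 16000},
]))
```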
"backoff==2.2.1", - "peft==0.13.2", - "bitsandbytes>=0.42.0", - "requests" + "dora-rs>=0.3.9", + "torch==2.6.0", + "torchvision==0.21.0", + "transformers==4.48.2", + "accelerate==1.3.0", + "soundfile==0.13.1", + "pillow==11.1.0", + "scipy==1.15.2", + "backoff==2.2.1", + "peft==0.13.2", + "bitsandbytes>=0.42.0", + "opencv-python", + "requests", ] [tool.setuptools] diff --git a/node-hub/dora-pyaudio/dora_pyaudio/main.py b/node-hub/dora-pyaudio/dora_pyaudio/main.py index 2cc3f5f4..f11cb3f2 100644 --- a/node-hub/dora-pyaudio/dora_pyaudio/main.py +++ b/node-hub/dora-pyaudio/dora_pyaudio/main.py @@ -48,16 +48,16 @@ def main(): if event["id"] == "audio": audio = event["value"].to_numpy() sr = event["metadata"].get("sample_rate", SAMPLE_RATE) - stream = play_audio(audio[0 : sr // 10], sr, stream) - i = sr // 10 + stream = play_audio(audio[0:sr], sr, stream) + i = sr else: audio = np.array([]) i = 0 elif event["type"] == "ERROR": if i < len(audio): - stream = play_audio(audio[i : i + sr // 10], sr, stream) - i += sr // 10 + stream = play_audio(audio[i : i + sr], sr, stream) + i += sr if stream is not None: stream.stop_stream() diff --git a/node-hub/dora-vad/dora_vad/main.py b/node-hub/dora-vad/dora_vad/main.py index bafb290d..9a674ddb 100644 --- a/node-hub/dora-vad/dora_vad/main.py +++ b/node-hub/dora-vad/dora_vad/main.py @@ -51,10 +51,10 @@ def main(): ) continue audio = audio[0 : speech_timestamps[-1]["end"]] - node.send_output("audio", pa.array(audio)) + node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr}) last_audios = [audio[speech_timestamps[-1]["end"] :]] # If there is no sound for too long return the audio elif len(last_audios) > MAX_AUDIO_DURATION_S: - node.send_output("audio", pa.array(audio)) + node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr}) last_audios = []