| @@ -0,0 +1,57 @@ | |||
# Dataflow wiring, as declared by the inputs/outputs below:
#   dora-microphone/audio -> dora-vad/audio -> dora-phi4 (audio)
#   sender/data           -> dora-phi4 (text)
#   dora-phi4/text        -> dora-rerun (original_text)
#   dora-phi4/text        -> dora-kokoro-tts/audio -> dora-pyaudio
nodes:
  - id: dora-microphone
    build: pip install -e ../../node-hub/dora-microphone
    path: dora-microphone
    inputs:
      # Timer input firing every 2000 ms.
      tick: dora/timer/millis/2000
    outputs:
      - audio

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio

  - id: dora-phi4
    # Two-step build: flash-attn is installed first (without build isolation),
    # then the node package itself.
    build: |
      pip install flash-attn --no-build-isolation
      pip install -e ../../node-hub/dora-phi4
    path: dora-phi4
    inputs:
      audio: dora-vad/audio
      text: sender/data
    outputs:
      - text
    env:
      # The node compares this value against the id of each incoming input;
      # here it is set to the `audio` input.
      LEAD_MODALITY: audio

  - id: sender
    build: pip install -e ../../node-hub/pyarrow-sender
    path: pyarrow-sender
    outputs:
      - data
    env:
      # Text wired into dora-phi4's `text` input above.
      DATA: "Translate this chinese audio to english"

  - id: dora-rerun
    build: pip install -e ../../node-hub/dora-rerun
    path: dora-rerun
    inputs:
      original_text: dora-phi4/text

  - id: dora-kokoro-tts
    build: pip install -e ../../node-hub/dora-kokoro-tts
    path: dora-kokoro-tts
    inputs:
      text: dora-phi4/text
    outputs:
      - audio

  - id: dora-pyaudio
    build: pip install -e ../../node-hub/dora-pyaudio
    path: dora-pyaudio
    inputs:
      audio: dora-kokoro-tts/audio
| @@ -1,5 +1,7 @@ | |||
| """TODO: Add docstring.""" | |||
| import os | |||
| import cv2 | |||
| import numpy as np | |||
| import pyarrow as pa | |||
| @@ -61,6 +63,23 @@ user_prompt = "<|user|>" | |||
# Marker strings for the assistant turn and the end of a turn.
# NOTE(review): presumably concatenated into the model prompt elsewhere in
# this file - confirm against the prompt-building code.
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"

# Id of the "lead" input. main() compares this against the id of every
# incoming input (`LEAD_MODALITY == input_id`). Defaults to "text" when the
# LEAD_MODALITY environment variable is not set.
LEAD_MODALITY = os.getenv("LEAD_MODALITY", "text")

# Responses that main() suppresses: a generated response that is exactly equal
# to one of these strings is not sent on the `text` output. Matching is exact
# (case-, punctuation- and whitespace-sensitive).
# NOTE(review): these look like spurious generations produced when the audio
# carries no real speech - confirm with the author before extending the list.
BAD_SENTENCES = [
    "The stock market closed down by 0.1%.",
    "The stock market closed down by 0.1 percent.",
    "The market is closed on Mondays and Tuesdays.",
    "The first time I saw the movie, I was very impressed.",
    "The first time I saw the sea, I was very young.",
    "The first time I saw the sea was when I was a child.",
    "The sound of the wind is so loud.",
    "The first time I saw the sea.",
    "The first time I saw the sea was in the movie.",
    "The first time I saw the movie.",
    "I don't know what to do.",
    "I don't know.",
]
| def main(): | |||
| """TODO: Add docstring.""" | |||
| @@ -70,6 +89,7 @@ def main(): | |||
| image_id = None | |||
| image = None | |||
| audios = None | |||
| text = "" | |||
| for event in node: | |||
| if event["type"] == "INPUT": | |||
| input_id = event["id"] | |||
| @@ -118,6 +138,8 @@ def main(): | |||
| audios = [(audio, sample_rate)] | |||
| elif input_id == "text": | |||
| text = event["value"][0].as_py() | |||
| if LEAD_MODALITY == input_id: | |||
| if len(frames) > 1: | |||
| raise ValueError("Multiple images are not supported yet!") | |||
| elif len(frames) == 1: | |||
| @@ -153,7 +175,9 @@ def main(): | |||
| skip_special_tokens=True, | |||
| clean_up_tokenization_spaces=False, | |||
| )[0] | |||
| node.send_output("text", pa.array([response])) | |||
| if response not in BAD_SENTENCES: | |||
| node.send_output("text", pa.array([response])) | |||
| if __name__ == "__main__": | |||
| @@ -8,18 +8,19 @@ readme = "README.md" | |||
| requires-python = ">=3.10" | |||
# Runtime requirements of the node package.
dependencies = [
  "dora-rs>=0.3.9",
  "torch==2.6.0",
  "torchvision==0.21.0",
  "transformers==4.48.2",
  "accelerate==1.3.0",
  "soundfile==0.13.1",
  "pillow==11.1.0",
  "scipy==1.15.2",
  "backoff==2.2.1",
  "peft==0.13.2",
  "bitsandbytes>=0.42.0",
  # NOTE(review): presumably required by the `import cv2` in the node's main
  # module - confirm this pyproject belongs to that package.
  "opencv-python",
  "requests",
]
| [tool.setuptools] | |||
| @@ -48,16 +48,16 @@ def main(): | |||
| if event["id"] == "audio": | |||
| audio = event["value"].to_numpy() | |||
| sr = event["metadata"].get("sample_rate", SAMPLE_RATE) | |||
| stream = play_audio(audio[0 : sr // 10], sr, stream) | |||
| i = sr // 10 | |||
| stream = play_audio(audio[0:sr], sr, stream) | |||
| i = sr | |||
| else: | |||
| audio = np.array([]) | |||
| i = 0 | |||
| elif event["type"] == "ERROR": | |||
| if i < len(audio): | |||
| stream = play_audio(audio[i : i + sr // 10], sr, stream) | |||
| i += sr // 10 | |||
| stream = play_audio(audio[i : i + sr], sr, stream) | |||
| i += sr | |||
| if stream is not None: | |||
| stream.stop_stream() | |||
| @@ -51,10 +51,10 @@ def main(): | |||
| ) | |||
| continue | |||
| audio = audio[0 : speech_timestamps[-1]["end"]] | |||
| node.send_output("audio", pa.array(audio)) | |||
| node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr}) | |||
| last_audios = [audio[speech_timestamps[-1]["end"] :]] | |||
| # If there is no sound for too long return the audio | |||
| elif len(last_audios) > MAX_AUDIO_DURATION_S: | |||
| node.send_output("audio", pa.array(audio)) | |||
| node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr}) | |||
| last_audios = [] | |||