@@ -0,0 +1,57 @@
+nodes:
+  - id: dora-microphone
+    build: pip install -e ../../node-hub/dora-microphone
+    path: dora-microphone
+    inputs:
+      tick: dora/timer/millis/2000
+    outputs:
+      - audio
+
+  - id: dora-vad
+    build: pip install -e ../../node-hub/dora-vad
+    path: dora-vad
+    inputs:
+      audio: dora-microphone/audio
+    outputs:
+      - audio
+
+  - id: dora-phi4
+    build: |
+      pip install flash-attn --no-build-isolation
+      pip install -e ../../node-hub/dora-phi4
+    path: dora-phi4
+    inputs:
+      audio: dora-vad/audio
+      text: sender/data
+    outputs:
+      - text
+    env:
+      LEAD_MODALITY: audio
+
+  - id: sender
+    build: pip install -e ../../node-hub/pyarrow-sender
+    path: pyarrow-sender
+    outputs:
+      - data
+    env:
+      DATA: "Translate this Chinese audio to English"
+
+  - id: dora-rerun
+    build: pip install -e ../../node-hub/dora-rerun
+    path: dora-rerun
+    inputs:
+      original_text: dora-phi4/text
+
+  - id: dora-kokoro-tts
+    build: pip install -e ../../node-hub/dora-kokoro-tts
+    path: dora-kokoro-tts
+    inputs:
+      text: dora-phi4/text
+    outputs:
+      - audio
+
+  - id: dora-pyaudio
+    build: pip install -e ../../node-hub/dora-pyaudio
+    path: dora-pyaudio
+    inputs:
+      audio: dora-kokoro-tts/audio
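This dataflow chains dora-microphone → dora-vad → dora-phi4, then fans the translated text out to dora-rerun (display) and to dora-kokoro-tts → dora-pyaudio (speech). While iterating on the graph, it can help to tap `dora-phi4/text` with a throwaway listener. A minimal sketch using the same `dora` Python event API the nodes below rely on — the listener node id and its YAML wiring are hypothetical, not part of this PR:

```python
"""Hypothetical debug listener: print every translation emitted by dora-phi4."""
from dora import Node


def main():
    node = Node()
    for event in node:
        # Same event shape the nodes in this PR use: "type", "id", "value".
        if event["type"] == "INPUT" and event["id"] == "text":
            print("phi4:", event["value"][0].as_py())


if __name__ == "__main__":
    main()
```

The hunks below extend the dora-phi4 node's main.py.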
@@ -1,5 +1,7 @@
 """TODO: Add docstring."""

+import os
+
 import cv2
 import numpy as np
 import pyarrow as pa
@@ -61,6 +63,23 @@ user_prompt = "<|user|>"
 assistant_prompt = "<|assistant|>"
 prompt_suffix = "<|end|>"
+
+LEAD_MODALITY = os.getenv("LEAD_MODALITY", "text")
+
+BAD_SENTENCES = [
+    "The stock market closed down by 0.1%.",
+    "The stock market closed down by 0.1 percent.",
+    "The market is closed on Mondays and Tuesdays.",
+    "The first time I saw the movie, I was very impressed.",
+    "The first time I saw the sea, I was very young.",
+    "The first time I saw the sea was when I was a child.",
+    "The sound of the wind is so loud.",
+    "The first time I saw the sea.",
+    "The first time I saw the sea was in the movie.",
+    "The first time I saw the movie.",
+    "I don't know what to do.",
+    "I don't know.",
+]


 def main():
     """TODO: Add docstring."""
@@ -70,6 +89,7 @@ def main():
     image_id = None
     image = None
    audios = None
+    text = ""
     for event in node:
         if event["type"] == "INPUT":
             input_id = event["id"]
@@ -118,6 +138,8 @@ def main():
                 audios = [(audio, sample_rate)]
             elif input_id == "text":
                 text = event["value"][0].as_py()
-            if len(frames) > 1:
-                raise ValueError("Multiple images are not supported yet!")
-            elif len(frames) == 1:
+
+            if LEAD_MODALITY == input_id:
+                if len(frames) > 1:
+                    raise ValueError("Multiple images are not supported yet!")
+                elif len(frames) == 1:
@@ -153,7 +175,9 @@ def main():
                     skip_special_tokens=True,
                     clean_up_tokenization_spaces=False,
                 )[0]
-                node.send_output("text", pa.array([response]))
+                if response not in BAD_SENTENCES:
+                    node.send_output("text", pa.array([response]))


 if __name__ == "__main__":
@@ -8,18 +8,19 @@ readme = "README.md"
 requires-python = ">=3.10"

 dependencies = [
-    "dora-rs>=0.3.9",
-    "torch==2.6.0",
-    "torchvision==0.21.0",
-    "transformers==4.48.2",
-    "accelerate==1.3.0",
-    "soundfile==0.13.1",
-    "pillow==11.1.0",
-    "scipy==1.15.2",
-    "backoff==2.2.1",
-    "peft==0.13.2",
-    "bitsandbytes>=0.42.0",
-    "requests"
+    "dora-rs>=0.3.9",
+    "torch==2.6.0",
+    "torchvision==0.21.0",
+    "transformers==4.48.2",
+    "accelerate==1.3.0",
+    "soundfile==0.13.1",
+    "pillow==11.1.0",
+    "scipy==1.15.2",
+    "backoff==2.2.1",
+    "peft==0.13.2",
+    "bitsandbytes>=0.42.0",
+    "opencv-python",
+    "requests",
 ]

 [tool.setuptools]
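On the dependency side, `opencv-python` backs the `import cv2` added at the top of main.py; without it the node fails at import time. The list also gains a trailing comma after `"requests"`, so the next addition only touches one line. The following hunk moves to the dora-pyaudio node.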
@@ -48,16 +48,16 @@ def main():
             if event["id"] == "audio":
                 audio = event["value"].to_numpy()
                 sr = event["metadata"].get("sample_rate", SAMPLE_RATE)
-                stream = play_audio(audio[0 : sr // 10], sr, stream)
-                i = sr // 10
+                stream = play_audio(audio[0:sr], sr, stream)
+                i = sr
             else:
                 audio = np.array([])
                 i = 0
         elif event["type"] == "ERROR":
             if i < len(audio):
-                stream = play_audio(audio[i : i + sr // 10], sr, stream)
-                i += sr // 10
+                stream = play_audio(audio[i : i + sr], sr, stream)
+                i += sr

     if stream is not None:
         stream.stop_stream()
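The playback chunk grows from a tenth of a second to a full second of audio per `play_audio` call. Back-of-the-envelope numbers, assuming the common 16 kHz default (the actual `SAMPLE_RATE` constant lives in dora-pyaudio, outside this hunk; the variable names below are illustrative):

```python
# Chunk-size arithmetic for the change above (16 kHz is an assumption).
sr = 16_000
old_chunk = sr // 10          # 1,600 samples  -> 100 ms of audio per call
new_chunk = sr                # 16,000 samples -> 1 s of audio per call

five_seconds = 5 * sr
print(five_seconds // old_chunk)  # 50 calls before this PR
print(five_seconds // new_chunk)  # 5 calls after: fewer seams, coarser granularity
```

The final hunk teaches dora-vad to pass the capture rate along with the audio it emits.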
@@ -51,10 +51,10 @@ def main():
                 )
                 continue
             audio = audio[0 : speech_timestamps[-1]["end"]]
-            node.send_output("audio", pa.array(audio))
+            node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr})
             last_audios = [audio[speech_timestamps[-1]["end"] :]]

         # If there is no sound for too long return the audio
         elif len(last_audios) > MAX_AUDIO_DURATION_S:
-            node.send_output("audio", pa.array(audio))
+            node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr})
             last_audios = []
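With the VAD node attaching `sample_rate` to its output, downstream consumers no longer have to assume a fixed rate; dora-pyaudio (above) already reads it via `event["metadata"].get("sample_rate", ...)`. A hedged sketch of what the metadata enables on the dora-phi4 side, assuming its audio encoder expects 16 kHz — scipy is already in the node's dependency list, but `PHI4_RATE` and `to_phi4_rate` are hypothetical names, not code from this PR:

```python
from scipy import signal

PHI4_RATE = 16_000  # assumed target rate for Phi-4's audio encoder


def to_phi4_rate(audio, sr):
    """Resample mono audio from sr to PHI4_RATE when the rates differ."""
    if sr == PHI4_RATE:
        return audio
    n_out = int(len(audio) * PHI4_RATE / sr)
    return signal.resample(audio, n_out)
```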