@@ -7566,7 +7566,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179"
 dependencies = [
  "once_cell",
- "python3-dll-a",
  "target-lexicon",
 ]
 
@@ -7628,15 +7627,6 @@ dependencies = [
  "syn 2.0.94",
 ]
 
-[[package]]
-name = "python3-dll-a"
-version = "0.2.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49fe4227a288cf9493942ad0220ea3f185f4d1f2a14f197f7344d6d02f4ed4ed"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "pythonize"
 version = "0.22.0"
@@ -14,6 +14,7 @@ nodes:
       audio: dora-microphone/audio
     outputs:
       - audio
+      - timestamp_start
 
   - id: dora-distil-whisper
     build: pip install -e ../../node-hub/dora-distil-whisper
@@ -55,3 +56,4 @@ nodes:
     path: dora-pyaudio
     inputs:
       audio: dora-kokoro-tts/audio
+      timestamp_start: dora-vad/timestamp_start
@@ -35,7 +35,10 @@ def main():
 
     # Start recording
     with sd.InputStream(
-        callback=callback, dtype=np.int16, channels=1, samplerate=SAMPLE_RATE,
+        callback=callback,
+        dtype=np.int16,
+        channels=1,
+        samplerate=SAMPLE_RATE,
     ):
         while not finished:
             sd.sleep(1000)
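This hunk is a cosmetic reflow of the stream arguments. For orientation, a minimal sketch of how the surrounding recorder plausibly fits together (SAMPLE_RATE, the dora Node, and the finished flag come from the real file; this condensed version is illustrative only):

    import numpy as np
    import pyarrow as pa
    import sounddevice as sd
    from dora import Node

    SAMPLE_RATE = 16000
    node = Node()
    finished = False  # in the real node an external signal flips this

    def callback(indata, frames, time, status):
        # Forward each captured block downstream as an Arrow array.
        node.send_output("audio", pa.array(indata[:, 0].copy()))

    with sd.InputStream(
        callback=callback,
        dtype=np.int16,
        channels=1,
        samplerate=SAMPLE_RATE,
    ):
        while not finished:
            sd.sleep(1000)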
@@ -18,8 +18,7 @@ def play_audio(
 ) -> pyaudio.Stream:
     """Play audio using pyaudio and replace stream if already exists"""
     if np.issubdtype(audio_array.dtype, np.floating):
-        max_val = np.max(np.abs(audio_array))
-        audio_array = (audio_array / max_val) * 32767
+        audio_array = audio_array * 70_000
         audio_array = audio_array.astype(np.int16)
     if stream is None:
         stream = p.open(
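A caveat worth flagging on the new conversion: numpy's astype(np.int16) does not saturate, so with a fixed gain of 70_000 any float sample above roughly 0.47 (70 000 x 0.468 is about 32 767) overflows and wraps, which is audible as crackle. The old per-buffer peak normalization avoided that, but it rescaled each buffer independently, which would pump loudness now that playback arrives in 100 ms slices (see the next hunk); presumably that is the motivation for the fixed gain. A clipped variant, purely as a sketch (the helper name and gain default are illustrative):

    import numpy as np

    def float_to_int16(audio_array: np.ndarray, gain: float = 70_000.0) -> np.ndarray:
        # np.clip saturates into the int16 range before the cast,
        # so over-range samples flatten instead of wrapping around.
        scaled = audio_array * gain
        return np.clip(scaled, -32768.0, 32767.0).astype(np.int16)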
@@ -36,12 +35,27 @@ def main():
     """Main function for the node"""
     node = Node()
     stream = None
-    for event in node:
-        if event["type"] == "INPUT":
+    audio = np.array([])
+    sr = SAMPLE_RATE
+    i = 0
+    while True:
+        event = node.next(timeout=0.01)
+        if event is None:
+            break
+        elif event["type"] == "INPUT":
             if event["id"] == "audio":
                 audio = event["value"].to_numpy()
                 sr = event["metadata"].get("sample_rate", SAMPLE_RATE)
-                stream = play_audio(audio, sr, stream)
+                stream = play_audio(audio[0 : sr // 10], sr, stream)
+                i = sr // 10
+            else:
+                audio = np.array([])
+                i = 0
+        elif event["type"] == "ERROR":
+            if i < len(audio):
+                stream = play_audio(audio[i : i + sr // 10], sr, stream)
+                i += sr // 10
 
     if stream is not None:
         stream.stop_stream()
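The rewritten loop turns playback into a pump: when a clip arrives, only the first sr // 10 samples (100 ms) are written, and later slices are fed from the "ERROR" branch, which appears to be how a node.next timeout is reported here. Any other input id in the meantime clears the buffer, so playback can be cut short mid-utterance. The idea in isolation, with play and should_stop as illustrative placeholders:

    import numpy as np

    def play_in_chunks(audio: np.ndarray, sr: int, play, should_stop) -> None:
        # Emit ~100 ms slices so an interrupt can land between them.
        step = max(1, sr // 10)
        for start in range(0, len(audio), step):
            if should_stop():  # e.g. the user started speaking again
                return
            play(audio[start : start + step])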
@@ -37,7 +37,7 @@ def get_model_huggingface():
     return model, tokenizer
 
 
-ACTIVATION_WORDS = ["what", "how", "who", "where", "you"]
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
 
 
 def generate_hf(model, tokenizer, prompt: str, history) -> str:
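Making the wake words a whitespace-separated environment variable lets a dataflow retune them without touching code. The diff does not show how ACTIVATION_WORDS is consumed; a hypothetical gating helper in the same spirit:

    def is_activated(text: str, activation_words: list[str]) -> bool:
        # Hypothetical: trigger when any activation word appears in the
        # transcript (case-insensitive, token match).
        tokens = text.lower().split()
        return any(word in tokens for word in activation_words)

    is_activated("how are you today", ["what", "how", "who", "where", "you"])  # True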
@@ -7,11 +7,11 @@ from dora import Node
 from silero_vad import get_speech_timestamps, load_silero_vad
 
 model = load_silero_vad()
-MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "100"))
+MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "200"))
 MIN_SPEECH_DURATION_MS = int(os.getenv("MIN_SPEECH_DURATION_MS", "300"))
-MIN_AUDIO_SAMPLING_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "20"))
-MAX_AUDIO_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+THRESHOLD = float(os.getenv("THRESHOLD", "0.4"))
+MAX_AUDIO_DURATION_S = float(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+MIN_AUDIO_SAMPLING_DURATION_MS = int(os.getenv("MIN_AUDIO_SAMPLING_DURATION_MS", "500"))
 
 
 def main():
@@ -26,10 +26,11 @@ def main():
             last_audios += [audio]
             last_audios = last_audios[-100:]
             audio = np.concatenate(last_audios)
+            sr = event["metadata"].get("sample_rate", 16000)
             speech_timestamps = get_speech_timestamps(
                 torch.from_numpy(audio),
                 model,
-                threshold=0.2,
+                threshold=THRESHOLD,
                 min_speech_duration_ms=MIN_SPEECH_DURATION_MS,
                 min_silence_duration_ms=MIN_SILENCE_DURATION_MS,
             )
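For reference, the silero-vad call in isolation with the new tunables (the model expects 16 kHz mono float32 in [-1, 1]; the input below is placeholder silence):

    import numpy as np
    import torch
    from silero_vad import get_speech_timestamps, load_silero_vad

    model = load_silero_vad()
    audio = np.zeros(16000, dtype=np.float32)  # one second of silence
    speech_timestamps = get_speech_timestamps(
        torch.from_numpy(audio),
        model,
        threshold=0.4,                # higher means stricter speech detection
        min_speech_duration_ms=300,   # discard blips shorter than this
        min_silence_duration_ms=200,  # pauses shorter than this do not split a segment
    )
    print(speech_timestamps)  # [] for silence, else [{'start': ..., 'end': ...}] in samples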
@@ -37,16 +38,20 @@ def main():
             # Check if there is a timestamp
             if (
                 len(speech_timestamps) > 0
-                and len(last_audios) > MIN_AUDIO_SAMPLING_DURAION_S
+                and len(audio) > MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
             ):
                 # Check that the audio is not cut off at the end, and only return after enough time has passed
                 if speech_timestamps[-1]["end"] == len(audio):
+                    node.send_output(
+                        "timestamp_start",
+                        pa.array([speech_timestamps[-1]["start"]]),
+                    )
                     continue
                 audio = audio[0 : speech_timestamps[-1]["end"]]
                 node.send_output("audio", pa.array(audio))
                 last_audios = [audio[speech_timestamps[-1]["end"] :]]
 
             # If there is no sound for too long return the audio
-            elif len(last_audios) > 75:
+            elif len(last_audios) > MAX_AUDIO_DURATION_S:
                 node.send_output("audio", pa.array(audio))
                 last_audios = []
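Taken together, these hunks add a barge-in path: when speech is still running at the end of the buffer, dora-vad now emits timestamp_start instead of staying silent, the dataflow routes it to dora-pyaudio, and the playback loop above clears its queue on any input other than "audio". A condensed sketch of that dispatch (a simplification of the loop above, trimmed to the interrupt logic):

    import numpy as np

    def on_input(event, audio, i):
        # Return the new (audio, cursor) pair after an INPUT event.
        if event["id"] == "audio":
            return event["value"].to_numpy(), 0  # fresh clip, queued from the start
        # Any other id, here timestamp_start from dora-vad, means the
        # user began speaking: drop queued audio so playback stops.
        return np.array([]), 0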