diff --git a/Cargo.lock b/Cargo.lock
index aead92e5..6d442471 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7566,7 +7566,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179"
 dependencies = [
  "once_cell",
- "python3-dll-a",
  "target-lexicon",
 ]
 
@@ -7628,15 +7627,6 @@ dependencies = [
  "syn 2.0.94",
 ]
 
-[[package]]
-name = "python3-dll-a"
-version = "0.2.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49fe4227a288cf9493942ad0220ea3f185f4d1f2a14f197f7344d6d02f4ed4ed"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "pythonize"
 version = "0.22.0"
diff --git a/examples/llm/qwen-dev.yml b/examples/llm/qwen-dev.yml
index d83cac52..128466c5 100755
--- a/examples/llm/qwen-dev.yml
+++ b/examples/llm/qwen-dev.yml
@@ -14,6 +14,7 @@ nodes:
       audio: dora-microphone/audio
     outputs:
       - audio
+      - timestamp_start
 
   - id: dora-distil-whisper
     build: pip install -e ../../node-hub/dora-distil-whisper
@@ -55,3 +56,4 @@ nodes:
     path: dora-pyaudio
     inputs:
       audio: dora-kokoro-tts/audio
+      timestamp_start: dora-vad/timestamp_start
diff --git a/node-hub/dora-microphone/dora_microphone/main.py b/node-hub/dora-microphone/dora_microphone/main.py
index 10a82688..16160bcb 100644
--- a/node-hub/dora-microphone/dora_microphone/main.py
+++ b/node-hub/dora-microphone/dora_microphone/main.py
@@ -35,7 +35,10 @@ def main():
 
     # Start recording
     with sd.InputStream(
-        callback=callback, dtype=np.int16, channels=1, samplerate=SAMPLE_RATE,
+        callback=callback,
+        dtype=np.int16,
+        channels=1,
+        samplerate=SAMPLE_RATE,
     ):
         while not finished:
             sd.sleep(1000)
diff --git a/node-hub/dora-pyaudio/dora_pyaudio/main.py b/node-hub/dora-pyaudio/dora_pyaudio/main.py
index a1895f24..c3341b18 100644
--- a/node-hub/dora-pyaudio/dora_pyaudio/main.py
+++ b/node-hub/dora-pyaudio/dora_pyaudio/main.py
@@ -18,8 +18,7 @@ def play_audio(
 ) -> pyaudio.Stream:
     """Play audio using pyaudio and replace stream if already exists"""
     if np.issubdtype(audio_array.dtype, np.floating):
-        max_val = np.max(np.abs(audio_array))
-        audio_array = (audio_array / max_val) * 32767
+        audio_array = audio_array * 70_000
         audio_array = audio_array.astype(np.int16)
     if stream is None:
         stream = p.open(
@@ -36,12 +35,27 @@ def main():
     """Main function for the node"""
     node = Node()
     stream = None
-    for event in node:
-        if event["type"] == "INPUT":
+    audio = np.array([])
+    sr = SAMPLE_RATE
+    i = 0
+    while True:
+        event = node.next(timeout=0.01)
+        if event is None:
+            break
+        elif event["type"] == "INPUT":
             if event["id"] == "audio":
                 audio = event["value"].to_numpy()
                 sr = event["metadata"].get("sample_rate", SAMPLE_RATE)
-                stream = play_audio(audio, sr, stream)
+                stream = play_audio(audio[0 : sr // 10], sr, stream)
+                i = sr // 10
+
+            else:
+                audio = np.array([])
+                i = 0
+        elif event["type"] == "ERROR":
+            if i < len(audio):
+                stream = play_audio(audio[i : i + sr // 10], sr, stream)
+                i += sr // 10
 
     if stream is not None:
         stream.stop_stream()
diff --git a/node-hub/dora-qwen/dora_qwen/main.py b/node-hub/dora-qwen/dora_qwen/main.py
index 7ca76f82..8d202456 100644
--- a/node-hub/dora-qwen/dora_qwen/main.py
+++ b/node-hub/dora-qwen/dora_qwen/main.py
@@ -37,7 +37,7 @@ def get_model_huggingface():
     return model, tokenizer
 
 
-ACTIVATION_WORDS = ["what", "how", "who", "where", "you"]
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
 
 
 def generate_hf(model, tokenizer, prompt: str, history) -> str:
diff --git a/node-hub/dora-vad/dora_vad/main.py b/node-hub/dora-vad/dora_vad/main.py
index 38d52a41..05e14773 100644
--- a/node-hub/dora-vad/dora_vad/main.py
+++ b/node-hub/dora-vad/dora_vad/main.py
@@ -7,11 +7,11 @@ from dora import Node
 from silero_vad import get_speech_timestamps, load_silero_vad
 
 model = load_silero_vad()
-MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "100"))
+MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "200"))
 MIN_SPEECH_DURATION_MS = int(os.getenv("MIN_SPEECH_DURATION_MS", "300"))
-
-MIN_AUDIO_SAMPLING_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "20"))
-MAX_AUDIO_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+THRESHOLD = float(os.getenv("THRESHOLD", "0.4"))
+MAX_AUDIO_DURATION_S = float(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+MIN_AUDIO_SAMPLING_DURATION_MS = int(os.getenv("MIN_AUDIO_SAMPLING_DURATION_MS", "500"))
 
 
 def main():
@@ -26,10 +26,11 @@ def main():
             last_audios += [audio]
             last_audios = last_audios[-100:]
             audio = np.concatenate(last_audios)
+            sr = event["metadata"].get("sample_rate", 16000)
             speech_timestamps = get_speech_timestamps(
                 torch.from_numpy(audio),
                 model,
-                threshold=0.2,
+                threshold=THRESHOLD,
                 min_speech_duration_ms=MIN_SPEECH_DURATION_MS,
                 min_silence_duration_ms=MIN_SILENCE_DURATION_MS,
             )
@@ -37,16 +38,20 @@
             # Check ig there is timestamp
             if (
                 len(speech_timestamps) > 0
-                and len(last_audios) > MIN_AUDIO_SAMPLING_DURAION_S
+                and len(audio) > MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
             ):
                 # Check if the audio is not cut at the end. And only return if there is a long time spent
                 if speech_timestamps[-1]["end"] == len(audio):
+                    node.send_output(
+                        "timestamp_start",
+                        pa.array([speech_timestamps[-1]["start"]]),
+                    )
                     continue
                 audio = audio[0 : speech_timestamps[-1]["end"]]
                 node.send_output("audio", pa.array(audio))
                 last_audios = [audio[speech_timestamps[-1]["end"] :]]
             # If there is no sound for too long return the audio
-            elif len(last_audios) > 75:
+            elif len(last_audios) > MAX_AUDIO_DURATION_S:
                 node.send_output("audio", pa.array(audio))
                 last_audios = []
 
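Usage note (not part of the patch): the tuning knobs introduced above are plain environment variables (ACTIVATION_WORDS for dora-qwen; THRESHOLD, MIN_SILENCE_DURATION_MS, MIN_SPEECH_DURATION_MS, MAX_AUDIO_DURATION_S, MIN_AUDIO_SAMPLING_DURATION_MS for dora-vad), so they can be overridden per node in the dataflow YAML. A minimal sketch, assuming the dataflow spec's env: key and using illustrative values only:

  - id: dora-vad
    build: pip install -e ../../node-hub/dora-vad
    path: dora-vad
    env:
      THRESHOLD: "0.5"            # illustrative: raise to trigger less often
      MAX_AUDIO_DURATION_S: "60"  # illustrative: flush buffered audio sooner
    inputs:
      audio: dora-microphone/audio
    outputs:
      - audio
      - timestamp_start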