
Add interruption within llm audio demo

haixuantao committed 10 months ago · commit 469c4ce77c · tag v0.3.10-rc3
6 changed files with 38 additions and 24 deletions

1. Cargo.lock (+0, -10)
2. examples/llm/qwen-dev.yml (+2, -0)
3. node-hub/dora-microphone/dora_microphone/main.py (+4, -1)
4. node-hub/dora-pyaudio/dora_pyaudio/main.py (+19, -5)
5. node-hub/dora-qwen/dora_qwen/main.py (+1, -1)
6. node-hub/dora-vad/dora_vad/main.py (+12, -7)

Cargo.lock (+0, -10)

@@ -7566,7 +7566,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179"
 dependencies = [
  "once_cell",
- "python3-dll-a",
  "target-lexicon",
 ]

@@ -7628,15 +7627,6 @@ dependencies = [
  "syn 2.0.94",
 ]

-[[package]]
-name = "python3-dll-a"
-version = "0.2.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49fe4227a288cf9493942ad0220ea3f185f4d1f2a14f197f7344d6d02f4ed4ed"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "pythonize"
 version = "0.22.0"


examples/llm/qwen-dev.yml (+2, -0)

@@ -14,6 +14,7 @@ nodes:
       audio: dora-microphone/audio
     outputs:
       - audio
+      - timestamp_start

   - id: dora-distil-whisper
     build: pip install -e ../../node-hub/dora-distil-whisper
@@ -55,3 +56,4 @@ nodes:
     path: dora-pyaudio
     inputs:
       audio: dora-kokoro-tts/audio
+      timestamp_start: dora-vad/timestamp_start
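This wiring is what carries the interruption: dora-vad raises timestamp_start the moment it detects fresh speech, and dora-pyaudio consumes that signal to cut playback short. A hypothetical debug node (not part of this commit) attached to the same output would show the signal going by:

from dora import Node

node = Node()
for event in node:
    if event["type"] == "INPUT" and event["id"] == "timestamp_start":
        # dora-vad sends the sample index where the new speech segment begins.
        print("speech detected at sample", event["value"].to_numpy()[0])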

node-hub/dora-microphone/dora_microphone/main.py (+4, -1)

@@ -35,7 +35,10 @@ def main():

     # Start recording
     with sd.InputStream(
-        callback=callback, dtype=np.int16, channels=1, samplerate=SAMPLE_RATE,
+        callback=callback,
+        dtype=np.int16,
+        channels=1,
+        samplerate=SAMPLE_RATE,
     ):
         while not finished:
             sd.sleep(1000)
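The change above is formatting only. For context, this is roughly the capture pattern the node relies on; a minimal standalone sketch assuming the sounddevice and numpy packages, with the one-second duration and buffer handling purely illustrative:

import numpy as np
import sounddevice as sd

SAMPLE_RATE = 16000
chunks = []

def callback(indata, frames, time, status):
    # indata is an int16 array of shape (frames, channels)
    chunks.append(indata.copy())

with sd.InputStream(
    callback=callback,
    dtype=np.int16,
    channels=1,
    samplerate=SAMPLE_RATE,
):
    sd.sleep(1000)  # record for one second

print(np.concatenate(chunks).shape)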

node-hub/dora-pyaudio/dora_pyaudio/main.py (+19, -5)

@@ -18,8 +18,7 @@ def play_audio(
 ) -> pyaudio.Stream:
     """Play audio using pyaudio and replace stream if already exists"""
     if np.issubdtype(audio_array.dtype, np.floating):
-        max_val = np.max(np.abs(audio_array))
-        audio_array = (audio_array / max_val) * 32767
+        audio_array = audio_array * 70_000
         audio_array = audio_array.astype(np.int16)
     if stream is None:
         stream = p.open(
@@ -36,12 +35,27 @@ def main():
     """Main function for the node"""
     node = Node()
     stream = None
-    for event in node:
-        if event["type"] == "INPUT":
+    audio = np.array([])
+    sr = SAMPLE_RATE
+    i = 0
+    while True:
+        event = node.next(timeout=0.01)
+        if event is None:
+            break
+        elif event["type"] == "INPUT":
+            if event["id"] == "audio":
                 audio = event["value"].to_numpy()
                 sr = event["metadata"].get("sample_rate", SAMPLE_RATE)
-            stream = play_audio(audio, sr, stream)
+                stream = play_audio(audio[0 : sr // 10], sr, stream)
+                i = sr // 10
+            else:
+                audio = np.array([])
+                i = 0
+        elif event["type"] == "ERROR":
+            if i < len(audio):
+                stream = play_audio(audio[i : i + sr // 10], sr, stream)
+                i += sr // 10

     if stream is not None:
         stream.stop_stream()
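This new loop is where playback becomes interruptible: each audio input starts playing in 100 ms slices, the ERROR branch (which this node reaches when node.next times out with no event) drains the next slice, and any other input, such as timestamp_start, clears the buffer so playback stops mid-utterance. The same scheduling pattern in isolation, simulated with a plain queue instead of dora and pyaudio (all names here are illustrative, not code from this commit):

import queue

import numpy as np

CHUNK_S = 0.1  # play 100 ms per tick, mirroring sr // 10 above
events = queue.Queue()

def playback_loop(sr=16000):
    audio = np.array([], dtype=np.int16)
    i = 0
    chunk = int(sr * CHUNK_S)
    while True:
        try:
            kind, payload = events.get(timeout=CHUNK_S)
        except queue.Empty:
            # No event arrived: keep draining the current buffer, one chunk per tick.
            if i < len(audio):
                print(f"playing samples {i}:{i + chunk}")
                i += chunk
            continue
        if kind == "audio":
            audio, i = payload, 0  # new utterance: start from the beginning
        elif kind == "interrupt":
            audio, i = np.array([], dtype=np.int16), 0  # user spoke: drop the rest
        elif kind == "stop":
            break

Calling events.put(("interrupt", None)) while a long clip is still draining is the moment the assistant goes quiet.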


node-hub/dora-qwen/dora_qwen/main.py (+1, -1)

@@ -37,7 +37,7 @@ def get_model_huggingface():
     return model, tokenizer


-ACTIVATION_WORDS = ["what", "how", "who", "where", "you"]
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()


def generate_hf(model, tokenizer, prompt: str, history) -> str:
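With this change the activation vocabulary can be overridden per deployment instead of being hard-coded. A small sketch of the behaviour; the should_reply gating is a guess at how the node uses the list, not code from this commit:

import os

os.environ["ACTIVATION_WORDS"] = "why when you"  # e.g. set in the dataflow yml

words = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
text = "why is the sky blue"
should_reply = any(word in text.lower().split() for word in words)
print(words, should_reply)  # ['why', 'when', 'you'] True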


node-hub/dora-vad/dora_vad/main.py (+12, -7)

@@ -7,11 +7,11 @@ from dora import Node
 from silero_vad import get_speech_timestamps, load_silero_vad

 model = load_silero_vad()
-MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "100"))
+MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "200"))
 MIN_SPEECH_DURATION_MS = int(os.getenv("MIN_SPEECH_DURATION_MS", "300"))
-MIN_AUDIO_SAMPLING_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "20"))
-MAX_AUDIO_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+THRESHOLD = float(os.getenv("THRESHOLD", "0.4"))
+MAX_AUDIO_DURATION_S = float(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+MIN_AUDIO_SAMPLING_DURATION_MS = int(os.getenv("MIN_AUDIO_SAMPLING_DURATION_MS", "500"))


 def main():
@@ -26,10 +26,11 @@ def main():
             last_audios += [audio]
             last_audios = last_audios[-100:]
             audio = np.concatenate(last_audios)
+            sr = event["metadata"].get("sample_rate", 16000)
             speech_timestamps = get_speech_timestamps(
                 torch.from_numpy(audio),
                 model,
-                threshold=0.2,
+                threshold=THRESHOLD,
                 min_speech_duration_ms=MIN_SPEECH_DURATION_MS,
                 min_silence_duration_ms=MIN_SILENCE_DURATION_MS,
             )
@@ -37,16 +38,20 @@ def main():
             # Check if there is a timestamp
             if (
                 len(speech_timestamps) > 0
-                and len(last_audios) > MIN_AUDIO_SAMPLING_DURAION_S
+                and len(audio) > MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
             ):
                 # Check that the audio is not cut off at the end, and only return once enough audio has accumulated
                 if speech_timestamps[-1]["end"] == len(audio):
+                    node.send_output(
+                        "timestamp_start",
+                        pa.array([speech_timestamps[-1]["start"]]),
+                    )
                     continue
                 audio = audio[0 : speech_timestamps[-1]["end"]]
                 node.send_output("audio", pa.array(audio))
                 last_audios = [audio[speech_timestamps[-1]["end"] :]]

             # If there is no sound for too long, return the audio
-            elif len(last_audios) > 75:
+            elif len(last_audios) > MAX_AUDIO_DURATION_S:
                 node.send_output("audio", pa.array(audio))
                 last_audios = []
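For reference, the silero-vad call can be exercised standalone with the same tunables. This sketch substitutes random noise for microphone audio, so it will typically report no speech segments:

import os

import numpy as np
import torch
from silero_vad import get_speech_timestamps, load_silero_vad

model = load_silero_vad()
sr = 16000
audio = np.random.uniform(-1.0, 1.0, sr * 2).astype(np.float32)  # 2 s placeholder

speech_timestamps = get_speech_timestamps(
    torch.from_numpy(audio),
    model,
    threshold=float(os.getenv("THRESHOLD", "0.4")),
    min_speech_duration_ms=int(os.getenv("MIN_SPEECH_DURATION_MS", "300")),
    min_silence_duration_ms=int(os.getenv("MIN_SILENCE_DURATION_MS", "200")),
)
# Each entry maps "start"/"end" to sample indices; the node treats an utterance
# as complete when the last segment ends before the end of the buffer.
print(speech_timestamps)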
