
Add interruption within llm audio demo

commit 469c4ce77c (tags/v0.3.10-rc3)
haixuantao, 10 months ago
6 changed files with 38 additions and 24 deletions
  1. Cargo.lock (+0 / -10)
  2. examples/llm/qwen-dev.yml (+2 / -0)
  3. node-hub/dora-microphone/dora_microphone/main.py (+4 / -1)
  4. node-hub/dora-pyaudio/dora_pyaudio/main.py (+19 / -5)
  5. node-hub/dora-qwen/dora_qwen/main.py (+1 / -1)
  6. node-hub/dora-vad/dora_vad/main.py (+12 / -7)

Cargo.lock (+0 / -10)

@@ -7566,7 +7566,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179"
 dependencies = [
  "once_cell",
- "python3-dll-a",
  "target-lexicon",
 ]

@@ -7628,15 +7627,6 @@ dependencies = [
  "syn 2.0.94",
 ]

-[[package]]
-name = "python3-dll-a"
-version = "0.2.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49fe4227a288cf9493942ad0220ea3f185f4d1f2a14f197f7344d6d02f4ed4ed"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "pythonize"
 version = "0.22.0"

examples/llm/qwen-dev.yml (+2 / -0)

@@ -14,6 +14,7 @@ nodes:
       audio: dora-microphone/audio
     outputs:
       - audio
+      - timestamp_start
 
   - id: dora-distil-whisper
     build: pip install -e ../../node-hub/dora-distil-whisper
@@ -55,3 +56,4 @@ nodes:
     path: dora-pyaudio
     inputs:
       audio: dora-kokoro-tts/audio
+      timestamp_start: dora-vad/timestamp_start
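
The new timestamp_start edge carries the interruption signal: dora-vad announces when fresh speech begins, and dora-pyaudio uses that to abandon whatever TTS audio it is still playing. A minimal sketch of a node consuming such an input, assuming dora's Python API and a hypothetical `pending` buffer (the real handling lives in dora-pyaudio's loop below):

import numpy as np
from dora import Node

node = Node()
pending = np.array([])  # TTS audio still queued for playback

for event in node:
    if event["type"] == "INPUT":
        if event["id"] == "audio":
            pending = event["value"].to_numpy()
        elif event["id"] == "timestamp_start":
            # The user started speaking again: drop the queued audio.
            pending = np.array([])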

node-hub/dora-microphone/dora_microphone/main.py (+4 / -1)

@@ -35,7 +35,10 @@ def main():
 
     # Start recording
     with sd.InputStream(
-        callback=callback, dtype=np.int16, channels=1, samplerate=SAMPLE_RATE,
+        callback=callback,
+        dtype=np.int16,
+        channels=1,
+        samplerate=SAMPLE_RATE,
    ):
         while not finished:
             sd.sleep(1000)

node-hub/dora-pyaudio/dora_pyaudio/main.py (+19 / -5)

@@ -18,8 +18,7 @@ def play_audio(
 ) -> pyaudio.Stream:
     """Play audio using pyaudio and replace stream if already exists"""
     if np.issubdtype(audio_array.dtype, np.floating):
-        max_val = np.max(np.abs(audio_array))
-        audio_array = (audio_array / max_val) * 32767
+        audio_array = audio_array * 70_000
         audio_array = audio_array.astype(np.int16)
     if stream is None:
         stream = p.open(
@@ -36,12 +35,27 @@ def main():
     """Main function for the node"""
     node = Node()
     stream = None
-    for event in node:
-        if event["type"] == "INPUT":
+    audio = np.array([])
+    sr = SAMPLE_RATE
+    i = 0
+    while True:
+        event = node.next(timeout=0.01)
+        if event is None:
+            break
+        elif event["type"] == "INPUT":
             if event["id"] == "audio":
                 audio = event["value"].to_numpy()
                 sr = event["metadata"].get("sample_rate", SAMPLE_RATE)
-                stream = play_audio(audio, sr, stream)
+                stream = play_audio(audio[0 : sr // 10], sr, stream)
+                i = sr // 10
+            else:
+                audio = np.array([])
+                i = 0
+        elif event["type"] == "ERROR":
+            if i < len(audio):
+                stream = play_audio(audio[i : i + sr // 10], sr, stream)
+                i += sr // 10
+
 
     if stream is not None:
         stream.stop_stream()
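
This rewritten loop is where interruption actually happens: instead of handing a whole utterance to play_audio, the node plays roughly 100 ms slices (sr // 10 samples) and polls node.next(timeout=0.01) between slices, using the timeout ("ERROR") branch to keep feeding the stream; any non-audio input, such as timestamp_start, empties the buffer. A minimal sketch of the same chunked-playback pattern outside dora, using sounddevice for brevity (the helper name and the interrupted callback are assumptions, not part of the diff):

import numpy as np
import sounddevice as sd  # assumption: any blocking playback backend would do

def play_in_chunks(audio: np.ndarray, sr: int, interrupted) -> None:
    """Play audio in ~100 ms slices so an interruption can cut in between slices."""
    step = sr // 10  # same chunk size the diff uses
    for i in range(0, len(audio), step):
        if interrupted():
            return  # the listener spoke: drop the rest of the utterance
        sd.play(audio[i : i + step], samplerate=sr, blocking=True)

# usage sketch: play_in_chunks(tts_audio, 16000, lambda: user_is_speaking)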


node-hub/dora-qwen/dora_qwen/main.py (+1 / -1)

@@ -37,7 +37,7 @@ def get_model_huggingface():
     return model, tokenizer
 
 
-ACTIVATION_WORDS = ["what", "how", "who", "where", "you"]
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
 
 
 def generate_hf(model, tokenizer, prompt: str, history) -> str:
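
ACTIVATION_WORDS is now read from the environment as a space-separated list, with the former hard-coded words as the default, so the wake words can be changed without editing the node. A small sketch of the resulting behavior (the gating check is an assumption about how the node uses the list):

import os

# "hey robot listen" -> ["hey", "robot", "listen"]; unset -> the old defaults.
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()

text = "how are you doing"
if any(word in ACTIVATION_WORDS for word in text.split()):
    print("prompt forwarded to the LLM")  # assumed gating behavior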


node-hub/dora-vad/dora_vad/main.py (+12 / -7)

@@ -7,11 +7,11 @@ from dora import Node
 from silero_vad import get_speech_timestamps, load_silero_vad
 
 model = load_silero_vad()
-MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "100"))
+MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "200"))
 MIN_SPEECH_DURATION_MS = int(os.getenv("MIN_SPEECH_DURATION_MS", "300"))
-MIN_AUDIO_SAMPLING_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "20"))
-MAX_AUDIO_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "75"))
-
+THRESHOLD = float(os.getenv("THRESHOLD", "0.4"))
+MAX_AUDIO_DURATION_S = float(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+MIN_AUDIO_SAMPLING_DURATION_MS = int(os.getenv("MIN_AUDIO_SAMPLING_DURATION_MS", "500"))
 
 
 def main():
@@ -26,10 +26,11 @@ def main():
             last_audios += [audio]
             last_audios = last_audios[-100:]
             audio = np.concatenate(last_audios)
+            sr = event["metadata"].get("sample_rate", 16000)
             speech_timestamps = get_speech_timestamps(
                 torch.from_numpy(audio),
                 model,
-                threshold=0.2,
+                threshold=THRESHOLD,
                 min_speech_duration_ms=MIN_SPEECH_DURATION_MS,
                 min_silence_duration_ms=MIN_SILENCE_DURATION_MS,
             )
@@ -37,16 +38,20 @@ def main():
             # Check if there is a timestamp
             if (
                 len(speech_timestamps) > 0
-                and len(last_audios) > MIN_AUDIO_SAMPLING_DURAION_S
+                and len(audio) > MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
             ):
                 # Check that the audio is not cut off at the end; only return complete utterances
                 if speech_timestamps[-1]["end"] == len(audio):
+                    node.send_output(
+                        "timestamp_start",
+                        pa.array([speech_timestamps[-1]["start"]]),
+                    )
                     continue
                 audio = audio[0 : speech_timestamps[-1]["end"]]
                 node.send_output("audio", pa.array(audio))
                 last_audios = [audio[speech_timestamps[-1]["end"] :]]
 
             # If there is no sound for too long return the audio
-            elif len(last_audios) > 75:
+            elif len(last_audios) > MAX_AUDIO_DURATION_S:
                 node.send_output("audio", pa.array(audio))
                 last_audios = []
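
This new branch is the producer side of the interruption: when the detected speech segment runs all the way to the end of the buffer, the speaker is still mid-utterance, so instead of silently waiting the node now publishes timestamp_start. A condensed sketch of the decision rule, assuming silero-vad style timestamps (sample offsets in [{"start": ..., "end": ...}]); the helper name is hypothetical:

import pyarrow as pa

def forward(node, audio, speech_timestamps):
    """Condensed decision rule from the diff above."""
    if not speech_timestamps:
        return
    last = speech_timestamps[-1]
    if last["end"] == len(audio):
        # Speech reaches the buffer's end: the user is still talking,
        # so signal downstream players to interrupt ongoing playback.
        node.send_output("timestamp_start", pa.array([last["start"]]))
    else:
        # The utterance ended inside the buffer: emit the completed speech.
        node.send_output("audio", pa.array(audio[: last["end"]]))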
