
Add interruption within llm audio demo

haixuantao committed 10 months ago · commit 469c4ce77c · tag v0.3.10-rc3
6 changed files with 38 additions and 24 deletions

1. Cargo.lock (+0, -10)
2. examples/llm/qwen-dev.yml (+2, -0)
3. node-hub/dora-microphone/dora_microphone/main.py (+4, -1)
4. node-hub/dora-pyaudio/dora_pyaudio/main.py (+19, -5)
5. node-hub/dora-qwen/dora_qwen/main.py (+1, -1)
6. node-hub/dora-vad/dora_vad/main.py (+12, -7)

Cargo.lock (+0, -10)

@@ -7566,7 +7566,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179"
 dependencies = [
  "once_cell",
- "python3-dll-a",
  "target-lexicon",
 ]

@@ -7628,15 +7627,6 @@ dependencies = [
  "syn 2.0.94",
 ]

-[[package]]
-name = "python3-dll-a"
-version = "0.2.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49fe4227a288cf9493942ad0220ea3f185f4d1f2a14f197f7344d6d02f4ed4ed"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "pythonize"
 version = "0.22.0"


examples/llm/qwen-dev.yml (+2, -0)

@@ -14,6 +14,7 @@ nodes:
       audio: dora-microphone/audio
     outputs:
       - audio
+      - timestamp_start

   - id: dora-distil-whisper
     build: pip install -e ../../node-hub/dora-distil-whisper
@@ -55,3 +56,4 @@ nodes:
     path: dora-pyaudio
     inputs:
       audio: dora-kokoro-tts/audio
+      timestamp_start: dora-vad/timestamp_start
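This wiring is what carries the interruption: dora-vad raises timestamp_start the moment it detects fresh speech, and dora-pyaudio consumes that signal to cut playback short. A hypothetical debug node (not part of this commit) attached to the same output would show the signal going by:

from dora import Node

node = Node()
for event in node:
    if event["type"] == "INPUT" and event["id"] == "timestamp_start":
        # dora-vad sends the sample index where the new speech segment begins.
        print("speech detected at sample", event["value"].to_numpy()[0])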

node-hub/dora-microphone/dora_microphone/main.py (+4, -1)

@@ -35,7 +35,10 @@ def main():

     # Start recording
     with sd.InputStream(
-        callback=callback, dtype=np.int16, channels=1, samplerate=SAMPLE_RATE,
+        callback=callback,
+        dtype=np.int16,
+        channels=1,
+        samplerate=SAMPLE_RATE,
     ):
         while not finished:
             sd.sleep(1000)
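The change above is formatting only. For context, this is roughly the capture pattern the node relies on; a minimal standalone sketch assuming the sounddevice and numpy packages, with the one-second duration and buffer handling purely illustrative:

import numpy as np
import sounddevice as sd

SAMPLE_RATE = 16000
chunks = []

def callback(indata, frames, time, status):
    # indata is an int16 array of shape (frames, channels)
    chunks.append(indata.copy())

with sd.InputStream(
    callback=callback,
    dtype=np.int16,
    channels=1,
    samplerate=SAMPLE_RATE,
):
    sd.sleep(1000)  # record for one second

print(np.concatenate(chunks).shape)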

node-hub/dora-pyaudio/dora_pyaudio/main.py (+19, -5)

@@ -18,8 +18,7 @@ def play_audio(
 ) -> pyaudio.Stream:
     """Play audio using pyaudio and replace stream if already exists"""
     if np.issubdtype(audio_array.dtype, np.floating):
-        max_val = np.max(np.abs(audio_array))
-        audio_array = (audio_array / max_val) * 32767
+        audio_array = audio_array * 70_000
         audio_array = audio_array.astype(np.int16)
     if stream is None:
         stream = p.open(
@@ -36,12 +35,27 @@ def main():
     """Main function for the node"""
     node = Node()
     stream = None
-    for event in node:
-        if event["type"] == "INPUT":
+    audio = np.array([])
+    sr = SAMPLE_RATE
+    i = 0
+    while True:
+        event = node.next(timeout=0.01)
+        if event is None:
+            break
+        elif event["type"] == "INPUT":
+            if event["id"] == "audio":
                 audio = event["value"].to_numpy()
                 sr = event["metadata"].get("sample_rate", SAMPLE_RATE)
-            stream = play_audio(audio, sr, stream)
+                stream = play_audio(audio[0 : sr // 10], sr, stream)
+                i = sr // 10
+            else:
+                audio = np.array([])
+                i = 0
+        elif event["type"] == "ERROR":
+            if i < len(audio):
+                stream = play_audio(audio[i : i + sr // 10], sr, stream)
+                i += sr // 10

     if stream is not None:
         stream.stop_stream()
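This new loop is where playback becomes interruptible: each audio input starts playing in 100 ms slices, the ERROR branch (which this node reaches when node.next times out with no event) drains the next slice, and any other input, such as timestamp_start, clears the buffer so playback stops mid-utterance. The same scheduling pattern in isolation, simulated with a plain queue instead of dora and pyaudio (all names here are illustrative, not code from this commit):

import queue

import numpy as np

CHUNK_S = 0.1  # play 100 ms per tick, mirroring sr // 10 above
events = queue.Queue()

def playback_loop(sr=16000):
    audio = np.array([], dtype=np.int16)
    i = 0
    chunk = int(sr * CHUNK_S)
    while True:
        try:
            kind, payload = events.get(timeout=CHUNK_S)
        except queue.Empty:
            # No event arrived: keep draining the current buffer, one chunk per tick.
            if i < len(audio):
                print(f"playing samples {i}:{i + chunk}")
                i += chunk
            continue
        if kind == "audio":
            audio, i = payload, 0  # new utterance: start from the beginning
        elif kind == "interrupt":
            audio, i = np.array([], dtype=np.int16), 0  # user spoke: drop the rest
        elif kind == "stop":
            break

Calling events.put(("interrupt", None)) while a long clip is still draining is the moment the assistant goes quiet.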


node-hub/dora-qwen/dora_qwen/main.py (+1, -1)

@@ -37,7 +37,7 @@ def get_model_huggingface():
     return model, tokenizer


-ACTIVATION_WORDS = ["what", "how", "who", "where", "you"]
+ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()


def generate_hf(model, tokenizer, prompt: str, history) -> str:
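With this change the activation vocabulary can be overridden per deployment instead of being hard-coded. A small sketch of the behaviour; the should_reply gating is a guess at how the node uses the list, not code from this commit:

import os

os.environ["ACTIVATION_WORDS"] = "why when you"  # e.g. set in the dataflow yml

words = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
text = "why is the sky blue"
should_reply = any(word in text.lower().split() for word in words)
print(words, should_reply)  # ['why', 'when', 'you'] True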


node-hub/dora-vad/dora_vad/main.py (+12, -7)

@@ -7,11 +7,11 @@ from dora import Node
 from silero_vad import get_speech_timestamps, load_silero_vad

 model = load_silero_vad()
-MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "100"))
+MIN_SILENCE_DURATION_MS = int(os.getenv("MIN_SILENCE_DURATION_MS", "200"))
 MIN_SPEECH_DURATION_MS = int(os.getenv("MIN_SPEECH_DURATION_MS", "300"))
-MIN_AUDIO_SAMPLING_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "20"))
-MAX_AUDIO_DURAION_S = int(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+THRESHOLD = float(os.getenv("THRESHOLD", "0.4"))
+MAX_AUDIO_DURATION_S = float(os.getenv("MAX_AUDIO_DURATION_S", "75"))
+MIN_AUDIO_SAMPLING_DURATION_MS = int(os.getenv("MIN_AUDIO_SAMPLING_DURATION_MS", "500"))


 def main():
@@ -26,10 +26,11 @@ def main():
             last_audios += [audio]
             last_audios = last_audios[-100:]
             audio = np.concatenate(last_audios)
+            sr = event["metadata"].get("sample_rate", 16000)
             speech_timestamps = get_speech_timestamps(
                 torch.from_numpy(audio),
                 model,
-                threshold=0.2,
+                threshold=THRESHOLD,
                 min_speech_duration_ms=MIN_SPEECH_DURATION_MS,
                 min_silence_duration_ms=MIN_SILENCE_DURATION_MS,
             )
@@ -37,16 +38,20 @@ def main():
             # Check if there is a timestamp
             if (
                 len(speech_timestamps) > 0
-                and len(last_audios) > MIN_AUDIO_SAMPLING_DURAION_S
+                and len(audio) > MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
             ):
                 # Check that the audio is not cut off at the end, and only return once enough audio has accumulated
                 if speech_timestamps[-1]["end"] == len(audio):
+                    node.send_output(
+                        "timestamp_start",
+                        pa.array([speech_timestamps[-1]["start"]]),
+                    )
                     continue
                 audio = audio[0 : speech_timestamps[-1]["end"]]
                 node.send_output("audio", pa.array(audio))
                 last_audios = [audio[speech_timestamps[-1]["end"] :]]

             # If there is no sound for too long, return the audio
-            elif len(last_audios) > 75:
+            elif len(last_audios) > MAX_AUDIO_DURATION_S:
                 node.send_output("audio", pa.array(audio))
                 last_audios = []
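For reference, the silero-vad call can be exercised standalone with the same tunables. This sketch substitutes random noise for microphone audio, so it will typically report no speech segments:

import os

import numpy as np
import torch
from silero_vad import get_speech_timestamps, load_silero_vad

model = load_silero_vad()
sr = 16000
audio = np.random.uniform(-1.0, 1.0, sr * 2).astype(np.float32)  # 2 s placeholder

speech_timestamps = get_speech_timestamps(
    torch.from_numpy(audio),
    model,
    threshold=float(os.getenv("THRESHOLD", "0.4")),
    min_speech_duration_ms=int(os.getenv("MIN_SPEECH_DURATION_MS", "300")),
    min_silence_duration_ms=int(os.getenv("MIN_SILENCE_DURATION_MS", "200")),
)
# Each entry maps "start"/"end" to sample indices; the node treats an utterance
# as complete when the last segment ends before the end of the buffer.
print(speech_timestamps)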
