|
- """Module for speech recognition using Whisper.
-
- This module provides functionality for capturing audio input and converting
- it to text using the Whisper speech recognition model.
- """
-
- import numpy as np
- import pyarrow as pa
- import sounddevice as sd
- import whisper
- from dora import Node
- from pynput import keyboard
- from pynput.keyboard import Events, Key
-
- model = whisper.load_model("base")
-
- SAMPLE_RATE = 16000
-
- node = Node()
-
-
def get_text(duration: float) -> dict:
    """Record audio with sounddevice and transcribe it with Whisper.

    Args:
        duration: Length of the recording in seconds.

    Returns:
        dict: The value returned by ``model.transcribe``. Callers in this
            file read the transcription from its ``"text"`` entry.

    """
    # Microphone: record a single int16 channel at SAMPLE_RATE.
    audio_data = sd.rec(
        int(SAMPLE_RATE * duration),
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype=np.int16,
        blocking=True,
    )

    # Flatten to 1-D and scale the int16 samples by 1/32768 into float32.
    audio = audio_data.ravel().astype(np.float32) / 32768.0

    # Speech to text: pad/trim the clip before handing it to the model.
    audio = whisper.pad_or_trim(audio)
    return model.transcribe(audio, language="en")
-
-
# Check for keyboard events: right Alt records 5 s and publishes on
# "text_llm"; right Ctrl records 3 s and publishes on "text_policy".
with keyboard.Events() as key_events:
    for dora_event in node:
        if dora_event["type"] != "INPUT":
            continue

        # Poll the keyboard queue once per dora input (0.1 passed as timeout).
        pressed = key_events.get(0.1)
        if pressed is None:
            continue
        if not (pressed.key == Key.alt_r or pressed.key == Key.ctrl_r):
            continue
        if not isinstance(pressed, Events.Press):
            continue

        # Only the two hotkeys reach this point, so "not alt_r" means ctrl_r.
        if pressed.key == Key.alt_r:
            seconds, output_id = 5, "text_llm"
        else:
            seconds, output_id = 3, "text_policy"

        transcription = get_text(seconds)
        node.send_output(
            output_id,
            pa.array([transcription["text"]]),
            dora_event["metadata"],
        )
|