Browse Source

Improve the VLM example by adding speech-to-text

tags/v0.3.9-rc1
haixuantao 1 year ago
parent
commit
fc8bc8a4fe
5 changed files with 50 additions and 16 deletions
  1. +6
    -1
      examples/vlm/README.md
  2. +36
    -6
      examples/vlm/dataflow.yml
  3. +0
    -0
      examples/vlm/vision_only.yml
  4. +7
    -8
      node-hub/dora-microphone/dora_microphone/main.py
  5. +1
    -1
      node-hub/dora-qwenvl/pyproject.toml

+ 6
- 1
examples/vlm/README.md View File

@@ -3,8 +3,13 @@
Make sure to have, dora, pip and cargo installed.

```bash
dora build vision_only.yml
dora run vision_only.yml

# Wait for the qwenvl model to download, which can take a bit of time.

dora build dataflow.yml
dora run dataflow.yml

# Wait for the qwenvl model to download which can takes a bit of time.
# Wait for the qwenvl and whisper models to download, which can take a bit of time.
```

+ 36
- 6
examples/vlm/dataflow.yml View File

@@ -1,4 +1,30 @@
nodes:
- id: dora-microphone
build: pip install -e ../../node-hub/dora-microphone
path: dora-microphone
inputs:
tick: dora/timer/millis/2000
outputs:
- audio

- id: dora-vad
build: pip install -e ../../node-hub/dora-vad
path: dora-vad
inputs:
audio: dora-microphone/audio
outputs:
- audio

- id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper
inputs:
input: dora-vad/audio
outputs:
- text
env:
TARGET_LANGUAGE: english

- id: camera
build: pip install -e ../../node-hub/opencv-video-capture
path: opencv-video-capture
@@ -18,20 +44,24 @@ nodes:
image:
source: camera/image
queue_size: 1
tick: dora/timer/millis/400
text: dora-distil-whisper/text
outputs:
- text
- tick
env:
DEFAULT_QUESTION: Describe the image in a very short sentence.
# For China
# USE_MODELSCOPE_HUB: true

- id: plot
build: pip install -e ../../node-hub/opencv-plot
path: opencv-plot
build: cargo build -p dora-rerun --release
path: dora-rerun
inputs:
image:
source: camera/image
queue_size: 1
text: dora-qwenvl/tick
text_qwenvl: dora-qwenvl/text
text_whisper: dora-distil-whisper/text
env:
IMAGE_WIDTH: 640
IMAGE_HEIGHT: 480
README: |
# Visualization of QwenVL2

examples/vlm/dataflow_rerun.yml → examples/vlm/vision_only.yml View File


+ 7
- 8
node-hub/dora-microphone/dora_microphone/main.py View File

@@ -17,14 +17,18 @@ def main():
node = Node()

always_none = node.next(timeout=0.001) is None
finished = False

# pylint: disable=unused-argument
def callback(indata, frames, time, status):
nonlocal buffer, node, start_recording_time
nonlocal buffer, node, start_recording_time, finished

if tm.time() - start_recording_time > MAX_DURATION:
audio_data = np.array(buffer).ravel().astype(np.float32) / 32768.0
node.send_output("audio", pa.array(audio_data))
if not always_none:
event = node.next(timeout=0.001)
finished = event is None
buffer = []
start_recording_time = tm.time()
else:
@@ -34,10 +38,5 @@ def main():
with sd.InputStream(
callback=callback, dtype=np.int16, channels=1, samplerate=SAMPLE_RATE
):
event_stream_is_none = False
while not event_stream_is_none:
if not always_none:
event = node.next()
event_stream_is_none = event is None
else:
sd.sleep(int(1000))
while not finished:
sd.sleep(int(1000))

+ 1
- 1
node-hub/dora-qwenvl/pyproject.toml View File

@@ -15,7 +15,7 @@ python = "^3.7"
dora-rs = "^0.3.6"
numpy = "< 2.0.0"
torch = "^2.2.0"
torchvision = "^0.19"
torchvision = "^0.20"
transformers = "^4.45"
qwen-vl-utils = "^0.0.2"
accelerate = "^0.33"


Loading…
Cancel
Save