Browse Source

Improve the VLM example by adding speech-to-text

tags/v0.3.9-rc1
haixuantao 1 year ago
parent
commit
fc8bc8a4fe
5 changed files with 50 additions and 16 deletions
  1. +6
    -1
      examples/vlm/README.md
  2. +36
    -6
      examples/vlm/dataflow.yml
  3. +0
    -0
      examples/vlm/vision_only.yml
  4. +7
    -8
      node-hub/dora-microphone/dora_microphone/main.py
  5. +1
    -1
      node-hub/dora-qwenvl/pyproject.toml

+ 6
- 1
examples/vlm/README.md View File

@@ -3,8 +3,13 @@
Make sure to have, dora, pip and cargo installed.

```bash
dora build vision_only.yml
dora run vision_only.yml

# Wait for the qwenvl model to download, which can take a bit of time.

dora build dataflow.yml
dora run dataflow.yml

# Wait for the qwenvl model to download which can takes a bit of time.
# Wait for the qwenvl and whisper models to download, which can take a bit of time.
```

+ 36
- 6
examples/vlm/dataflow.yml View File

@@ -1,4 +1,30 @@
nodes:
- id: dora-microphone
build: pip install -e ../../node-hub/dora-microphone
path: dora-microphone
inputs:
tick: dora/timer/millis/2000
outputs:
- audio

- id: dora-vad
build: pip install -e ../../node-hub/dora-vad
path: dora-vad
inputs:
audio: dora-microphone/audio
outputs:
- audio

- id: dora-distil-whisper
build: pip install -e ../../node-hub/dora-distil-whisper
path: dora-distil-whisper
inputs:
input: dora-vad/audio
outputs:
- text
env:
TARGET_LANGUAGE: english

- id: camera
build: pip install -e ../../node-hub/opencv-video-capture
path: opencv-video-capture
@@ -18,20 +44,24 @@ nodes:
image:
source: camera/image
queue_size: 1
tick: dora/timer/millis/400
text: dora-distil-whisper/text
outputs:
- text
- tick
env:
DEFAULT_QUESTION: Describe the image in a very short sentence.
# For China
# USE_MODELSCOPE_HUB: true

- id: plot
build: pip install -e ../../node-hub/opencv-plot
path: opencv-plot
build: cargo build -p dora-rerun --release
path: dora-rerun
inputs:
image:
source: camera/image
queue_size: 1
text: dora-qwenvl/tick
text_qwenvl: dora-qwenvl/text
text_whisper: dora-distil-whisper/text
env:
IMAGE_WIDTH: 640
IMAGE_HEIGHT: 480
README: |
# Visualization of QwenVL2

examples/vlm/dataflow_rerun.yml → examples/vlm/vision_only.yml View File


+ 7
- 8
node-hub/dora-microphone/dora_microphone/main.py View File

@@ -17,14 +17,18 @@ def main():
node = Node()

always_none = node.next(timeout=0.001) is None
finished = False

# pylint: disable=unused-argument
def callback(indata, frames, time, status):
nonlocal buffer, node, start_recording_time
nonlocal buffer, node, start_recording_time, finished

if tm.time() - start_recording_time > MAX_DURATION:
audio_data = np.array(buffer).ravel().astype(np.float32) / 32768.0
node.send_output("audio", pa.array(audio_data))
if not always_none:
event = node.next(timeout=0.001)
finished = event is None
buffer = []
start_recording_time = tm.time()
else:
@@ -34,10 +38,5 @@ def main():
with sd.InputStream(
callback=callback, dtype=np.int16, channels=1, samplerate=SAMPLE_RATE
):
event_stream_is_none = False
while not event_stream_is_none:
if not always_none:
event = node.next()
event_stream_is_none = event is None
else:
sd.sleep(int(1000))
while not finished:
sd.sleep(int(1000))

+ 1
- 1
node-hub/dora-qwenvl/pyproject.toml View File

@@ -15,7 +15,7 @@ python = "^3.7"
dora-rs = "^0.3.6"
numpy = "< 2.0.0"
torch = "^2.2.0"
torchvision = "^0.19"
torchvision = "^0.20"
transformers = "^4.45"
qwen-vl-utils = "^0.0.2"
accelerate = "^0.33"


Loading…
Cancel
Save