diff --git a/examples/vlm/README.md b/examples/vlm/README.md index 471586de..92b22669 100644 --- a/examples/vlm/README.md +++ b/examples/vlm/README.md @@ -3,8 +3,13 @@ Make sure to have, dora, pip and cargo installed. ```bash +dora build vision_only.yml +dora run vision_only.yml + +# Wait for the qwenvl model to download, which can take a bit of time. + dora build dataflow.yml dora run dataflow.yml -# Wait for the qwenvl model to download which can takes a bit of time. +# Wait for the qwenvl and whisper models to download, which can take a bit of time. ``` diff --git a/examples/vlm/dataflow.yml b/examples/vlm/dataflow.yml index 51908b37..2c0846bb 100644 --- a/examples/vlm/dataflow.yml +++ b/examples/vlm/dataflow.yml @@ -1,4 +1,30 @@ nodes: + - id: dora-microphone + build: pip install -e ../../node-hub/dora-microphone + path: dora-microphone + inputs: + tick: dora/timer/millis/2000 + outputs: + - audio + + - id: dora-vad + build: pip install -e ../../node-hub/dora-vad + path: dora-vad + inputs: + audio: dora-microphone/audio + outputs: + - audio + + - id: dora-distil-whisper + build: pip install -e ../../node-hub/dora-distil-whisper + path: dora-distil-whisper + inputs: + input: dora-vad/audio + outputs: + - text + env: + TARGET_LANGUAGE: english + - id: camera build: pip install -e ../../node-hub/opencv-video-capture path: opencv-video-capture @@ -18,20 +44,24 @@ nodes: image: source: camera/image queue_size: 1 - tick: dora/timer/millis/400 + text: dora-distil-whisper/text outputs: - text - - tick env: DEFAULT_QUESTION: Describe the image in a very short sentence. 
- # For China # USE_MODELSCOPE_HUB: true - id: plot - build: pip install -e ../../node-hub/opencv-plot - path: opencv-plot + build: cargo build -p dora-rerun --release + path: dora-rerun inputs: image: source: camera/image queue_size: 1 - text: dora-qwenvl/tick + text_qwenvl: dora-qwenvl/text + text_whisper: dora-distil-whisper/text + env: + IMAGE_WIDTH: 640 + IMAGE_HEIGHT: 480 + README: | + # Visualization of QwenVL2 diff --git a/examples/vlm/dataflow_rerun.yml b/examples/vlm/vision_only.yml similarity index 100% rename from examples/vlm/dataflow_rerun.yml rename to examples/vlm/vision_only.yml diff --git a/node-hub/dora-microphone/dora_microphone/main.py b/node-hub/dora-microphone/dora_microphone/main.py index d709dc7b..aa8e03c6 100644 --- a/node-hub/dora-microphone/dora_microphone/main.py +++ b/node-hub/dora-microphone/dora_microphone/main.py @@ -17,14 +17,18 @@ def main(): node = Node() always_none = node.next(timeout=0.001) is None + finished = False # pylint: disable=unused-argument def callback(indata, frames, time, status): - nonlocal buffer, node, start_recording_time + nonlocal buffer, node, start_recording_time, finished if tm.time() - start_recording_time > MAX_DURATION: audio_data = np.array(buffer).ravel().astype(np.float32) / 32768.0 node.send_output("audio", pa.array(audio_data)) + if not always_none: + event = node.next(timeout=0.001) + finished = event is None buffer = [] start_recording_time = tm.time() else: @@ -34,10 +38,5 @@ def main(): with sd.InputStream( callback=callback, dtype=np.int16, channels=1, samplerate=SAMPLE_RATE ): - event_stream_is_none = False - while not event_stream_is_none: - if not always_none: - event = node.next() - event_stream_is_none = event is None - else: - sd.sleep(int(1000)) + while not finished: + sd.sleep(int(1000)) diff --git a/node-hub/dora-qwenvl/pyproject.toml b/node-hub/dora-qwenvl/pyproject.toml index a662ded8..346c5f9f 100644 --- a/node-hub/dora-qwenvl/pyproject.toml +++ 
b/node-hub/dora-qwenvl/pyproject.toml @@ -15,7 +15,7 @@ python = "^3.7" dora-rs = "^0.3.6" numpy = "< 2.0.0" torch = "^2.2.0" -torchvision = "^0.19" +torchvision = "^0.20" transformers = "^4.45" qwen-vl-utils = "^0.0.2" accelerate = "^0.33"