
Improve the overall language pipeline to be more resilient

make-qwen-llm-configurable
haixuantao · 5 months ago
commit 2f89cf811d
4 changed files with 218 additions and 170 deletions
  1. node-hub/dora-distil-whisper/dora_distil_whisper/main.py (+45, -40)
  2. node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py (+20, -17)
  3. node-hub/dora-openai-websocket/src/main.rs (+140, -110)
  4. node-hub/dora-vad/dora_vad/main.py (+13, -3)

node-hub/dora-distil-whisper/dora_distil_whisper/main.py (+45, -40)

@@ -48,35 +48,6 @@ def remove_text_noise(text: str, text_noise="") -> str:
text_words = normalized_text.split()
noise_words = normalized_noise.split()

# Function to find and remove noise sequence flexibly
def remove_flexible(text_list, noise_list):
i = 0
while i <= len(text_list) - len(noise_list):
match = True
extra_words = 0
for j, noise_word in enumerate(noise_list):
if i + j + extra_words >= len(text_list):
match = False
break
# Allow skipping extra words in text_list
while (
i + j + extra_words < len(text_list)
and text_list[i + j + extra_words] != noise_word
):
extra_words += 1
if i + j + extra_words >= len(text_list):
match = False
break
if not match:
break
if match:
# Remove matched part
del text_list[i : i + len(noise_list) + extra_words]
i = max(0, i - len(noise_list)) # Adjust index after removal
else:
i += 1
return text_list

# Only remove parts of text_noise that are found in text
cleaned_words = text_words[:]
for noise_word in noise_words:
@@ -126,7 +97,26 @@ BAD_SENTENCES = [
"",
" so",
" So.",
" So, let's go.",
" so so",
" What?",
" We'll see you next time.",
" I'll see you next time.",
" We're going to come back.",
" let's move on.",
" Here we go.",
" my",
" All right. Thank you.",
" That's what we're doing.",
" That's what I wanted to do.",
" I'll be back.",
" Hold this. Hold this.",
" Hold this one. Hold this one.",
" And we'll see you next time.",
" strength.",
" Length.",
" Let's go.",
" Let's do it.",
"You",
"You ",
" You",
@@ -199,6 +189,12 @@ def main():
append_punctuations=".",
language=TARGET_LANGUAGE,
)
result = mlx_whisper.transcribe(
np.array([]),
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
append_punctuations=".",
language=TARGET_LANGUAGE,
)

node = Node()
noise_timestamp = time.time()
@@ -244,6 +240,8 @@ def main():
generate_kwargs=confg,
)
if result["text"] in BAD_SENTENCES:
print("Discarded text: ", result["text"])
# cache_audio = None
continue
text = cut_repetition(result["text"])

@@ -258,20 +256,27 @@ def main():
continue

if (
(
text.endswith(".")
or text.endswith("!")
or text.endswith("?")
or text.endswith('."')
or text.endswith('!"')
or text.endswith('?"')
)
and not text.endswith("...") # Avoid ending with ellipsis
):
text.endswith(".")
or text.endswith("!")
or text.endswith("?")
or text.endswith('."')
or text.endswith('!"')
or text.endswith('?"')
) and not text.endswith("..."):
node.send_output(
"text",
pa.array([text]),
)
node.send_output(
"stop",
pa.array([text]),
)
cache_audio = None
else:
audio = None
print("Text:", text)
elif text.endswith("..."):
print(
"Keeping audio in cache for next text output with punctuation"
)
print("Discarded text", text)
cache_audio = audio
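The change above gates the "text" output on sentence-final punctuation: a transcript is only emitted (together with a "stop" signal) when it ends a sentence, and the audio is kept in cache when the text trails off with an ellipsis. A minimal sketch of that gating rule, with should_emit as a hypothetical helper rather than code from this commit:

def should_emit(text: str) -> bool:
    # A trailing ellipsis marks an unfinished utterance: keep the audio cached.
    if text.endswith("..."):
        return False
    # Otherwise emit only complete sentences, optionally closed by a quote.
    return text.endswith((".", "!", "?", '."', '!"', '?"'))

# Usage: if should_emit(text), send "text" and "stop" and clear the cache;
# otherwise keep the audio so the next chunk can complete the sentence.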

node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py (+20, -17)

@@ -1,4 +1,5 @@
"""TODO: Add docstring."""

import os
import re

@@ -8,11 +9,12 @@ from kokoro import KPipeline

LANGUAGE = os.getenv("LANGUAGE", "en")


def main():
"""TODO: Add docstring."""
if LANGUAGE in ["en", "english"]:
pipeline = KPipeline(lang_code="a")
elif LANGUAGE in ["zh","ch","chinese"]:
elif LANGUAGE in ["zh", "ch", "chinese"]:
pipeline = KPipeline(lang_code="z")
else:
print("warning: Defaulting to english speaker as language not found")
@@ -22,22 +24,23 @@ def main():

for event in node:
if event["type"] == "INPUT":
if event["id"] == "text":
text = event["value"][0].as_py()
if re.findall(r'[\u4e00-\u9fff]+', text):
pipeline = KPipeline(lang_code="z")
elif pipeline.lang_code != "a":
pipeline = KPipeline(lang_code="a") # <= make sure lang_code matches voice

generator = pipeline(
text,
voice="af_heart", # <= change voice here
speed=1.2,
split_pattern=r"\n+",
)
for _, (_, _, audio) in enumerate(generator):
audio = audio.numpy()
node.send_output("audio", pa.array(audio), {"sample_rate": 24000})
text = event["value"][0].as_py()
if re.findall(r"[\u4e00-\u9fff]+", text):
pipeline = KPipeline(lang_code="z")
elif pipeline.lang_code != "a":
pipeline = KPipeline(
lang_code="a"
) # <= make sure lang_code matches voice

generator = pipeline(
text,
voice="af_heart", # <= change voice here
speed=1.2,
split_pattern=r"\n+",
)
for _, (_, _, audio) in enumerate(generator):
audio = audio.numpy()
node.send_output("audio", pa.array(audio), {"sample_rate": 24000})


if __name__ == "__main__":
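The handler above rebuilds the Kokoro pipeline per message based on the script of the incoming text: Chinese characters switch it to the "z" language code, anything else falls back to the English "a" pipeline. A small sketch of that detection, assuming the CJK Unified Ideographs block is the only range that matters here (lang_code_for is a hypothetical helper, not part of this commit):

import re

def lang_code_for(text: str) -> str:
    # Chinese voice ("z") when the text contains CJK ideographs,
    # English voice ("a") otherwise.
    return "z" if re.search(r"[\u4e00-\u9fff]", text) else "a"

# Usage note: only rebuild KPipeline(lang_code=...) when the returned code
# differs from the current pipeline's lang_code, to avoid reloading the model
# on every message.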


node-hub/dora-openai-websocket/src/main.rs (+140, -110)

@@ -30,6 +30,10 @@ use fastwebsockets::Frame;
use fastwebsockets::OpCode;
use fastwebsockets::Payload;
use fastwebsockets::WebSocketError;
use futures_concurrency::future::Race;
use futures_util::future;
use futures_util::future::Either;
use futures_util::FutureExt;
use http_body_util::Empty;
use hyper::body::Bytes;
use hyper::body::Incoming;
@@ -45,9 +49,7 @@ use std::fs;
use std::io::{self, Write};
use std::net::IpAddr;
use std::net::Ipv4Addr;
use std::time::Duration;
use tokio::net::TcpListener;

#[derive(Serialize, Deserialize, Debug)]
pub struct ErrorDetails {
pub code: Option<String>,
@@ -310,125 +312,153 @@ async fn handle_client(fut: upgrade::UpgradeFut) -> Result<(), WebSocketError> {
let frame = Frame::text(payload);
ws.write_frame(frame).await?;
loop {
let mut frame = ws.read_frame().await?;
let event_fut = events.recv_async().map(Either::Left);
let frame_fut = ws.read_frame().map(Either::Right);
let event_stream = (event_fut, frame_fut).race();
let mut finished = false;
match frame.opcode {
OpCode::Close => break,
OpCode::Text | OpCode::Binary => {
let data: OpenAIRealtimeMessage = serde_json::from_slice(&frame.payload).unwrap();

match data {
OpenAIRealtimeMessage::InputAudioBufferAppend { audio } => {
// println!("Received audio data: {}", audio);
let f32_data = audio;
// Decode base64 encoded audio data
let f32_data = f32_data.trim();
if f32_data.is_empty() {
continue;
}
let frame = match event_stream.await {
future::Either::Left(Some(ev)) => {
let frame = match ev {
dora_node_api::Event::Input {
id,
metadata: _,
data,
} => {
if data.data_type() == &DataType::Utf8 {
let data = data.as_string::<i32>();
let str = data.value(0);
let serialized_data =
OpenAIRealtimeResponse::ResponseAudioTranscriptDelta {
response_id: "123".to_string(),
item_id: "123".to_string(),
output_index: 123,
content_index: 123,
delta: str.to_string(),
};

if let Ok(f32_data) = general_purpose::STANDARD.decode(f32_data) {
let f32_data = convert_pcm16_to_f32(&f32_data);
// Downsample to 16 kHz from 24 kHz
let f32_data = f32_data
.into_iter()
.enumerate()
.filter(|(i, _)| i % 3 != 0)
.map(|(_, v)| v)
.collect::<Vec<f32>>();
let mut parameter = MetadataParameters::default();
parameter.insert(
"sample_rate".to_string(),
dora_node_api::Parameter::Integer(16000),
);
node.send_output(
DataId::from("audio".to_string()),
parameter,
f32_data.into_arrow(),
)
.unwrap();
let ev = events.recv_async_timeout(Duration::from_millis(10)).await;

// println!("Received event: {:?}", ev);
let frame = match ev {
Some(dora_node_api::Event::Input {
id,
metadata: _,
data,
}) => {
if data.data_type() == &DataType::Utf8 {
let data = data.as_string::<i32>();
let str = data.value(0);
let serialized_data =
OpenAIRealtimeResponse::ResponseAudioTranscriptDelta {
response_id: "123".to_string(),
item_id: "123".to_string(),
output_index: 123,
content_index: 123,
delta: str.to_string(),
};

frame.payload = Payload::Bytes(
Bytes::from(
serde_json::to_string(&serialized_data).unwrap(),
)
.into(),
);
frame.opcode = OpCode::Text;
frame
} else if id.contains("audio") {
let data: Vec<f32> = into_vec(&data).unwrap();
let data = convert_f32_to_pcm16(&data);
let serialized_data =
OpenAIRealtimeResponse::ResponseAudioDelta {
response_id: "123".to_string(),
item_id: "123".to_string(),
output_index: 123,
content_index: 123,
delta: general_purpose::STANDARD.encode(data),
};
finished = true;

frame.payload = Payload::Bytes(
Bytes::from(
serde_json::to_string(&serialized_data).unwrap(),
)
.into(),
);
frame.opcode = OpCode::Text;
frame
} else {
unimplemented!()
}
}
Some(dora_node_api::Event::Error(_)) => {
// println!("Error in input: {}", s);
continue;
}
_ => break,
let frame = Frame::text(Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap())
.into(),
));
frame
} else if id.contains("audio") {
let data: Vec<f32> = into_vec(&data).unwrap();
let data = convert_f32_to_pcm16(&data);
let serialized_data = OpenAIRealtimeResponse::ResponseAudioDelta {
response_id: "123".to_string(),
item_id: "123".to_string(),
output_index: 123,
content_index: 123,
delta: general_purpose::STANDARD.encode(data),
};
ws.write_frame(frame).await?;
if finished {
let serialized_data = OpenAIRealtimeResponse::ResponseDone {
response: serde_json::Value::Null,
finished = true;

let frame = Frame::text(Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap())
.into(),
));
frame
} else if id.contains("stop") {
let serialized_data =
OpenAIRealtimeResponse::InputAudioBufferSpeechStopped {
audio_end_ms: 123,
item_id: "123".to_string(),
};
finished = true;

let frame = Frame::text(Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap())
.into(),
));
frame
} else {
unimplemented!()
}
}
dora_node_api::Event::Error(_) => {
// println!("Error in input: {}", s);
continue;
}
_ => break,
};
Some(frame)
}
future::Either::Left(None) => break,
future::Either::Right(Ok(frame)) => {
match frame.opcode {
OpCode::Close => break,
OpCode::Text | OpCode::Binary => {
let data: OpenAIRealtimeMessage =
serde_json::from_slice(&frame.payload).unwrap();

match data {
OpenAIRealtimeMessage::InputAudioBufferAppend { audio } => {
// println!("Received audio data: {}", audio);
let f32_data = audio;
// Decode base64 encoded audio data
let f32_data = f32_data.trim();
if f32_data.is_empty() {
continue;
}

let payload = Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap())
.into(),
);
println!("Sending response done: {:?}", serialized_data);
let frame = Frame::text(payload);
ws.write_frame(frame).await?;
if let Ok(f32_data) = general_purpose::STANDARD.decode(f32_data) {
let f32_data = convert_pcm16_to_f32(&f32_data);
// Downsample to 16 kHz from 24 kHz
let f32_data = f32_data
.into_iter()
.enumerate()
.filter(|(i, _)| i % 3 != 0)
.map(|(_, v)| v)
.collect::<Vec<f32>>();

let mut parameter = MetadataParameters::default();
parameter.insert(
"sample_rate".to_string(),
dora_node_api::Parameter::Integer(16000),
);
node.send_output(
DataId::from("audio".to_string()),
parameter,
f32_data.into_arrow(),
)
.unwrap();
}
}
OpenAIRealtimeMessage::InputAudioBufferCommit => break,
OpenAIRealtimeMessage::ResponseCreate { response } => {
if let Some(text) = response.instructions {
node.send_output(
DataId::from("text".to_string()),
Default::default(),
text.into_arrow(),
)
.unwrap();
}
}
_ => {}
}
}
OpenAIRealtimeMessage::InputAudioBufferCommit => break,
_ => {}
_ => break,
}
None
}
_ => break,
future::Either::Right(Err(_)) => break,
};
if let Some(frame) = frame {
ws.write_frame(frame).await?;
}
if finished {
let serialized_data = OpenAIRealtimeResponse::ResponseDone {
response: serde_json::Value::Null,
};

let payload = Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap()).into(),
);
println!("Sending response done: {:?}", serialized_data);
let frame = Frame::text(payload);
ws.write_frame(frame).await?;
};
}

Ok(())
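The rewrite above replaces the old blocking pattern (read a websocket frame, then poll dora events with a 10 ms timeout) with a race between the dora event stream and the next incoming frame, so whichever side produces something first drives the loop. A conceptual sketch of that select pattern, written here in Python asyncio for illustration only (the actual node uses futures_concurrency's Race in Rust; handle_dora_event and handle_ws_frame are hypothetical stand-ins for the real handlers):

import asyncio

def handle_dora_event(ev) -> None:
    ...  # sketch: serialize transcript/audio deltas and queue them for the websocket

def handle_ws_frame(frame) -> None:
    ...  # sketch: decode appended audio / instructions and forward them to the node

async def pump(event_queue: asyncio.Queue, ws_queue: asyncio.Queue) -> None:
    ev_task = asyncio.create_task(event_queue.get())
    ws_task = asyncio.create_task(ws_queue.get())
    while True:
        # Wait for whichever source fires first; the other task stays pending
        # and is re-awaited on the next iteration, so no message is dropped.
        done, _ = await asyncio.wait(
            {ev_task, ws_task}, return_when=asyncio.FIRST_COMPLETED
        )
        if ev_task in done:
            handle_dora_event(ev_task.result())
            ev_task = asyncio.create_task(event_queue.get())
        if ws_task in done:
            handle_ws_frame(ws_task.result())
            ws_task = asyncio.create_task(ws_queue.get())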


node-hub/dora-vad/dora_vad/main.py (+13, -3)

@@ -38,11 +38,21 @@ def main():
min_silence_duration_ms=MIN_SILENCE_DURATION_MS,
sampling_rate=sr,
)

if len(speech_timestamps) == 0:
# If there is no speech, skip this chunk
continue
arg_max = np.argmax([ts["end"] - ts["start"] for ts in speech_timestamps])
# Check if there is a timestamp
if (
len(speech_timestamps) > 0
and len(audio) > MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
and len(
audio[speech_timestamps[0]["start"] : speech_timestamps[-1]["end"]]
)
> MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
and (
(len(audio) - speech_timestamps[arg_max]["end"])
> MIN_SILENCE_DURATION_MS / 1000 * sr * 5
)
):
# Check that the audio is not cut off at the end, and only return once enough trailing silence has elapsed
if speech_timestamps[-1]["end"] == len(audio):
@@ -51,7 +61,7 @@ def main():
pa.array([speech_timestamps[-1]["start"]]),
metadata={"sample_rate": sr},
)
audio = audio[0 : speech_timestamps[-1]["end"]]
audio = audio[: speech_timestamps[-1]["end"]]
node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr})
last_audios = [audio[speech_timestamps[-1]["end"] :]]
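With this change the VAD node only flushes audio once enough trailing silence follows the longest detected speech segment, instead of emitting as soon as any speech is found. A minimal sketch of the new condition (should_flush is a hypothetical helper; sr is the sample rate in Hz, the duration thresholds are in milliseconds, and the 5× silence margin matches the code above):

def should_flush(speech_timestamps, audio_len, sr, min_audio_ms, min_silence_ms):
    # Nothing to flush without detected speech.
    if not speech_timestamps:
        return False
    speech_span = speech_timestamps[-1]["end"] - speech_timestamps[0]["start"]
    longest_end = max(speech_timestamps, key=lambda ts: ts["end"] - ts["start"])["end"]
    return (
        audio_len > min_audio_ms * sr / 1000          # enough audio overall
        and speech_span > min_audio_ms * sr / 1000    # enough speech inside it
        and (audio_len - longest_end) > min_silence_ms / 1000 * sr * 5  # trailing silence
    )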


