
Improve the overall language pipeline to be more resilient

make-qwen-llm-configurable
haixuantao · 5 months ago
commit 2f89cf811d
4 changed files with 218 additions and 170 deletions
  1. node-hub/dora-distil-whisper/dora_distil_whisper/main.py (+45, -40)
  2. node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py (+20, -17)
  3. node-hub/dora-openai-websocket/src/main.rs (+140, -110)
  4. node-hub/dora-vad/dora_vad/main.py (+13, -3)

node-hub/dora-distil-whisper/dora_distil_whisper/main.py (+45, -40)

@@ -48,35 +48,6 @@ def remove_text_noise(text: str, text_noise="") -> str:
text_words = normalized_text.split()
noise_words = normalized_noise.split()

# Function to find and remove noise sequence flexibly
def remove_flexible(text_list, noise_list):
i = 0
while i <= len(text_list) - len(noise_list):
match = True
extra_words = 0
for j, noise_word in enumerate(noise_list):
if i + j + extra_words >= len(text_list):
match = False
break
# Allow skipping extra words in text_list
while (
i + j + extra_words < len(text_list)
and text_list[i + j + extra_words] != noise_word
):
extra_words += 1
if i + j + extra_words >= len(text_list):
match = False
break
if not match:
break
if match:
# Remove matched part
del text_list[i : i + len(noise_list) + extra_words]
i = max(0, i - len(noise_list)) # Adjust index after removal
else:
i += 1
return text_list

# Only remove parts of text_noise that are found in text
cleaned_words = text_words[:]
for noise_word in noise_words:
@@ -126,7 +97,26 @@ BAD_SENTENCES = [
"",
" so",
" So.",
" So, let's go.",
" so so",
" What?",
" We'll see you next time.",
" I'll see you next time.",
" We're going to come back.",
" let's move on.",
" Here we go.",
" my",
" All right. Thank you.",
" That's what we're doing.",
" That's what I wanted to do.",
" I'll be back.",
" Hold this. Hold this.",
" Hold this one. Hold this one.",
" And we'll see you next time.",
" strength.",
" Length.",
" Let's go.",
" Let's do it.",
"You",
"You ",
" You",
@@ -199,6 +189,12 @@ def main():
append_punctuations=".",
language=TARGET_LANGUAGE,
)
result = mlx_whisper.transcribe(
np.array([]),
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
append_punctuations=".",
language=TARGET_LANGUAGE,
)

node = Node()
noise_timestamp = time.time()
@@ -244,6 +240,8 @@ def main():
generate_kwargs=confg,
)
if result["text"] in BAD_SENTENCES:
print("Discarded text: ", result["text"])
# cache_audio = None
continue
text = cut_repetition(result["text"])

@@ -258,20 +256,27 @@ def main():
continue

if (
(
text.endswith(".")
or text.endswith("!")
or text.endswith("?")
or text.endswith('."')
or text.endswith('!"')
or text.endswith('?"')
)
and not text.endswith("...") # Avoid ending with ellipsis
):
text.endswith(".")
or text.endswith("!")
or text.endswith("?")
or text.endswith('."')
or text.endswith('!"')
or text.endswith('?"')
) and not text.endswith("..."):
node.send_output(
"text",
pa.array([text]),
)
node.send_output(
"stop",
pa.array([text]),
)
cache_audio = None
else:
audio = None
print("Text:", text)
elif text.endswith("..."):
print(
"Keeping audio in cache for next text output with punctuation"
)
print("Discarded text", text)
cache_audio = audio
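The change above gates the "text" output on sentence-final punctuation: a transcript is only emitted (together with a "stop" signal) when it ends a sentence, and the audio is kept in cache when the text trails off with an ellipsis. A minimal sketch of that gating rule, with should_emit as a hypothetical helper rather than code from this commit:

def should_emit(text: str) -> bool:
    # A trailing ellipsis marks an unfinished utterance: keep the audio cached.
    if text.endswith("..."):
        return False
    # Otherwise emit only complete sentences, optionally closed by a quote.
    return text.endswith((".", "!", "?", '."', '!"', '?"'))

# Usage: if should_emit(text), send "text" and "stop" and clear the cache;
# otherwise keep the audio so the next chunk can complete the sentence.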

node-hub/dora-kokoro-tts/dora_kokoro_tts/main.py (+20, -17)

@@ -1,4 +1,5 @@
"""TODO: Add docstring."""

import os
import re

@@ -8,11 +9,12 @@ from kokoro import KPipeline

LANGUAGE = os.getenv("LANGUAGE", "en")


def main():
"""TODO: Add docstring."""
if LANGUAGE in ["en", "english"]:
pipeline = KPipeline(lang_code="a")
elif LANGUAGE in ["zh","ch","chinese"]:
elif LANGUAGE in ["zh", "ch", "chinese"]:
pipeline = KPipeline(lang_code="z")
else:
print("warning: Defaulting to english speaker as language not found")
@@ -22,22 +24,23 @@ def main():

for event in node:
if event["type"] == "INPUT":
if event["id"] == "text":
text = event["value"][0].as_py()
if re.findall(r'[\u4e00-\u9fff]+', text):
pipeline = KPipeline(lang_code="z")
elif pipeline.lang_code != "a":
pipeline = KPipeline(lang_code="a") # <= make sure lang_code matches voice

generator = pipeline(
text,
voice="af_heart", # <= change voice here
speed=1.2,
split_pattern=r"\n+",
)
for _, (_, _, audio) in enumerate(generator):
audio = audio.numpy()
node.send_output("audio", pa.array(audio), {"sample_rate": 24000})
text = event["value"][0].as_py()
if re.findall(r"[\u4e00-\u9fff]+", text):
pipeline = KPipeline(lang_code="z")
elif pipeline.lang_code != "a":
pipeline = KPipeline(
lang_code="a"
) # <= make sure lang_code matches voice

generator = pipeline(
text,
voice="af_heart", # <= change voice here
speed=1.2,
split_pattern=r"\n+",
)
for _, (_, _, audio) in enumerate(generator):
audio = audio.numpy()
node.send_output("audio", pa.array(audio), {"sample_rate": 24000})


if __name__ == "__main__":
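The handler above rebuilds the Kokoro pipeline per message based on the script of the incoming text: Chinese characters switch it to the "z" language code, anything else falls back to the English "a" pipeline. A small sketch of that detection, assuming the CJK Unified Ideographs block is the only range that matters here (lang_code_for is a hypothetical helper, not part of this commit):

import re

def lang_code_for(text: str) -> str:
    # Chinese voice ("z") when the text contains CJK ideographs,
    # English voice ("a") otherwise.
    return "z" if re.search(r"[\u4e00-\u9fff]", text) else "a"

# Usage note: only rebuild KPipeline(lang_code=...) when the returned code
# differs from the current pipeline's lang_code, to avoid reloading the model
# on every message.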


node-hub/dora-openai-websocket/src/main.rs (+140, -110)

@@ -30,6 +30,10 @@ use fastwebsockets::Frame;
use fastwebsockets::OpCode;
use fastwebsockets::Payload;
use fastwebsockets::WebSocketError;
use futures_concurrency::future::Race;
use futures_util::future;
use futures_util::future::Either;
use futures_util::FutureExt;
use http_body_util::Empty;
use hyper::body::Bytes;
use hyper::body::Incoming;
@@ -45,9 +49,7 @@ use std::fs;
use std::io::{self, Write};
use std::net::IpAddr;
use std::net::Ipv4Addr;
use std::time::Duration;
use tokio::net::TcpListener;

#[derive(Serialize, Deserialize, Debug)]
pub struct ErrorDetails {
pub code: Option<String>,
@@ -310,125 +312,153 @@ async fn handle_client(fut: upgrade::UpgradeFut) -> Result<(), WebSocketError> {
let frame = Frame::text(payload);
ws.write_frame(frame).await?;
loop {
let mut frame = ws.read_frame().await?;
let event_fut = events.recv_async().map(Either::Left);
let frame_fut = ws.read_frame().map(Either::Right);
let event_stream = (event_fut, frame_fut).race();
let mut finished = false;
match frame.opcode {
OpCode::Close => break,
OpCode::Text | OpCode::Binary => {
let data: OpenAIRealtimeMessage = serde_json::from_slice(&frame.payload).unwrap();

match data {
OpenAIRealtimeMessage::InputAudioBufferAppend { audio } => {
// println!("Received audio data: {}", audio);
let f32_data = audio;
// Decode base64 encoded audio data
let f32_data = f32_data.trim();
if f32_data.is_empty() {
continue;
}
let frame = match event_stream.await {
future::Either::Left(Some(ev)) => {
let frame = match ev {
dora_node_api::Event::Input {
id,
metadata: _,
data,
} => {
if data.data_type() == &DataType::Utf8 {
let data = data.as_string::<i32>();
let str = data.value(0);
let serialized_data =
OpenAIRealtimeResponse::ResponseAudioTranscriptDelta {
response_id: "123".to_string(),
item_id: "123".to_string(),
output_index: 123,
content_index: 123,
delta: str.to_string(),
};

if let Ok(f32_data) = general_purpose::STANDARD.decode(f32_data) {
let f32_data = convert_pcm16_to_f32(&f32_data);
// Downsample to 16 kHz from 24 kHz
let f32_data = f32_data
.into_iter()
.enumerate()
.filter(|(i, _)| i % 3 != 0)
.map(|(_, v)| v)
.collect::<Vec<f32>>();
let mut parameter = MetadataParameters::default();
parameter.insert(
"sample_rate".to_string(),
dora_node_api::Parameter::Integer(16000),
);
node.send_output(
DataId::from("audio".to_string()),
parameter,
f32_data.into_arrow(),
)
.unwrap();
let ev = events.recv_async_timeout(Duration::from_millis(10)).await;

// println!("Received event: {:?}", ev);
let frame = match ev {
Some(dora_node_api::Event::Input {
id,
metadata: _,
data,
}) => {
if data.data_type() == &DataType::Utf8 {
let data = data.as_string::<i32>();
let str = data.value(0);
let serialized_data =
OpenAIRealtimeResponse::ResponseAudioTranscriptDelta {
response_id: "123".to_string(),
item_id: "123".to_string(),
output_index: 123,
content_index: 123,
delta: str.to_string(),
};

frame.payload = Payload::Bytes(
Bytes::from(
serde_json::to_string(&serialized_data).unwrap(),
)
.into(),
);
frame.opcode = OpCode::Text;
frame
} else if id.contains("audio") {
let data: Vec<f32> = into_vec(&data).unwrap();
let data = convert_f32_to_pcm16(&data);
let serialized_data =
OpenAIRealtimeResponse::ResponseAudioDelta {
response_id: "123".to_string(),
item_id: "123".to_string(),
output_index: 123,
content_index: 123,
delta: general_purpose::STANDARD.encode(data),
};
finished = true;

frame.payload = Payload::Bytes(
Bytes::from(
serde_json::to_string(&serialized_data).unwrap(),
)
.into(),
);
frame.opcode = OpCode::Text;
frame
} else {
unimplemented!()
}
}
Some(dora_node_api::Event::Error(_)) => {
// println!("Error in input: {}", s);
continue;
}
_ => break,
let frame = Frame::text(Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap())
.into(),
));
frame
} else if id.contains("audio") {
let data: Vec<f32> = into_vec(&data).unwrap();
let data = convert_f32_to_pcm16(&data);
let serialized_data = OpenAIRealtimeResponse::ResponseAudioDelta {
response_id: "123".to_string(),
item_id: "123".to_string(),
output_index: 123,
content_index: 123,
delta: general_purpose::STANDARD.encode(data),
};
ws.write_frame(frame).await?;
if finished {
let serialized_data = OpenAIRealtimeResponse::ResponseDone {
response: serde_json::Value::Null,
finished = true;

let frame = Frame::text(Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap())
.into(),
));
frame
} else if id.contains("stop") {
let serialized_data =
OpenAIRealtimeResponse::InputAudioBufferSpeechStopped {
audio_end_ms: 123,
item_id: "123".to_string(),
};
finished = true;

let frame = Frame::text(Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap())
.into(),
));
frame
} else {
unimplemented!()
}
}
dora_node_api::Event::Error(_) => {
// println!("Error in input: {}", s);
continue;
}
_ => break,
};
Some(frame)
}
future::Either::Left(None) => break,
future::Either::Right(Ok(frame)) => {
match frame.opcode {
OpCode::Close => break,
OpCode::Text | OpCode::Binary => {
let data: OpenAIRealtimeMessage =
serde_json::from_slice(&frame.payload).unwrap();

match data {
OpenAIRealtimeMessage::InputAudioBufferAppend { audio } => {
// println!("Received audio data: {}", audio);
let f32_data = audio;
// Decode base64 encoded audio data
let f32_data = f32_data.trim();
if f32_data.is_empty() {
continue;
}

let payload = Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap())
.into(),
);
println!("Sending response done: {:?}", serialized_data);
let frame = Frame::text(payload);
ws.write_frame(frame).await?;
if let Ok(f32_data) = general_purpose::STANDARD.decode(f32_data) {
let f32_data = convert_pcm16_to_f32(&f32_data);
// Downsample to 16 kHz from 24 kHz
let f32_data = f32_data
.into_iter()
.enumerate()
.filter(|(i, _)| i % 3 != 0)
.map(|(_, v)| v)
.collect::<Vec<f32>>();

let mut parameter = MetadataParameters::default();
parameter.insert(
"sample_rate".to_string(),
dora_node_api::Parameter::Integer(16000),
);
node.send_output(
DataId::from("audio".to_string()),
parameter,
f32_data.into_arrow(),
)
.unwrap();
}
}
OpenAIRealtimeMessage::InputAudioBufferCommit => break,
OpenAIRealtimeMessage::ResponseCreate { response } => {
if let Some(text) = response.instructions {
node.send_output(
DataId::from("text".to_string()),
Default::default(),
text.into_arrow(),
)
.unwrap();
}
}
_ => {}
}
}
OpenAIRealtimeMessage::InputAudioBufferCommit => break,
_ => {}
_ => break,
}
None
}
_ => break,
future::Either::Right(Err(_)) => break,
};
if let Some(frame) = frame {
ws.write_frame(frame).await?;
}
if finished {
let serialized_data = OpenAIRealtimeResponse::ResponseDone {
response: serde_json::Value::Null,
};

let payload = Payload::Bytes(
Bytes::from(serde_json::to_string(&serialized_data).unwrap()).into(),
);
println!("Sending response done: {:?}", serialized_data);
let frame = Frame::text(payload);
ws.write_frame(frame).await?;
};
}

Ok(())
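The rewrite above replaces the old blocking pattern (read a websocket frame, then poll dora events with a 10 ms timeout) with a race between the dora event stream and the next incoming frame, so whichever side produces something first drives the loop. A conceptual sketch of that select pattern, written here in Python asyncio for illustration only (the actual node uses futures_concurrency's Race in Rust; handle_dora_event and handle_ws_frame are hypothetical stand-ins for the real handlers):

import asyncio

def handle_dora_event(ev) -> None:
    ...  # sketch: serialize transcript/audio deltas and queue them for the websocket

def handle_ws_frame(frame) -> None:
    ...  # sketch: decode appended audio / instructions and forward them to the node

async def pump(event_queue: asyncio.Queue, ws_queue: asyncio.Queue) -> None:
    ev_task = asyncio.create_task(event_queue.get())
    ws_task = asyncio.create_task(ws_queue.get())
    while True:
        # Wait for whichever source fires first; the other task stays pending
        # and is re-awaited on the next iteration, so no message is dropped.
        done, _ = await asyncio.wait(
            {ev_task, ws_task}, return_when=asyncio.FIRST_COMPLETED
        )
        if ev_task in done:
            handle_dora_event(ev_task.result())
            ev_task = asyncio.create_task(event_queue.get())
        if ws_task in done:
            handle_ws_frame(ws_task.result())
            ws_task = asyncio.create_task(ws_queue.get())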


node-hub/dora-vad/dora_vad/main.py (+13, -3)

@@ -38,11 +38,21 @@ def main():
min_silence_duration_ms=MIN_SILENCE_DURATION_MS,
sampling_rate=sr,
)

if len(speech_timestamps) == 0:
# If there is no speech, skip this chunk
continue
arg_max = np.argmax([ts["end"] - ts["start"] for ts in speech_timestamps])
# Check if there is a timestamp
if (
len(speech_timestamps) > 0
and len(audio) > MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
and len(
audio[speech_timestamps[0]["start"] : speech_timestamps[-1]["end"]]
)
> MIN_AUDIO_SAMPLING_DURATION_MS * sr / 1000
and (
(len(audio) - speech_timestamps[arg_max]["end"])
> MIN_SILENCE_DURATION_MS / 1000 * sr * 5
)
):
# Check that the audio is not cut off at the end, and only return once enough trailing silence has elapsed
if speech_timestamps[-1]["end"] == len(audio):
@@ -51,7 +61,7 @@ def main():
pa.array([speech_timestamps[-1]["start"]]),
metadata={"sample_rate": sr},
)
audio = audio[0 : speech_timestamps[-1]["end"]]
audio = audio[: speech_timestamps[-1]["end"]]
node.send_output("audio", pa.array(audio), metadata={"sample_rate": sr})
last_audios = [audio[speech_timestamps[-1]["end"] :]]
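With this change the VAD node only flushes audio once enough trailing silence follows the longest detected speech segment, instead of emitting as soon as any speech is found. A minimal sketch of the new condition (should_flush is a hypothetical helper; sr is the sample rate in Hz, the duration thresholds are in milliseconds, and the 5× silence margin matches the code above):

def should_flush(speech_timestamps, audio_len, sr, min_audio_ms, min_silence_ms):
    # Nothing to flush without detected speech.
    if not speech_timestamps:
        return False
    speech_span = speech_timestamps[-1]["end"] - speech_timestamps[0]["start"]
    longest_end = max(speech_timestamps, key=lambda ts: ts["end"] - ts["start"])["end"]
    return (
        audio_len > min_audio_ms * sr / 1000          # enough audio overall
        and speech_span > min_audio_ms * sr / 1000    # enough speech inside it
        and (audio_len - longest_end) > min_silence_ms / 1000 * sr * 5  # trailing silence
    )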


