Browse Source

Improve translation

tags/v0.3.11-rc1
haixuanTao 10 months ago
parent
commit
dec278a6b2
2 changed files with 97 additions and 4 deletions
  1. +1
    -1
      examples/translation/phi4-dev.yml
  2. +96
    -3
      node-hub/dora-phi4/dora_phi4/main.py

+ 1
- 1
examples/translation/phi4-dev.yml View File

@@ -34,7 +34,7 @@ nodes:
outputs:
- data
env:
DATA: "Translate this chinese audio to english"
DATA: "Translate this audio to english."

- id: dora-rerun
build: pip install -e ../../node-hub/dora-rerun


+ 96
- 3
node-hub/dora-phi4/dora_phi4/main.py View File

@@ -1,6 +1,8 @@
"""TODO: Add docstring."""

import os
import re
import time

import cv2
import numpy as np
@@ -76,20 +78,98 @@ BAD_SENTENCES = [
"The first time I saw the sea.",
"The first time I saw the sea was in the movie.",
"The first time I saw the movie.",
"the first time saw the video i was like my god",
"I don't know what to do.",
"I don't know.",
]


def remove_text_noise(text: str, text_noise="") -> str:
"""Remove noise from text.

Args:
----
text (str): Original text
text_noise (str): text to remove from the original text

Returns:
-------
str: Cleaned text

"""
# Handle the case where text_noise is empty
if not text_noise.strip():
return (
text # Return the original text if text_noise is empty or just whitespace
)

# Helper function to normalize text (remove punctuation, make lowercase, and handle hyphens)
def normalize(s):
# Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent
s = re.sub(r"-", " ", s)
# Remove other punctuation and convert to lowercase
s = re.sub(r"[^\w\s]", "", s).lower()
return s

# Normalize both text and text_noise
normalized_text = normalize(text)
normalized_noise = normalize(text_noise)

# Split into words
text_words = normalized_text.split()
noise_words = normalized_noise.split()

# Function to find and remove noise sequence flexibly
def remove_flexible(text_list, noise_list):
i = 0
while i <= len(text_list) - len(noise_list):
match = True
extra_words = 0
for j, noise_word in enumerate(noise_list):
if i + j + extra_words >= len(text_list):
match = False
break
# Allow skipping extra words in text_list
while (
i + j + extra_words < len(text_list)
and text_list[i + j + extra_words] != noise_word
):
extra_words += 1
if i + j + extra_words >= len(text_list):
match = False
break
if not match:
break
if match:
# Remove matched part
del text_list[i : i + len(noise_list) + extra_words]
i = max(0, i - len(noise_list)) # Adjust index after removal
else:
i += 1
return text_list

# Only remove parts of text_noise that are found in text
cleaned_words = text_words[:]
for noise_word in noise_words:
if noise_word in cleaned_words:
cleaned_words.remove(noise_word)

# Reconstruct the cleaned text
cleaned_text = " ".join(cleaned_words)
return cleaned_text


def main():
"""TODO: Add docstring."""
node = Node()

frames = {}
image_id = None
image = None
audios = None
text = ""
noise_timestamp = time.time()
text_noise = ""

for event in node:
if event["type"] == "INPUT":
input_id = event["id"]
@@ -176,8 +256,21 @@ def main():
clean_up_tokenization_spaces=False,
)[0]

if response not in BAD_SENTENCES:
node.send_output("text", pa.array([response]))
# Remove noise filter after some time
if time.time() - noise_timestamp > (
len(text_noise.split()) / 1.5
): # WPS
text_noise = ""

if response in BAD_SENTENCES:
continue
## Remove text noise independently of casing
response = remove_text_noise(response, text_noise)
if response.strip() == "" or response.strip() == ".":
continue
node.send_output("text", pa.array([response]))
noise_timestamp = time.time()
text_noise = response


if __name__ == "__main__":


Loading…
Cancel
Save