From dec278a6b201e1f5eb58bf4cafba66589fc5a9f5 Mon Sep 17 00:00:00 2001 From: haixuanTao Date: Tue, 18 Mar 2025 15:59:06 +0100 Subject: [PATCH] Improve translation --- examples/translation/phi4-dev.yml | 2 +- node-hub/dora-phi4/dora_phi4/main.py | 99 +++++++++++++++++++++++++++- 2 files changed, 97 insertions(+), 4 deletions(-) diff --git a/examples/translation/phi4-dev.yml b/examples/translation/phi4-dev.yml index 5d20819f..749591e6 100644 --- a/examples/translation/phi4-dev.yml +++ b/examples/translation/phi4-dev.yml @@ -34,7 +34,7 @@ nodes: outputs: - data env: - DATA: "Translate this chinese audio to english" + DATA: "Translate this audio to english." - id: dora-rerun build: pip install -e ../../node-hub/dora-rerun diff --git a/node-hub/dora-phi4/dora_phi4/main.py b/node-hub/dora-phi4/dora_phi4/main.py index 80b9ed71..676b9317 100644 --- a/node-hub/dora-phi4/dora_phi4/main.py +++ b/node-hub/dora-phi4/dora_phi4/main.py @@ -1,6 +1,8 @@ """TODO: Add docstring.""" import os +import re +import time import cv2 import numpy as np @@ -76,20 +78,98 @@ BAD_SENTENCES = [ "The first time I saw the sea.", "The first time I saw the sea was in the movie.", "The first time I saw the movie.", + "the first time saw the video i was like my god", "I don't know what to do.", "I don't know.", ] +def remove_text_noise(text: str, text_noise="") -> str: + """Remove noise from text. + + Args: + ---- + text (str): Original text + text_noise (str): text to remove from the original text + + Returns: + ------- + str: Cleaned text + + """ + # Handle the case where text_noise is empty + if not text_noise.strip(): + return ( + text # Return the original text if text_noise is empty or just whitespace + ) + + # Helper function to normalize text (remove punctuation, make lowercase, and handle hyphens) + def normalize(s): + # Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent + s = re.sub(r"-", " ", s) + # Remove other punctuation and convert to lowercase + s = re.sub(r"[^\w\s]", "", s).lower() + return s + + # Normalize both text and text_noise + normalized_text = normalize(text) + normalized_noise = normalize(text_noise) + + # Split into words + text_words = normalized_text.split() + noise_words = normalized_noise.split() + + # Function to find and remove noise sequence flexibly + def remove_flexible(text_list, noise_list): + i = 0 + while i <= len(text_list) - len(noise_list): + match = True + extra_words = 0 + for j, noise_word in enumerate(noise_list): + if i + j + extra_words >= len(text_list): + match = False + break + # Allow skipping extra words in text_list + while ( + i + j + extra_words < len(text_list) + and text_list[i + j + extra_words] != noise_word + ): + extra_words += 1 + if i + j + extra_words >= len(text_list): + match = False + break + if not match: + break + if match: + # Remove matched part + del text_list[i : i + len(noise_list) + extra_words] + i = max(0, i - len(noise_list)) # Adjust index after removal + else: + i += 1 + return text_list + + # Only remove parts of text_noise that are found in text + cleaned_words = text_words[:] + for noise_word in noise_words: + if noise_word in cleaned_words: + cleaned_words.remove(noise_word) + + # Reconstruct the cleaned text + cleaned_text = " ".join(cleaned_words) + return cleaned_text + + def main(): """TODO: Add docstring.""" node = Node() frames = {} - image_id = None image = None audios = None text = "" + noise_timestamp = time.time() + text_noise = "" + for event in node: if event["type"] == "INPUT": input_id = event["id"] @@ -176,8 +256,21 @@ def main(): clean_up_tokenization_spaces=False, )[0] - if response not in BAD_SENTENCES: - node.send_output("text", pa.array([response])) + # Remove noise filter after some time + if time.time() - noise_timestamp > ( + len(text_noise.split()) / 1.5 + ): # WPS + text_noise = "" + + if response in BAD_SENTENCES: + continue + ## Remove text noise independently of casing + response = remove_text_noise(response, text_noise) + if response.strip() == "" or response.strip() == ".": + continue + node.send_output("text", pa.array([response])) + noise_timestamp = time.time() + text_noise = response if __name__ == "__main__":