Improve translation

11 months ago · dec278a6b2
--- a/examples/translation/phi4-dev.yml
+++ b/examples/translation/phi4-dev.yml
@@ -34,7 +34,7 @@ nodes:
    outputs:
      - data
    env:
      DATA: "Translate this chinese audio to english"
      DATA: "Translate this audio to english."

  - id: dora-rerun
    build: pip install -e ../../node-hub/dora-rerun
--- a/node-hub/dora-phi4/dora_phi4/main.py
+++ b/node-hub/dora-phi4/dora_phi4/main.py
@@ -1,6 +1,8 @@
 """TODO: Add docstring."""

 import os
 import re
 import time

 import cv2
 import numpy as np
@@ -76,20 +78,98 @@ BAD_SENTENCES = [
    "The first time I saw the sea.",
    "The first time I saw the sea was in the movie.",
    "The first time I saw the movie.",
    "the first time saw the video i was like my god",
    "I don't know what to do.",
    "I don't know.",
 ]


 def remove_text_noise(text: str, text_noise="") -> str:
    """Remove noise from text.

    Args:
    ----
        text (str): Original text
        text_noise (str): text to remove from the original text

    Returns:
    -------
        str: Cleaned text

    """
    # Handle the case where text_noise is empty
    if not text_noise.strip():
        return (
            text  # Return the original text if text_noise is empty or just whitespace
        )

    # Helper function to normalize text (remove punctuation, make lowercase, and handle hyphens)
    def normalize(s):
        # Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent
        s = re.sub(r"-", " ", s)
        # Remove other punctuation and convert to lowercase
        s = re.sub(r"[^\w\s]", "", s).lower()
        return s

    # Normalize both text and text_noise
    normalized_text = normalize(text)
    normalized_noise = normalize(text_noise)

    # Split into words
    text_words = normalized_text.split()
    noise_words = normalized_noise.split()

    # Function to find and remove noise sequence flexibly
    def remove_flexible(text_list, noise_list):
        i = 0
        while i <= len(text_list) - len(noise_list):
            match = True
            extra_words = 0
            for j, noise_word in enumerate(noise_list):
                if i + j + extra_words >= len(text_list):
                    match = False
                    break
                # Allow skipping extra words in text_list
                while (
                    i + j + extra_words < len(text_list)
                    and text_list[i + j + extra_words] != noise_word
                ):
                    extra_words += 1
                    if i + j + extra_words >= len(text_list):
                        match = False
                        break
                if not match:
                    break
            if match:
                # Remove matched part
                del text_list[i : i + len(noise_list) + extra_words]
                i = max(0, i - len(noise_list))  # Adjust index after removal
            else:
                i += 1
        return text_list

    # Only remove parts of text_noise that are found in text
    cleaned_words = text_words[:]
    for noise_word in noise_words:
        if noise_word in cleaned_words:
            cleaned_words.remove(noise_word)

    # Reconstruct the cleaned text
    cleaned_text = " ".join(cleaned_words)
    return cleaned_text


 def main():
    """TODO: Add docstring."""
    node = Node()

    frames = {}
    image_id = None
    image = None
    audios = None
    text = ""
    noise_timestamp = time.time()
    text_noise = ""

    for event in node:
        if event["type"] == "INPUT":
            input_id = event["id"]
@@ -176,8 +256,21 @@ def main():
                    clean_up_tokenization_spaces=False,
                )[0]

                if response not in BAD_SENTENCES:
                    node.send_output("text", pa.array([response]))
                # Remove noise filter after some time
                if time.time() - noise_timestamp > (
                    len(text_noise.split()) / 1.5
                ):  # WPS
                    text_noise = ""

                if response in BAD_SENTENCES:
                    continue
                ## Remove text noise independently of casing
                response = remove_text_noise(response, text_noise)
                if response.strip() == "" or response.strip() == ".":
                    continue
                node.send_output("text", pa.array([response]))
                noise_timestamp = time.time()
                text_noise = response


 if __name__ == "__main__":