From bbd8518760a3e676ffe04c3e818eef884c84fb5a Mon Sep 17 00:00:00 2001
From: ShashwatPatil
Date: Sun, 16 Mar 2025 15:26:20 +0530
Subject: [PATCH] Resolved recursive fallback of dora-transformer node

---
 node-hub/dora-transformer/README.md           |   4 +-
 .../dora-transformer/dora_transformer/main.py | 115 ++++++++++++------
 2 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/node-hub/dora-transformer/README.md b/node-hub/dora-transformer/README.md
index f31fce7a..ad31c6a8 100644
--- a/node-hub/dora-transformer/README.md
+++ b/node-hub/dora-transformer/README.md
@@ -127,8 +127,8 @@ nodes:
 ### Running the Example
 
 ```bash
-dora build example.yml
-dora run example.yml
+dora build test.yml
+dora run test.yml
 ```
 
 ### Troubleshooting
diff --git a/node-hub/dora-transformer/dora_transformer/main.py b/node-hub/dora-transformer/dora_transformer/main.py
index 38abe8a8..1fe4ee4d 100644
--- a/node-hub/dora-transformer/dora_transformer/main.py
+++ b/node-hub/dora-transformer/dora_transformer/main.py
@@ -63,47 +63,84 @@ def load_model():
         logging.error(f"Error loading model: {e}")
         raise
 
-def generate_response(model, tokenizer, text: str, history) -> tuple[str, list]:
-    """Generate text using the transformer model."""
-    try:
-        history += [{"role": "user", "content": text}]
-        prompt = tokenizer.apply_chat_template(
-            history, tokenize=False, add_generation_prompt=True
-        )
-
-        model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
-
-        with torch.inference_mode():
-            generated_ids = model.generate(
-                **model_inputs,
-                max_new_tokens=MAX_TOKENS,
-                pad_token_id=tokenizer.pad_token_id,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9,
-                repetition_penalty=1.2,  # Reduce repetition
-                length_penalty=0.5,
-            )
-
-        generated_ids = [
-            output_ids[len(input_ids):]
-            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-        ]
-
-        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        history += [{"role": "assistant", "content": response}]
-
-        # Clear CUDA cache after generation if enabled
-        if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-            torch.cuda.empty_cache()
-        return response, history
-    except RuntimeError as e:
-        if "CUDA out of memory" in str(e):
-            logging.error("CUDA out of memory during generation. Falling back to CPU")
Falling back to CPU") - model.to("cpu") - return generate_response(model, tokenizer, text, history) - raise + model_inputs = tokenizer([prompt], return_tensors="pt").to(current_device) + + with torch.inference_mode(): + generated_ids = model.generate( + **model_inputs, + max_new_tokens=MAX_TOKENS, + pad_token_id=tokenizer.pad_token_id, + do_sample=True, + temperature=0.7, + top_p=0.9, + repetition_penalty=1.2, + length_penalty=0.5, + ) + + generated_ids = [ + output_ids[len(input_ids):] + for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] + + response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + history += [{"role": "assistant", "content": response}] + + # Clear CUDA cache after successful generation + if ENABLE_MEMORY_EFFICIENT and current_device == "cuda": + torch.cuda.empty_cache() + + return response, history + + except RuntimeError as e: + if "CUDA out of memory" in str(e): + retry_count += 1 + logging.warning(f"CUDA OOM error (attempt {retry_count}/{max_retries})") + + # Clear CUDA cache + if current_device == "cuda": + torch.cuda.empty_cache() + + # Strategy for each retry + if retry_count == 1: + # First retry: Clear cache and try again on CUDA + continue + elif retry_count == 2: + # Second retry: Move model to CPU + logging.info("Moving model to CPU for fallback") + current_device = "cpu" + model = model.to("cpu") + continue + else: + # Final retry: Reduce token count + logging.info("Reducing token count for final attempt") + global MAX_TOKENS + MAX_TOKENS = max(32, MAX_TOKENS // 2) + continue + else: + # For non-CUDA OOM errors, raise immediately + raise + + # If we've exhausted all retries + raise RuntimeError( + "Failed to generate response after multiple attempts. " + "Try reducing model size or using CPU inference." + ) def main(): # Initialize model and conversation history