
removed unnecessary error handling

tags/v0.3.11-rc1
ShashwatPatil 10 months ago
commit 7db765370b
3 changed files with 58 additions and 109 deletions
  1. node-hub/dora-transformer/README.md (+0, -1)
  2. node-hub/dora-transformer/dora_transformer/main.py (+57, -107)
  3. node-hub/dora-transformer/pyproject.toml (+1, -1)

node-hub/dora-transformer/README.md (+0, -1)

@@ -58,7 +58,6 @@ Configure the node in your dataflow YAML file:
 The node includes several memory optimization features:
 - 8-bit quantization for CUDA devices
 - Automatic CUDA cache clearing
-- Dynamic CPU fallback on OOM errors
 - Memory-efficient model loading
 - Half-precision support
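The remaining optimizations in this list are toggled by module-level settings in dora_transformer/main.py that sit outside the hunk below (only ACTIVATION_WORDS is visible in its header). A plausible reconstruction of that configuration block is shown here for orientation; the variable names match those used in load_model() and generate_response(), but the default values are illustrative assumptions, not taken from the commit:

```python
import os

import torch

# Hypothetical reconstruction of main.py's module-level configuration.
# Variable names mirror those referenced in load_model()/generate_response();
# the defaults below are illustrative assumptions, not the commit's values.
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")  # assumed default
DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32  # assumed mapping
ENABLE_MEMORY_EFFICIENT = os.getenv("ENABLE_MEMORY_EFFICIENT", "true").lower() == "true"
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "128"))  # assumed default
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
```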



node-hub/dora-transformer/dora_transformer/main.py (+57, -107)

@@ -31,117 +31,67 @@ ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
 
 def load_model():
     """Load the transformer model and tokenizer."""
-    try:
-        logging.info(f"Loading model {MODEL_NAME} on {DEVICE}")
-
-        # Memory efficient loading
-        model_kwargs = {
-            "torch_dtype": TORCH_DTYPE,
-            "device_map": DEVICE,
-        }
-
-        if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-            model_kwargs.update({
-                "low_cpu_mem_usage": True,
-                "offload_folder": "offload",
-                "load_in_8bit": True
-            })
-
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_NAME,
-            **model_kwargs
-        )
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-        logging.info("Model loaded successfully")
-        return model, tokenizer
-    except Exception as e:
-        if "CUDA out of memory" in str(e):
-            logging.error("CUDA out of memory error. Try:")
-            logging.error("1. Setting DEVICE=cpu")
-            logging.error("2. Using a smaller model")
-            logging.error("3. Setting ENABLE_MEMORY_EFFICIENT=true")
-        logging.error(f"Error loading model: {e}")
-        raise
-
-def generate_response(model, tokenizer, text: str, history, max_retries: int = 3) -> tuple[str, list]:
-    """Generate text using the transformer model with safe fallback mechanisms."""
-    global MAX_TOKENS  # Declare global at the start of function
-    retry_count = 0
-    current_device = DEVICE
-    original_history = history.copy()
-    current_max_tokens = MAX_TOKENS  # Local copy for this generation attempt
-
-    while retry_count < max_retries:
-        try:
-            # Reset history to original state on retries
-            history = original_history.copy()
-            history += [{"role": "user", "content": text}]
-
-            prompt = tokenizer.apply_chat_template(
-                history, tokenize=False, add_generation_prompt=True
-            )
-            model_inputs = tokenizer([prompt], return_tensors="pt").to(current_device)
-
-            with torch.inference_mode():
-                generated_ids = model.generate(
-                    **model_inputs,
-                    max_new_tokens=current_max_tokens,  # Use local copy
-                    pad_token_id=tokenizer.pad_token_id,
-                    do_sample=True,
-                    temperature=0.7,
-                    top_p=0.9,
-                    repetition_penalty=1.2,
-                    length_penalty=0.5,
-                )
-                generated_ids = [
-                    output_ids[len(input_ids):]
-                    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-                ]
-
-            response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            history += [{"role": "assistant", "content": response}]
-
-            # Clear CUDA cache after successful generation
-            if ENABLE_MEMORY_EFFICIENT and current_device == "cuda":
-                torch.cuda.empty_cache()
-
-            return response, history
-
-        except RuntimeError as e:
-            if "CUDA out of memory" in str(e):
-                retry_count += 1
-                logging.warning(f"CUDA OOM error (attempt {retry_count}/{max_retries})")
-
-                # Clear CUDA cache
-                if current_device == "cuda":
-                    torch.cuda.empty_cache()
-
-                # Strategy for each retry
-                if retry_count == 1:
-                    # First retry: Clear cache and try again on CUDA
-                    continue
-                elif retry_count == 2:
-                    # Second retry: Move model to CPU
-                    logging.info("Moving model to CPU for fallback")
-                    current_device = "cpu"
-                    model = model.to("cpu")
-                    continue
-                else:
-                    # Final retry: Reduce token count
-                    logging.info("Reducing token count for final attempt")
-                    current_max_tokens = max(24, current_max_tokens // 2)  # Reduce tokens but keep minimum
-                    continue
-            else:
-                # For non-CUDA OOM errors, raise immediately
-                raise
-
-    # If we've exhausted all retries
-    raise RuntimeError(
-        "Failed to generate response after multiple attempts. "
-        "Try reducing model size or using CPU inference."
-    )
+    logging.info(f"Loading model {MODEL_NAME} on {DEVICE}")
+
+    # Memory efficient loading
+    model_kwargs = {
+        "torch_dtype": TORCH_DTYPE,
+        "device_map": DEVICE,
+    }
+
+    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
+        model_kwargs.update({
+            "low_cpu_mem_usage": True,
+            "offload_folder": "offload",
+            "load_in_8bit": True
+        })
+
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        **model_kwargs
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    logging.info("Model loaded successfully")
+    return model, tokenizer
+
+
+def generate_response(model, tokenizer, text: str, history) -> tuple[str, list]:
+    """Generate text using the transformer model."""
+    history += [{"role": "user", "content": text}]
+
+    prompt = tokenizer.apply_chat_template(
+        history, tokenize=False, add_generation_prompt=True
+    )
+    model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
+
+    with torch.inference_mode():
+        generated_ids = model.generate(
+            **model_inputs,
+            max_new_tokens=MAX_TOKENS,
+            pad_token_id=tokenizer.pad_token_id,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            repetition_penalty=1.2,
+            length_penalty=0.5,
+        )
+        generated_ids = [
+            output_ids[len(input_ids):]
+            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+
+    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    history += [{"role": "assistant", "content": response}]
+
+    # Clear CUDA cache after successful generation if enabled
+    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
+        torch.cuda.empty_cache()
+
+    return response, history
 
 
 def main():
     # Initialize model and conversation history

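The trimmed main() is only hinted at by the trailing context lines above. For orientation, here is a minimal sketch of how load_model() and generate_response() would typically be wired into a dora node event loop, following the common node-hub pattern; the activation-word gate, the "text" input/output ids, and the overall body are assumptions, since main() is truncated in this diff:

```python
from dora import Node
import pyarrow as pa


def main():
    # Sketch only: the real main() body is not shown in full in this commit.
    model, tokenizer = load_model()
    history = []
    node = Node()

    for event in node:
        if event["type"] == "INPUT" and event["id"] == "text":  # assumed input id
            text = event["value"][0].as_py()
            # Respond only when an activation word is present (assumed gating logic).
            if any(word in ACTIVATION_WORDS for word in text.lower().split()):
                response, history = generate_response(model, tokenizer, text, history)
                node.send_output("text", pa.array([response]), event["metadata"])
```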
node-hub/dora-transformer/pyproject.toml (+1, -1)

@@ -1,6 +1,6 @@
 [project]
 name = "dora-transformer"
-version = "0.0.0"
+version = "1.0.0"
 authors = [{ name = "Shashwat Patil", email = "email@email.com" }]
 description = "dora-transformer"
 license = { text = "MIT" }

