
removed unnecessary error handling

tags/v0.3.11-rc1
ShashwatPatil 10 months ago
commit 7db765370b
3 changed files with 58 additions and 109 deletions
  1. node-hub/dora-transformer/README.md (+0, -1)
  2. node-hub/dora-transformer/dora_transformer/main.py (+57, -107)
  3. node-hub/dora-transformer/pyproject.toml (+1, -1)

node-hub/dora-transformer/README.md (+0, -1)

@@ -58,7 +58,6 @@ Configure the node in your dataflow YAML file:
 The node includes several memory optimization features:
 - 8-bit quantization for CUDA devices
 - Automatic CUDA cache clearing
-- Dynamic CPU fallback on OOM errors
 - Memory-efficient model loading
 - Half-precision support
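The remaining optimizations in this list are toggled by module-level settings in dora_transformer/main.py that sit outside the hunk below (only ACTIVATION_WORDS is visible in its header). A plausible reconstruction of that configuration block is shown here for orientation; the variable names match those used in load_model() and generate_response(), but the default values are illustrative assumptions, not taken from the commit:

```python
import os

import torch

# Hypothetical reconstruction of main.py's module-level configuration.
# Variable names mirror those referenced in load_model()/generate_response();
# the defaults below are illustrative assumptions, not the commit's values.
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-0.5B-Instruct")  # assumed default
DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32  # assumed mapping
ENABLE_MEMORY_EFFICIENT = os.getenv("ENABLE_MEMORY_EFFICIENT", "true").lower() == "true"
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "128"))  # assumed default
ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
```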



node-hub/dora-transformer/dora_transformer/main.py (+57, -107)

@@ -31,117 +31,67 @@ ACTIVATION_WORDS = os.getenv("ACTIVATION_WORDS", "what how who where you").split()
 
 def load_model():
     """Load the transformer model and tokenizer."""
-    try:
-        logging.info(f"Loading model {MODEL_NAME} on {DEVICE}")
-
-        # Memory efficient loading
-        model_kwargs = {
-            "torch_dtype": TORCH_DTYPE,
-            "device_map": DEVICE,
-        }
-
-        if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
-            model_kwargs.update({
-                "low_cpu_mem_usage": True,
-                "offload_folder": "offload",
-                "load_in_8bit": True
-            })
-
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_NAME,
-            **model_kwargs
-        )
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-        logging.info("Model loaded successfully")
-        return model, tokenizer
-    except Exception as e:
-        if "CUDA out of memory" in str(e):
-            logging.error("CUDA out of memory error. Try:")
-            logging.error("1. Setting DEVICE=cpu")
-            logging.error("2. Using a smaller model")
-            logging.error("3. Setting ENABLE_MEMORY_EFFICIENT=true")
-        logging.error(f"Error loading model: {e}")
-        raise
-
-def generate_response(model, tokenizer, text: str, history, max_retries: int = 3) -> tuple[str, list]:
-    """Generate text using the transformer model with safe fallback mechanisms."""
-    global MAX_TOKENS  # Declare global at the start of function
-    retry_count = 0
-    current_device = DEVICE
-    original_history = history.copy()
-    current_max_tokens = MAX_TOKENS  # Local copy for this generation attempt
-
-    while retry_count < max_retries:
-        try:
-            # Reset history to original state on retries
-            history = original_history.copy()
-            history += [{"role": "user", "content": text}]
-
-            prompt = tokenizer.apply_chat_template(
-                history, tokenize=False, add_generation_prompt=True
-            )
-            model_inputs = tokenizer([prompt], return_tensors="pt").to(current_device)
-
-            with torch.inference_mode():
-                generated_ids = model.generate(
-                    **model_inputs,
-                    max_new_tokens=current_max_tokens,  # Use local copy
-                    pad_token_id=tokenizer.pad_token_id,
-                    do_sample=True,
-                    temperature=0.7,
-                    top_p=0.9,
-                    repetition_penalty=1.2,
-                    length_penalty=0.5,
-                )
-                generated_ids = [
-                    output_ids[len(input_ids):]
-                    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-                ]
-
-            response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            history += [{"role": "assistant", "content": response}]
-
-            # Clear CUDA cache after successful generation
-            if ENABLE_MEMORY_EFFICIENT and current_device == "cuda":
-                torch.cuda.empty_cache()
-
-            return response, history
-
-        except RuntimeError as e:
-            if "CUDA out of memory" in str(e):
-                retry_count += 1
-                logging.warning(f"CUDA OOM error (attempt {retry_count}/{max_retries})")
-
-                # Clear CUDA cache
-                if current_device == "cuda":
-                    torch.cuda.empty_cache()
-
-                # Strategy for each retry
-                if retry_count == 1:
-                    # First retry: Clear cache and try again on CUDA
-                    continue
-                elif retry_count == 2:
-                    # Second retry: Move model to CPU
-                    logging.info("Moving model to CPU for fallback")
-                    current_device = "cpu"
-                    model = model.to("cpu")
-                    continue
-                else:
-                    # Final retry: Reduce token count
-                    logging.info("Reducing token count for final attempt")
-                    current_max_tokens = max(24, current_max_tokens // 2)  # Reduce tokens but keep minimum
-                    continue
-            else:
-                # For non-CUDA OOM errors, raise immediately
-                raise
-
-    # If we've exhausted all retries
-    raise RuntimeError(
-        "Failed to generate response after multiple attempts. "
-        "Try reducing model size or using CPU inference."
-    )
+    logging.info(f"Loading model {MODEL_NAME} on {DEVICE}")
+
+    # Memory efficient loading
+    model_kwargs = {
+        "torch_dtype": TORCH_DTYPE,
+        "device_map": DEVICE,
+    }
+
+    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
+        model_kwargs.update({
+            "low_cpu_mem_usage": True,
+            "offload_folder": "offload",
+            "load_in_8bit": True
+        })
+
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        **model_kwargs
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    logging.info("Model loaded successfully")
+    return model, tokenizer
+
+
+def generate_response(model, tokenizer, text: str, history) -> tuple[str, list]:
+    """Generate text using the transformer model."""
+    history += [{"role": "user", "content": text}]
+
+    prompt = tokenizer.apply_chat_template(
+        history, tokenize=False, add_generation_prompt=True
+    )
+    model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
+
+    with torch.inference_mode():
+        generated_ids = model.generate(
+            **model_inputs,
+            max_new_tokens=MAX_TOKENS,
+            pad_token_id=tokenizer.pad_token_id,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            repetition_penalty=1.2,
+            length_penalty=0.5,
+        )
+        generated_ids = [
+            output_ids[len(input_ids):]
+            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+
+    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    history += [{"role": "assistant", "content": response}]
+
+    # Clear CUDA cache after successful generation if enabled
+    if ENABLE_MEMORY_EFFICIENT and DEVICE == "cuda":
+        torch.cuda.empty_cache()
+
+    return response, history
 
 
 def main():
     # Initialize model and conversation history

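The trimmed main() is only hinted at by the trailing context lines above. For orientation, here is a minimal sketch of how load_model() and generate_response() would typically be wired into a dora node event loop, following the common node-hub pattern; the activation-word gate, the "text" input/output ids, and the overall body are assumptions, since main() is truncated in this diff:

```python
from dora import Node
import pyarrow as pa


def main():
    # Sketch only: the real main() body is not shown in full in this commit.
    model, tokenizer = load_model()
    history = []
    node = Node()

    for event in node:
        if event["type"] == "INPUT" and event["id"] == "text":  # assumed input id
            text = event["value"][0].as_py()
            # Respond only when an activation word is present (assumed gating logic).
            if any(word in ACTIVATION_WORDS for word in text.lower().split()):
                response, history = generate_response(model, tokenizer, text, history)
                node.send_output("text", pa.array([response]), event["metadata"])
```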
node-hub/dora-transformer/pyproject.toml (+1, -1)

@@ -1,6 +1,6 @@
 [project]
 name = "dora-transformer"
-version = "0.0.0"
+version = "1.0.0"
 authors = [{ name = "Shashwat Patil", email = "email@email.com" }]
 description = "dora-transformer"
 license = { text = "MIT" }

