jason-moore committed
Commit 0fcb40c · 1 Parent(s): f44fff8
Files changed (1)
  1. app.py +63 -42
app.py CHANGED
@@ -9,61 +9,82 @@ logger = logging.get_logger("transformers")
 
 # Load model directly from your Hugging Face repository
 def load_model():
-    tokenizer = AutoTokenizer.from_pretrained("omi-health/sum-small", trust_remote_code=False)
-    model = AutoModelForCausalLM.from_pretrained("omi-health/sum-small", trust_remote_code=False)
-
-    # Move model to GPU if available
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = model.to(device)
-
-    print(f"Using device: {device}")
-    if device == "cuda":
-        print(f"GPU: {torch.cuda.get_device_name(0)}")
-        print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
-
+    try:
+        # First try loading with half precision to save memory
+        tokenizer = AutoTokenizer.from_pretrained("omi-health/sum-small", trust_remote_code=False)
+
+        # Try to use GPU with half precision first
+        if torch.cuda.is_available():
+            model = AutoModelForCausalLM.from_pretrained(
+                "omi-health/sum-small",
+                trust_remote_code=False,
+                torch_dtype=torch.float16,  # Half precision
+                device_map="auto"  # Let the library decide best device mapping
+            )
+            print(f"Model loaded with float16 precision on GPU")
+            print(f"GPU: {torch.cuda.get_device_name(0)}")
+            print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
+        else:
+            # Fall back to CPU
+            model = AutoModelForCausalLM.from_pretrained("omi-health/sum-small", trust_remote_code=False)
+            print("Using CPU (no GPU available)")
+
+    except Exception as e:
+        print(f"Error loading model with GPU/half-precision: {e}")
+        print("Falling back to CPU...")
+        model = AutoModelForCausalLM.from_pretrained("omi-health/sum-small", trust_remote_code=False)
+
     return model, tokenizer
 
 def generate_soap_note(doctor_patient_conversation):
     if not doctor_patient_conversation.strip():
         return "Please enter a doctor-patient conversation."
 
-    # Create a properly formatted prompt with instructions
-    prompt = f"""<|user|>
+    try:
+        # Create a properly formatted prompt with instructions
+        prompt = f"""<|user|>
 Please generate a structured SOAP (Subjective, Objective, Assessment, Plan) note based on the following doctor-patient conversation:
 
 {doctor_patient_conversation}
 <|assistant|>"""
-    device = "cuda" if torch.cuda.is_available() else "cpu"
 
-    # Tokenize and generate with explicit padding settings
-    inputs = tokenizer(
-        prompt,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=tokenizer.model_max_length
-    )
-
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    generate_ids = model.generate(
-        inputs["input_ids"],
-        attention_mask=inputs["attention_mask"],  # Explicitly pass attention mask
-        max_length=2048,
-        num_beams=5,
-        no_repeat_ngram_size=2,
-        early_stopping=True
-    )
+        # Tokenize with reasonable max length
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=tokenizer.model_max_length - 512  # Reserve space for generation
+        )
+
+        # Move inputs to the correct device
+        device = next(model.parameters()).device  # Get device from model
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-    # Decode and extract the response part
-    decoded_response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
-    # Extract only the assistant's response (remove the prompt part)
-    if "<|assistant|>" in decoded_response:
-        decoded_response = decoded_response.split("<|assistant|>")[1].strip()
+        # Use more memory-efficient generation settings
+        generate_ids = model.generate(
+            inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_length=1024,  # Reduced from 2048
+            num_beams=2,  # Reduced from 5
+            no_repeat_ngram_size=2,
+            early_stopping=True
+        )
+
+        # Decode and extract the response part
+        decoded_response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0]
+
+        # Extract only the assistant's response
+        if "<|assistant|>" in decoded_response:
+            decoded_response = decoded_response.split("<|assistant|>")[1].strip()
 
-    logger.debug(f"Decoded response: {decoded_response}")
-    return decoded_response
+        return decoded_response
+
+    except RuntimeError as e:
+        if "CUDA out of memory" in str(e):
+            return "Error: GPU ran out of memory. Try with a shorter conversation or on a machine with more GPU memory."
+        else:
+            return f"Error during generation: {str(e)}"
 
 # Load model and tokenizer (this will run once when the app starts)
 model, tokenizer = load_model()
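
This hunk only covers model loading and generation; the UI wiring that exposes generate_soap_note() on the Space sits outside the changed lines and is not shown here. Assuming the Space uses a Gradio front end (an assumption, not taken from this commit), a minimal sketch of that wiring could look like this; the component choices, labels, and title are illustrative only:

import gradio as gr

# Hypothetical UI wiring, not part of this diff: expose generate_soap_note()
# (defined above) as a simple text-in/text-out demo.
demo = gr.Interface(
    fn=generate_soap_note,
    inputs=gr.Textbox(lines=15, label="Doctor-patient conversation"),
    outputs=gr.Textbox(lines=20, label="Generated SOAP note"),
    title="SOAP note generator (omi-health/sum-small)",
)

demo.launch()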