jason-moore committed
Commit 0fcb40c · 1 Parent(s): f44fff8
Files changed (1)
  1. app.py +63 -42
app.py CHANGED
@@ -9,61 +9,82 @@ logger = logging.get_logger("transformers")
 
 # Load model directly from your Hugging Face repository
 def load_model():
-    tokenizer = AutoTokenizer.from_pretrained("omi-health/sum-small", trust_remote_code=False)
-    model = AutoModelForCausalLM.from_pretrained("omi-health/sum-small", trust_remote_code=False)
-
-    # Move model to GPU if available
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = model.to(device)
-
-    print(f"Using device: {device}")
-    if device == "cuda":
-        print(f"GPU: {torch.cuda.get_device_name(0)}")
-        print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
-
+    try:
+        # First try loading with half precision to save memory
+        tokenizer = AutoTokenizer.from_pretrained("omi-health/sum-small", trust_remote_code=False)
+
+        # Try to use GPU with half precision first
+        if torch.cuda.is_available():
+            model = AutoModelForCausalLM.from_pretrained(
+                "omi-health/sum-small",
+                trust_remote_code=False,
+                torch_dtype=torch.float16,  # Half precision
+                device_map="auto"  # Let the library decide best device mapping
+            )
+            print(f"Model loaded with float16 precision on GPU")
+            print(f"GPU: {torch.cuda.get_device_name(0)}")
+            print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
+        else:
+            # Fall back to CPU
+            model = AutoModelForCausalLM.from_pretrained("omi-health/sum-small", trust_remote_code=False)
+            print("Using CPU (no GPU available)")
+
+    except Exception as e:
+        print(f"Error loading model with GPU/half-precision: {e}")
+        print("Falling back to CPU...")
+        model = AutoModelForCausalLM.from_pretrained("omi-health/sum-small", trust_remote_code=False)
+
     return model, tokenizer
 
 def generate_soap_note(doctor_patient_conversation):
     if not doctor_patient_conversation.strip():
         return "Please enter a doctor-patient conversation."
 
-    # Create a properly formatted prompt with instructions
-    prompt = f"""<|user|>
+    try:
+        # Create a properly formatted prompt with instructions
+        prompt = f"""<|user|>
 Please generate a structured SOAP (Subjective, Objective, Assessment, Plan) note based on the following doctor-patient conversation:
 
 {doctor_patient_conversation}
 <|assistant|>"""
-    device = "cuda" if torch.cuda.is_available() else "cpu"
 
-    # Tokenize and generate with explicit padding settings
-    inputs = tokenizer(
-        prompt,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=tokenizer.model_max_length
-    )
-
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    generate_ids = model.generate(
-        inputs["input_ids"],
-        attention_mask=inputs["attention_mask"],  # Explicitly pass attention mask
-        max_length=2048,
-        num_beams=5,
-        no_repeat_ngram_size=2,
-        early_stopping=True
-    )
+        # Tokenize with reasonable max length
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=tokenizer.model_max_length - 512  # Reserve space for generation
+        )
+
+        # Move inputs to the correct device
+        device = next(model.parameters()).device  # Get device from model
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-    # Decode and extract the response part
-    decoded_response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
-    # Extract only the assistant's response (remove the prompt part)
-    if "<|assistant|>" in decoded_response:
-        decoded_response = decoded_response.split("<|assistant|>")[1].strip()
+        # Use more memory-efficient generation settings
+        generate_ids = model.generate(
+            inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_length=1024,  # Reduced from 2048
+            num_beams=2,  # Reduced from 5
+            no_repeat_ngram_size=2,
+            early_stopping=True
+        )
+
+        # Decode and extract the response part
+        decoded_response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0]
+
+        # Extract only the assistant's response
+        if "<|assistant|>" in decoded_response:
+            decoded_response = decoded_response.split("<|assistant|>")[1].strip()
 
-    logger.debug(f"Decoded response: {decoded_response}")
-    return decoded_response
+        return decoded_response
+
+    except RuntimeError as e:
+        if "CUDA out of memory" in str(e):
+            return "Error: GPU ran out of memory. Try with a shorter conversation or on a machine with more GPU memory."
+        else:
+            return f"Error during generation: {str(e)}"
 
 # Load model and tokenizer (this will run once when the app starts)
 model, tokenizer = load_model()
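
This hunk only covers model loading and generation; the UI wiring that exposes generate_soap_note() on the Space sits outside the changed lines and is not shown here. Assuming the Space uses a Gradio front end (an assumption, not taken from this commit), a minimal sketch of that wiring could look like this; the component choices, labels, and title are illustrative only:

import gradio as gr

# Hypothetical UI wiring, not part of this diff: expose generate_soap_note()
# (defined above) as a simple text-in/text-out demo.
demo = gr.Interface(
    fn=generate_soap_note,
    inputs=gr.Textbox(lines=15, label="Doctor-patient conversation"),
    outputs=gr.Textbox(lines=20, label="Generated SOAP note"),
    title="SOAP note generator (omi-health/sum-small)",
)

demo.launch()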