BioWhisper / app.py
ashhal's picture
Update app.py
e75c198 verified
raw
history blame
1.84 kB
import gradio as gr
import fitz # PyMuPDF
from transformers import pipeline
import re
# Shared summarization pipeline, built once when the module is imported so
# that every request reuses it. A distilled BART checkpoint is used because
# it is lighter and faster than the full-size model.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF.

    Args:
        pdf_file: The PDF to read. Accepted forms:
            * a file-like object exposing ``read()`` that yields the PDF bytes,
            * raw ``bytes`` / ``bytearray`` holding the PDF,
            * anything else is treated as a filesystem path and handed to
              PyMuPDF as the file name (this covers upload components that
              pass a path string instead of an open file).

    Returns:
        The text of all pages concatenated in page order, with a newline
        appended after each page. An empty string when the document has no
        pages.
    """
    if hasattr(pdf_file, "read"):
        open_kwargs = {"stream": pdf_file.read(), "filetype": "pdf"}
    elif isinstance(pdf_file, (bytes, bytearray)):
        open_kwargs = {"stream": bytes(pdf_file), "filetype": "pdf"}
    else:
        open_kwargs = {"filename": pdf_file}

    # The context manager closes the document even if a page fails to parse,
    # so the underlying buffer/file handle is not leaked.
    with fitz.open(**open_kwargs) as doc:
        return "".join(page.get_text() + "\n" for page in doc)
def chunk_text(text, max_words=500):
    """Split *text* into consecutive pieces of at most *max_words* words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (including newlines) is collapsed to a single space in the output.

    Args:
        text: The string to split.
        max_words: Upper bound on the number of words per piece.

    Returns:
        A list of strings in original order; empty when *text* contains no
        words. The last piece may be shorter than *max_words*.

    Raises:
        ValueError: If *max_words* is 0 (it is used as the ``range`` step).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(" ".join(window))
    return pieces
def simplify_summary(summary):
    """Strip hospital/doctor boilerplate and prefix a plain-language header.

    Starting at any case-insensitive occurrence of "Mayo Hospital",
    "Lahore Hospital" or "submitted by Dr.", the rest of that line (and its
    trailing newline, if any) is removed. A phrase with nothing after it on
    the line is left alone, because the pattern needs at least one following
    character.

    Args:
        summary: The generated summary text.

    Returns:
        The header line "🩺 In simple terms:" followed by the cleaned,
        whitespace-trimmed summary.
    """
    # NOTE(review): the match runs to end of line, so on a single-line summary
    # everything after the first mention is dropped -- confirm this is intended.
    boilerplate = re.compile(
        r"\b(?:Mayo Hospital|Lahore Hospital|submitted by Dr\.).+\n?",
        flags=re.IGNORECASE,
    )
    cleaned = boilerplate.sub("", summary).strip()
    return f"🩺 In simple terms:\n{cleaned}"
def process_report(pdf_file):
    """Summarize an uploaded medical-report PDF.

    The PDF text is extracted, the leading block (everything before the first
    blank line) is dropped as boilerplate, the remainder is split into
    600-word chunks, each chunk is summarized, and the pieces are joined.

    Args:
        pdf_file: The uploaded file as delivered by the UI, or ``None`` when
            nothing was uploaded. It is passed unchanged to
            ``extract_text_from_pdf``.

    Returns:
        A ``(summary, simplified)`` tuple of strings. On failure the first
        element is an error message starting with "❌" and the second is
        empty.
    """
    # Submitting the form without choosing a file yields None; report that
    # instead of crashing while trying to read it.
    if pdf_file is None:
        return "❌ Please upload a PDF file first.", ""

    text = extract_text_from_pdf(pdf_file)
    if not text.strip():
        return "❌ Couldn't extract text from the PDF.", ""

    # Remove irrelevant boilerplate: skip everything up to the first blank
    # line. If nothing meaningful follows it (or there is no blank line at
    # all), fall back to the whole text rather than summarizing nothing.
    _, _, body = text.partition("\n\n")
    core_text = body if body.strip() else text

    summaries = [
        summarizer(
            chunk,
            max_length=150,
            min_length=30,
            do_sample=False,
            # A 600-word chunk can tokenize to more than the model accepts;
            # truncate instead of failing inside the model.
            truncation=True,
        )[0]["summary_text"]
        for chunk in chunk_text(core_text, max_words=600)
    ]
    final_summary = " ".join(summaries)
    return final_summary, simplify_summary(final_summary)
# Gradio front end: one PDF upload in, two text boxes out (the model summary
# and its simplified version), both produced by process_report.
demo = gr.Interface(
    fn=process_report,
    inputs=gr.File(label="Upload Medical Report PDF"),
    outputs=[
        gr.Textbox(label="AI-Generated Summary", lines=8),
        gr.Textbox(label="Simplified Explanation", lines=8),
    ],
    # Hospital emoji (U+1F3E5); the title previously contained its UTF-8
    # bytes mis-decoded into unrelated characters.
    title="🏥 Medical Report Summarizer",
    description="Speeds up summarization by chunking text & uses a lighter distil-BART model, focusing on core medical findings.",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()