# BioWhisper / app.py
# ashhal: Update app.py (d37d30b, verified)
import gradio as gr
import fitz # PyMuPDF
import pdfplumber
import pytesseract
from pdf2image import convert_from_bytes
from transformers import pipeline
from PIL import Image
import io
# Build the Hugging Face summarization pipeline once, at import time, so that
# every call to process_pdf reuses the same module-level `summarizer` object.
# The model is selected by its hub id, "Falconsai/text_summarization".
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
# Optional: on Windows, uncomment the line below and point it at the local
# Tesseract executable so pytesseract can find it.
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
def _read_pdf_bytes(pdf_file):
    """Return the raw PDF bytes from bytes, a path string, or a file-like object."""
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if isinstance(pdf_file, str):
        with open(pdf_file, "rb") as handle:
            return handle.read()
    # File-like object: rewind first in case it has already been consumed.
    pdf_file.seek(0)
    return pdf_file.read()


def _extract_with_pymupdf(pdf_bytes):
    """Return the embedded text of every page as read by PyMuPDF."""
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)


def _extract_with_pdfplumber(pdf_bytes):
    """Return the embedded text of every page as read by pdfplumber."""
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        # Join pages with a newline so the last word of one page is not glued
        # to the first word of the next; `or ""` covers pages with no text.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def _extract_with_ocr(pdf_bytes):
    """Rasterize the PDF with pdf2image and OCR each page image with Tesseract."""
    images = convert_from_bytes(pdf_bytes)
    return "".join(pytesseract.image_to_string(img) for img in images)


def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, falling back from text layers to OCR.

    The extractors are tried in order: PyMuPDF, pdfplumber, then OCR
    (pdf2image + pytesseract). Each stage runs independently, so an exception
    in one stage does not prevent the later stages from being attempted.

    Args:
        pdf_file: A file-like object supporting ``seek``/``read``, raw
            ``bytes``/``bytearray``, or a filesystem path string. ``None``
            (nothing uploaded) is reported as an error.

    Returns:
        A ``(method, text)`` tuple. ``method`` is ``"text"`` when an embedded
        text layer was found, ``"ocr"`` when the text came from OCR, and
        ``"error"`` otherwise, in which case ``text`` is a user-facing error
        message. This function never raises.
    """
    if pdf_file is None:
        return "error", "❌ No PDF file was provided."

    # This is the UI boundary: any failure is turned into an error tuple
    # instead of propagating, hence the deliberately broad except clauses.
    try:
        pdf_bytes = _read_pdf_bytes(pdf_file)
    except Exception as e:
        return "error", f"❌ Failed to read PDF. Error: {str(e)}"

    extractors = (
        ("PyMuPDF", "text", _extract_with_pymupdf),
        ("pdfplumber", "text", _extract_with_pdfplumber),
        ("OCR", "ocr", _extract_with_ocr),
    )
    failures = []
    for label, method, extractor in extractors:
        try:
            extracted = extractor(pdf_bytes)
        except Exception as e:
            # Remember why this stage failed, then fall through to the next.
            failures.append(f"{label}: {e}")
            continue
        if extracted.strip():
            return method, extracted

    if failures:
        details = "; ".join(failures)
        return "error", f"❌ Failed to read PDF. Error: {details}"
    return "error", "❌ Could not extract any text from PDF."
def process_pdf(pdf_file):
    """Run the full pipeline for one uploaded report.

    Extracts the report text via ``extract_text_from_pdf``, summarizes the
    first 1000 characters with the module-level ``summarizer``, and returns a
    3-tuple ``(extracted_text, summary, explanation)`` for the three output
    boxes. When extraction fails, the error message takes the first slot and
    the other two are ``"Error"``; when summarization fails, the extracted
    text is still returned alongside the failure details.
    """
    status, report_text = extract_text_from_pdf(pdf_file)
    if status == "error":
        return report_text, "Error", "Error"

    try:
        # Only the leading slice is sent to the model to keep its input short.
        model_input = report_text[:1000]
        results = summarizer(
            model_input,
            max_length=120,
            min_length=30,
            do_sample=False,
        )
        condensed = results[0]["summary_text"].strip()
    except Exception as exc:
        return report_text, "❌ Summarization failed.", f"Error: {exc!s}"

    # Fixed disclaimer shown next to every successful summary.
    disclaimer = (
        "This summary simplifies the medical content extracted from your report. "
        "If there are specific medical terms or values (e.g. Hemoglobin, WBC), the app tries to interpret them. "
        "For full interpretation, consult a doctor."
    )
    return report_text, condensed, disclaimer
# Gradio UI: one PDF upload in, three text panels out (extracted text,
# AI summary, fixed explanation), all produced by process_pdf.
# NOTE(review): type="file" is the Gradio 3.x spelling; Gradio 4+ accepts only
# "filepath" or "binary" here. Confirm the pinned Gradio version before
# upgrading, and check that process_pdf can handle the new value type.
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload Medical Report (PDF)", type="file"),
    outputs=[
        # The emoji below were mis-encoded (mojibake) in the original labels.
        gr.Textbox(label="📄 Extracted Report Text"),
        gr.Textbox(label="🧠 AI-Generated Summary"),
        gr.Textbox(label="📘 Simplified Explanation"),
    ],
    title="🧪 Medical Report Reader (Free)",
    description=(
        "Upload a medical report in PDF (scanned or digital). The app will extract the text, summarize it using AI, "
        "and give a simplified explanation."
    ),
)

if __name__ == "__main__":
    # Launch the app only when executed as a script, not when imported.
    iface.launch()