Spaces:
Runtime error
Runtime error
import io
import os

import fitz  # PyMuPDF
import gradio as gr
import pdfplumber
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
from transformers import pipeline
# Summarization pipeline from Hugging Face (free model). It is created once at
# import time and reused for every request handled by the app.
_SUMMARIZER_MODEL = "Falconsai/text_summarization"
summarizer = pipeline("summarization", model=_SUMMARIZER_MODEL)
# Optional: Configure Tesseract path for Windows users | |
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" | |
def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of an upload given as bytes, a path, or a file object."""
    if isinstance(pdf_file, (bytes, bytearray, memoryview)):
        return bytes(pdf_file)
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as fh:
            return fh.read()
    if hasattr(pdf_file, "read"):
        # Rewind first: the object may already have been read by the caller.
        if hasattr(pdf_file, "seek"):
            pdf_file.seek(0)
        return pdf_file.read()
    # Some upload wrappers only expose the temp-file path through ``.name``.
    path = getattr(pdf_file, "name", None)
    if isinstance(path, (str, os.PathLike)):
        with open(path, "rb") as fh:
            return fh.read()
    raise TypeError(f"Unsupported PDF input type: {type(pdf_file).__name__}")


def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, falling back to OCR for scanned documents.

    Three strategies are tried in order and the first one that yields
    non-whitespace text wins: PyMuPDF, then pdfplumber, then OCR of the
    rendered pages with pdf2image + pytesseract.

    Args:
        pdf_file: The uploaded PDF as raw bytes, a filesystem path, a binary
            file-like object, or an object whose ``name`` attribute is a path.

    Returns:
        A ``(method, text)`` tuple. ``method`` is ``"text"`` when an embedded
        text layer was found, ``"ocr"`` when the text came from OCR, and
        ``"error"`` when nothing could be extracted; in the error case
        ``text`` holds a user-facing message instead of document text.
        This function never raises: failures are reported through the
        ``"error"`` result so the UI can display them.
    """
    if pdf_file is None:
        return "error", "❌ No PDF file was uploaded."

    try:
        pdf_bytes = _read_pdf_bytes(pdf_file)
        if not pdf_bytes:
            return "error", "❌ The uploaded PDF is empty."

        # Step 1: Try PyMuPDF
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        if text.strip():
            return "text", text

        # Step 2: Try pdfplumber (extract_text() may return None for a page)
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            text = "".join(page.extract_text() or "" for page in pdf.pages)
        if text.strip():
            return "text", text

        # Step 3: OCR via pdf2image + pytesseract
        ocr_text = "".join(
            pytesseract.image_to_string(img)
            for img in convert_from_bytes(pdf_bytes)
        )
        if ocr_text.strip():
            return "ocr", ocr_text

        return "error", "❌ Could not extract any text from PDF."
    except Exception as e:
        # Deliberately broad: this is the UI boundary, and any parser or OCR
        # failure should surface as a message rather than crash the app.
        return "error", f"❌ Failed to read PDF. Error: {e}"
def process_pdf(pdf_file):
    """Extract the text of an uploaded PDF and summarize it.

    Args:
        pdf_file: The uploaded PDF, passed through unchanged to
            ``extract_text_from_pdf``.

    Returns:
        A 3-tuple of strings for the three output boxes:
        ``(extracted_text, summary, explanation)``. When extraction fails the
        tuple is ``(error_message, "Error", "Error")``; when summarization
        fails the extracted text is still returned, followed by a failure
        marker and the error detail.
    """
    method, extracted_text = extract_text_from_pdf(pdf_file)
    if method == "error":
        return extracted_text, "Error", "Error"

    # Shorten for model input: only the first 1000 characters are summarized.
    short_text = extracted_text[:1000]
    try:
        # truncation=True lets the pipeline clip the tokenized input if it is
        # still too long for the model (e.g. digit-heavy lab values).
        result = summarizer(
            short_text,
            max_length=120,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        summary = result[0]["summary_text"].strip()
    except Exception as e:
        # Keep the extracted text visible even if the model call fails.
        return extracted_text, "❌ Summarization failed.", f"Error: {e}"

    explanation = (
        "This summary simplifies the medical content extracted from your report. "
        "If there are specific medical terms or values (e.g. Hemoglobin, WBC), the app tries to interpret them. "
        "For full interpretation, consult a doctor."
    )
    return extracted_text, summary, explanation
# Gradio UI


def _process_upload(pdf_bytes):
    """Hand an upload received as raw bytes to process_pdf as a file-like object."""
    if isinstance(pdf_bytes, (bytes, bytearray)):
        return process_pdf(io.BytesIO(pdf_bytes))
    # None (nothing uploaded) or an already file-like value: pass through.
    return process_pdf(pdf_bytes)


iface = gr.Interface(
    fn=_process_upload,
    # type="binary" delivers the upload as bytes. The previous type="file" is
    # not accepted by newer Gradio releases, which fail at startup on it.
    inputs=gr.File(
        label="Upload Medical Report (PDF)",
        type="binary",
        file_types=[".pdf"],
    ),
    outputs=[
        gr.Textbox(label="Extracted Report Text"),
        gr.Textbox(label="🧠 AI-Generated Summary"),
        gr.Textbox(label="Simplified Explanation"),
    ],
    title="🧪 Medical Report Reader (Free)",
    description=(
        "Upload a medical report in PDF (scanned or digital). The app will extract the text, summarize it using AI, "
        "and give a simplified explanation."
    ),
)

if __name__ == "__main__":
    iface.launch()