Spaces:

ashhal
/

BioWhisper

Runtime error

File size: 2,924 Bytes

7a0cfef
0c0dd0e
d37d30b
 
 
0c0dd0e
d37d30b
 
7a0cfef
d37d30b
 
 
 
 
75faa01
0c0dd0e
46b466f
d37d30b
 
 
 
 
5aad2ae
 
 
d37d30b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46b466f
d37d30b
 
 
 
 
 
 
 
5aad2ae
d37d30b
 
 
5aad2ae
d37d30b
 
 
 
 
 
 
 
 
 
e75c198
d37d30b
 
 
 
7a0cfef
d37d30b
 
 
7a0cfef
d37d30b
 
 
 
 
7a0cfef
6b2202e
e75c198
d37d30b

import gradio as gr
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from pdf2image import convert_from_bytes
from transformers import pipeline
from PIL import Image
import io

# Summarization pipeline, created once at import time and shared by every
# call to process_pdf. The model id is the Hugging Face Hub identifier.
SUMMARIZER_MODEL = "Falconsai/text_summarization"
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL)

# Optional: Configure Tesseract path for Windows users
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def _read_pdf_bytes(pdf_file):
    """Return the raw bytes behind *pdf_file*, whatever form the upload takes.

    Accepts raw ``bytes``/``bytearray``, a filesystem path (``str`` or any
    ``os.PathLike``), a file-like object exposing ``read()``, or an object
    that only exposes a ``name`` attribute holding a path.

    Raises:
        TypeError: if *pdf_file* is none of the supported shapes.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if isinstance(pdf_file, str) or hasattr(pdf_file, "__fspath__"):
        with open(pdf_file, "rb") as handle:
            return handle.read()
    if hasattr(pdf_file, "read"):
        # Rewind first so a handle that was already consumed is read in full.
        if hasattr(pdf_file, "seek"):
            pdf_file.seek(0)
        return pdf_file.read()
    path = getattr(pdf_file, "name", None)
    if isinstance(path, str):
        with open(path, "rb") as handle:
            return handle.read()
    raise TypeError(f"Unsupported PDF input type: {type(pdf_file).__name__}")


def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, falling back from text layers to OCR.

    The extractors are tried in order and the first one that yields
    non-whitespace text wins:

    1. PyMuPDF (``fitz``) page text.
    2. pdfplumber page text.
    3. OCR: pages rendered with pdf2image, then read with pytesseract.

    Args:
        pdf_file: The uploaded PDF as raw bytes, a filesystem path, or a
            file-like object (anything with ``read()``; it is rewound first
            when it also has ``seek()``).

    Returns:
        A ``(method, payload)`` tuple. ``method`` is ``"text"`` when a text
        layer was found, ``"ocr"`` when OCR produced the text, or ``"error"``
        when nothing could be extracted; ``payload`` is the extracted text
        or, for ``"error"``, a human-readable message. This function never
        raises: any exception is converted into an ``"error"`` result.
    """
    if pdf_file is None:
        return "error", "❌ No PDF file was provided."

    try:
        pdf_bytes = _read_pdf_bytes(pdf_file)
        if not pdf_bytes:
            return "error", "❌ The uploaded PDF is empty."

        # Step 1: Try PyMuPDF
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        if text.strip():
            return "text", text

        # Step 2: Try pdfplumber (extract_text() may return None for a page)
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            text = "".join(page.extract_text() or "" for page in pdf.pages)
        if text.strip():
            return "text", text

        # Step 3: OCR via pdf2image + pytesseract
        images = convert_from_bytes(pdf_bytes)
        ocr_text = "".join(pytesseract.image_to_string(img) for img in images)
        if ocr_text.strip():
            return "ocr", ocr_text

        return "error", "❌ Could not extract any text from PDF."

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return "error", f"❌ Failed to read PDF. Error: {str(e)}"

def process_pdf(pdf_file):
    """Extract a report's text and summarize it for display.

    Args:
        pdf_file: The uploaded PDF, passed straight to
            ``extract_text_from_pdf``.

    Returns:
        A 3-tuple ``(report_text, summary, explanation)``. When extraction
        fails the tuple is ``(error_message, "Error", "Error")``; when the
        summarizer raises, the report text is still returned alongside a
        failure notice and the exception text.
    """
    status, report_text = extract_text_from_pdf(pdf_file)
    if status == "error":
        return report_text, "Error", "Error"

    try:
        # Only the first 1000 characters are sent to the model.
        model_input = report_text[:1000]
        results = summarizer(model_input, max_length=120, min_length=30, do_sample=False)
        summary_text = results[0]["summary_text"].strip()
    except Exception as e:
        return report_text, "❌ Summarization failed.", f"Error: {str(e)}"

    # Fixed disclaimer shown next to every successful summary.
    explanation = (
        "This summary simplifies the medical content extracted from your report. "
        "If there are specific medical terms or values (e.g. Hemoglobin, WBC), the app tries to interpret them. "
        "For full interpretation, consult a doctor."
    )
    return report_text, summary_text, explanation

# Gradio UI: one PDF upload in, three text boxes out (in the same order as
# the tuple returned by process_pdf).
# NOTE(review): type="file" hands process_pdf a file object; newer Gradio
# releases appear to accept only "filepath" or "binary" here — confirm
# against the installed Gradio version before changing it.
report_upload = gr.File(label="Upload Medical Report (PDF)", type="file")

result_boxes = [
    gr.Textbox(label="📄 Extracted Report Text"),
    gr.Textbox(label="🧠 AI-Generated Summary"),
    gr.Textbox(label="📘 Simplified Explanation"),
]

app_description = (
    "Upload a medical report in PDF (scanned or digital). The app will extract the text, summarize it using AI, "
    "and give a simplified explanation."
)

iface = gr.Interface(
    fn=process_pdf,
    inputs=report_upload,
    outputs=result_boxes,
    title="🧪 Medical Report Reader (Free)",
    description=app_description,
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()