pdf-to-markdown

Running

Biifruu committed 17 days ago

Commit

3920f3b

verified ·

1 Parent(s): b697ac0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,51 +1,27 @@
 import spaces
 import gradio as gr
-from pypdf import PdfReader
-import ocrmypdf
-def extract_text_from_pdf(reader):
-    full_text = ""
-    for idx, page in enumerate(reader.pages):
-        text = page.extract_text()
-        if len(text) > 0:
-            full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
-    return full_text.strip()
 @spaces.GPU
 def convert(pdf_file):
-    reader = PdfReader(pdf_file)
-    # Extract metadata
-    metadata = {
-        "author": reader.metadata.author,
-        "creator": reader.metadata.creator,
-        "producer": reader.metadata.producer,
-        "subject": reader.metadata.subject,
-        "title": reader.metadata.title,
-    }
-    # Extract text
-    full_text = extract_text_from_pdf(reader)
-    # Check if there are any images
-    image_count = 0
-    for page in reader.pages:
-        image_count += len(page.images)
-    # If there are images and not much content, perform OCR on the document
-    if image_count > 0 and len(full_text) < 1000:
-        out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
-        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
-        # Re-extract text
-        reader = PdfReader(pdf_file)
-        full_text = extract_text_from_pdf(reader)
-    return full_text, metadata
 gr.Interface(
     convert,

 import spaces
 import gradio as gr
+from pdf2image import convert_from_path
+import pytesseract
+from PIL import Image
+import os
 @spaces.GPU
 def convert(pdf_file):
+    pages = convert_from_path(pdf_file)
+    markdown_output = ""
+    metadata = {}  # Optional: metadata could be extracted with PyMuPDF if desired
+    for idx, page_image in enumerate(pages):
+        # Run OCR on the page image
+        text = pytesseract.image_to_string(page_image)
+        if text.strip() == "":
+            # If there is no text, insert an empty link
+            markdown_output += f"[imagen]()\n\n"
+        else:
+            markdown_output += text.strip() + "\n\n"
+    return markdown_output.strip(), metadata
 gr.Interface(
     convert,