pdf-to-markdown

Sleeping

Biifruu committed on Jun 2

Commit

f79c813

verified ·

1 Parent(s): d7cc8b9

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import spaces
 import gradio as gr
 import fitz  # PyMuPDF
@@ -45,22 +47,27 @@ def needs_ocr(doc):
 @spaces.GPU
 def convert(pdf_file):
-    original_doc = fitz.open(pdf_file)
-    if needs_ocr(original_doc):
-        try:
-            ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
-            ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
-            doc = fitz.open(ocr_temp_path)
-            os.remove(ocr_temp_path)
-        except Exception as e:
-            return f"Error al aplicar OCR: {e}", {}
-    else:
-        doc = original_doc
-    markdown = extract_text_markdown(doc)
-    metadata = {}  # Puedes agregar metadatos aquí si lo necesitas
-    return markdown, metadata
 gr.Interface(
     fn=convert,

+from PIL import Image
+import pytesseract
 import spaces
 import gradio as gr
 import fitz  # PyMuPDF
 @spaces.GPU
 def convert(pdf_file):
     """Convert a PDF to Markdown, falling back to OCR on low-text pages.

     Each page is handled independently: when the page's embedded text is
     longer than 30 characters it is rendered with ``extract_text_markdown``;
     otherwise the page is rasterized at 300 dpi and read with Tesseract
     using the Spanish (``"spa"``) language data.

     Args:
         pdf_file: Value handed to ``fitz.open`` (presumably the path of the
             uploaded PDF supplied by the UI -- confirm against the caller).

     Returns:
         A ``(markdown, metadata)`` tuple. ``markdown`` holds the page
         contents, each followed by a ``---`` rule; ``metadata`` is always
         an empty dict.
     """
     # Pages with this many characters or fewer are treated as scanned images.
     min_text_chars = 30
     page_chunks = []
     # The context manager closes the document even if extraction or OCR
     # raises; the original left the handle open.
     with fitz.open(pdf_file) as doc:
         for page in doc:
             text = page.get_text("text").strip()
             if len(text) > min_text_chars:
                 # Page carries a usable text layer.
                 content = extract_text_markdown([page])
             else:
                 # Little or no text: rasterize the page and OCR the image.
                 # NOTE(review): "RGB" assumes the pixmap has no alpha
                 # channel, which matches get_pixmap's defaults -- confirm.
                 pix = page.get_pixmap(dpi=300)
                 img = Image.frombytes(
                     "RGB", (pix.width, pix.height), pix.samples
                 )
                 content = pytesseract.image_to_string(img, lang="spa").strip()
             page_chunks.append(content + "\n\n---\n\n")
     # strip() only trims whitespace, so the last page's "---" rule stays at
     # the end of the result, exactly as before.
     return "".join(page_chunks).strip(), {}
 gr.Interface(
     fn=convert,