pdf-to-markdown

Running

Biifruu committed 19 days ago

Commit

9e2e286

verified ·

1 Parent(s): 565985f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,20 +1,16 @@
 import spaces
 import gradio as gr
 import fitz  # PyMuPDF
-import tempfile
-import os
 from PIL import Image
 import pytesseract
 def clean_ocr_text(text):
     """Normalize raw OCR output into compact, newline-separated text.

     Every line of ``text`` is stripped of leading and trailing whitespace;
     lines that end up empty are discarded. The surviving lines are joined
     with a single ``"\\n"`` and returned (no trailing newline).
     """
     stripped = (raw.strip() for raw in text.splitlines())
     # After strip() a line is either empty or holds non-whitespace text,
     # so a plain truthiness check is enough to drop blank lines.
     return "\n".join(piece for piece in stripped if piece)
 def extract_text_markdown(doc):
@@ -68,10 +64,10 @@ def extract_text_markdown(doc):
 @spaces.GPU
 def convert(pdf_file):
-    image_paths.append(image_path)
     doc = fitz.open(pdf_file)
     markdown_output = ""
     image_counter = 1
     for page_num in range(len(doc)):
         page = doc[page_num]
@@ -81,13 +77,15 @@ def convert(pdf_file):
             # Página con texto normal
             markdown_output += extract_text_markdown([page]) + "\n"
         else:
-            # Página vacía o imagen: hacer OCR
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-            # Guardar imagen escaneada completa
             image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
             img.save(image_path)
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
             try:
@@ -112,4 +110,3 @@ gr.Interface(
         gr.Gallery(label="Imágenes extraídas").style(grid=[2], height="auto")
     ],
 ).launch()

 import spaces
 import gradio as gr
 import fitz  # PyMuPDF
 from PIL import Image
 import pytesseract
 def clean_ocr_text(text):
     """Return ``text`` with each line trimmed and blank lines removed.

     The input is split on line boundaries, surrounding whitespace is
     stripped from every line, empty results are dropped, and the remaining
     lines are re-joined with ``"\\n"``.
     """
     trimmed = map(str.strip, text.splitlines())
     # filter(None, ...) keeps only truthy (non-empty) strings; a stripped
     # line can never be whitespace-only, so nothing else needs checking.
     return "\n".join(filter(None, trimmed))
 def extract_text_markdown(doc):
 @spaces.GPU
 def convert(pdf_file):
     doc = fitz.open(pdf_file)
     markdown_output = ""
     image_counter = 1
+    image_paths = []
     for page_num in range(len(doc)):
         page = doc[page_num]
             # Página con texto normal
             markdown_output += extract_text_markdown([page]) + "\n"
         else:
+            # Página vacía o con imagen: hacer OCR
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            # Guardar imagen completa
             image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
             img.save(image_path)
+            image_paths.append(image_path)
             markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
             try:
         gr.Gallery(label="Imágenes extraídas").style(grid=[2], height="auto")
     ],
 ).launch()