Spaces:
Sleeping
Sleeping
File size: 2,331 Bytes
f79c813 145d936 d4b4544 4337f3a 7b1bb08 dd29269 4337f3a e62d9f5 3920f3b c20f519 c1d7645 e62d9f5 d4b4544 145d936 d4b4544 e62d9f5 beb65ba d4b4544 479e852 beb65ba 891d450 beb65ba 7064c41 479e852 3e3d3c7 7b1bb08 3e3d3c7 b3fecd4 d4b4544 e62d9f5 c20f519 e62d9f5 f79c813 e62d9f5 f79c813 4337f3a f79c813 145d936 dd29269 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
from PIL import Image
import pytesseract
import spaces
import gradio as gr
import fitz # PyMuPDF
import ocrmypdf
import tempfile
import os
def extract_text_markdown(doc, gap_threshold=10):
    """Render the pages of *doc* as Markdown-like text, ordered top to bottom.

    For every page, text lines and image placeholders are collected together
    with their top ``y`` coordinate, sorted by that coordinate, and written
    one per line. A blank line is inserted whenever two consecutive elements
    are more than ``gap_threshold`` units apart vertically. Each page is
    terminated by a ``---`` rule.

    Args:
        doc: Iterable of page objects exposing ``get_text("dict")`` whose
            result has a ``"blocks"`` list (a list of pages works as well as
            a whole document).
        gap_threshold: Vertical distance above which a blank line is emitted
            between two consecutive elements. Defaults to 10.

    Returns:
        The accumulated text with leading/trailing whitespace stripped. Image
        placeholders are numbered consecutively across all pages of this call.
    """
    parts = []
    image_counter = 1
    for page in doc:
        elements = []  # (top_y, rendered_text) pairs for this page
        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    line_text = " ".join(
                        span["text"] for span in line["spans"]
                    ).strip()
                    if line_text:
                        elements.append((line["bbox"][1], line_text))
            elif block["type"] == 1:  # image block
                elements.append(
                    (block["bbox"][1], f"[imagen_{image_counter}]()")
                )
                image_counter += 1

        # Sort on the y coordinate only; the sort is stable, so elements that
        # share a y value keep the order in which they were extracted.
        elements.sort(key=lambda item: item[0])

        previous_y = None
        for y, content in elements:
            if previous_y is not None and abs(y - previous_y) > gap_threshold:
                parts.append("\n")
            parts.append(content + "\n")
            previous_y = y
        parts.append("\n---\n\n")
    return "".join(parts).strip()
def needs_ocr(doc):
    """Report whether *doc* should be run through OCR.

    Args:
        doc: Re-iterable collection of page objects exposing ``get_text()``
            and ``get_images(full=True)``; it may be traversed twice.

    Returns:
        True when the stripped text of all pages totals fewer than 500
        characters, or when any page reports at least one image; otherwise
        False.
    """
    text_length = sum(len(page.get_text().strip()) for page in doc)
    if text_length < 500:
        return True
    # Only the presence of an image matters, so stop at the first page that
    # reports one instead of counting every image in the document.
    return any(page.get_images(full=True) for page in doc)
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown text, falling back to OCR per page.

    Pages whose extracted text is longer than 30 characters are rendered with
    ``extract_text_markdown``; every other page is rasterised at 300 dpi and
    passed to Tesseract with the Spanish language pack. Each page is followed
    by a single ``---`` rule.

    Args:
        pdf_file: Path of the PDF file to open.

    Returns:
        A ``(markdown, metadata)`` tuple. ``markdown`` is the stripped text of
        all pages; ``metadata`` is always an empty dict.
    """
    page_chunks = []
    # Open the document as a context manager so it is closed even when
    # rendering or OCR of a page raises.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text = page.get_text("text").strip()
            if len(text) > 30:
                # Page with a regular text layer.
                page_md = extract_text_markdown([page])
                # The helper terminates its output with its own "---" rule;
                # drop it so text pages and OCR pages both end up with exactly
                # one rule (the one appended below).
                if page_md.endswith("---"):
                    page_md = page_md[:-3].rstrip()
            else:
                # Page without usable text: render it to an image and OCR it.
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes(
                    "RGB", [pix.width, pix.height], pix.samples
                )
                page_md = pytesseract.image_to_string(img, lang="spa").strip()
            page_chunks.append(page_md + "\n" + "\n---\n\n")
    return "".join(page_chunks).strip(), {}
# Build the web UI: one PDF upload (handed to `convert` as a file path) and
# two outputs matching the (markdown, metadata) tuple that `convert` returns.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Sube tu PDF", type="filepath")],
    outputs=[
        gr.Text(label="Markdown estructurado"),
        gr.JSON(label="Metadata"),
    ],
)
# Start serving the interface.
demo.launch()
|