Spaces:
Running
Running
File size: 2,094 Bytes
145d936 d4b4544 4337f3a 7b1bb08 4337f3a e62d9f5 3920f3b 891d450 c1d7645 e62d9f5 d4b4544 145d936 d4b4544 e62d9f5 beb65ba d4b4544 3e3d3c7 beb65ba 891d450 beb65ba 7b1bb08 d4b4544 891d450 3e3d3c7 7b1bb08 3e3d3c7 b3fecd4 d4b4544 e62d9f5 7b1bb08 e62d9f5 4337f3a e62d9f5 7b1bb08 e62d9f5 145d936 e62d9f5 7b1bb08 145d936 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import spaces
import gradio as gr
import fitz # PyMuPDF
import ocrmypdf
import tempfile
import os
def extract_text_markdown(doc, line_gap=10):
    """Render the text lines and image placeholders of *doc* as Markdown-like text.

    Each page is processed independently: its text lines and image blocks are
    collected, ordered top-to-bottom by the ``y`` coordinate of their bounding
    box, and written one per line. A blank line is inserted whenever two
    consecutive elements are more than ``line_gap`` apart vertically, and every
    page is terminated by a ``---`` rule.

    Args:
        doc: Iterable of page objects exposing ``get_text("dict")`` that returns
            a mapping with a ``"blocks"`` list (block ``type`` 0 is text,
            ``type`` 1 is an image).
        line_gap: Vertical distance above which two consecutive elements are
            separated by an empty line. Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        The assembled text with leading and trailing whitespace stripped.
    """
    # Collect fragments and join once at the end instead of growing a string
    # with ``+=`` inside nested loops.
    parts = []
    image_counter = 1  # Image numbering runs across the whole document.

    for page in doc:
        elements = []
        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # Text block
                for line in block["lines"]:
                    line_text = " ".join(
                        span["text"] for span in line["spans"]
                    ).strip()
                    if line_text:
                        elements.append((line["bbox"][1], line_text))
            elif block["type"] == 1:  # Image block
                # Empty-target link with a unique name as image placeholder.
                elements.append(
                    (block["bbox"][1], f"[imagen_{image_counter}]()")
                )
                image_counter += 1

        # Order by vertical position; the sort is stable, so elements sharing
        # the same y keep the order in which the blocks were reported.
        elements.sort(key=lambda item: item[0])

        # Rebuild the page, adding a blank line at large vertical jumps.
        previous_y = None
        for y, content in elements:
            if previous_y is not None and abs(y - previous_y) > line_gap:
                parts.append("\n")
            parts.append(content + "\n")
            previous_y = y

        parts.append("\n---\n\n")  # Page separator

    return "".join(parts).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert an uploaded PDF into Markdown-like text plus a metadata dict.

    The PDF's embedded text is used when it has at least 100 non-blank
    characters; otherwise the file is run through OCR first and the OCR'd
    copy is used instead.

    Args:
        pdf_file: Filesystem path of the PDF to convert. A falsy value
            (e.g. ``None`` when nothing was uploaded) yields empty outputs.

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently always
        an empty dict.
    """
    metadata = {}  # Placeholder: add document metadata here if needed.

    # Nothing uploaded: return empty outputs instead of failing inside OCR.
    if not pdf_file:
        return "", metadata

    # Context managers make sure every opened document is closed again.
    with fitz.open(pdf_file) as original_doc:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        # Apply OCR only when the PDF carries (almost) no text of its own.
        if len(plain_text.strip()) >= 100:
            return extract_text_markdown(original_doc), metadata

    # The OCR output lives in a temporary directory that is removed as soon
    # as the text has been extracted, so no temp files are left behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ocr_path = os.path.join(tmp_dir, "ocr.pdf")
        ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
        with fitz.open(ocr_path) as ocr_doc:
            return extract_text_markdown(ocr_doc), metadata
# Gradio UI: one PDF upload (passed to ``convert`` as a file path) mapped to
# a text box with the generated Markdown and a JSON view of the metadata.
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
markdown_box = gr.Text(label="Markdown estructurado")
metadata_view = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[markdown_box, metadata_view],
)
demo.launch()
|