Spaces:
Running
Running
File size: 4,187 Bytes
145d936 d4b4544 75d0452 584dc82 75d0452 4337f3a 8ff21c1 0a2dbbc c20f519 0a2dbbc 8ff21c1 0a2dbbc 8ff21c1 0a2dbbc 8ff21c1 0a2dbbc c1d7645 0a2dbbc e62d9f5 f79c813 9e2e286 8ff21c1 f79c813 e62d9f5 f79c813 8ff21c1 f79c813 0a2dbbc f79c813 6e5a37b 9e2e286 6e5a37b 4f878aa 6e5a37b f79c813 0a2dbbc 4337f3a 4f878aa 145d936 dd29269 565985f 1de2023 565985f 4f878aa 565985f dd29269 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import spaces
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import os
def clean_ocr_text(text: str) -> str:
    """Normalize raw OCR output by trimming lines and dropping blank ones.

    Args:
        text: Raw text as returned by the OCR engine.

    Returns:
        The non-empty lines of ``text``, each stripped of leading and
        trailing whitespace, joined with a single newline. An empty string
        when ``text`` contains no visible characters.
    """
    # A stripped line is falsy exactly when the original line was empty or
    # whitespace-only, so one truthiness check covers both cases.
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
def extract_text_markdown(doc, image_paths, page_index, seen_xrefs, *,
                          header_font_size=14, paragraph_gap=10):
    """Render one text-based PDF page as Markdown and extract its images.

    Args:
        doc: Indexable container whose element 0 is the page to process
            (the page must offer ``get_text``, ``get_images`` and a
            ``parent`` exposing ``extract_image``).
        image_paths: List that receives the path of every image file
            written by this call (mutated in place).
        page_index: Zero-based page number; used for the page heading and
            for the image file names.
        seen_xrefs: Set of image xrefs already extracted; xrefs found here
            are skipped, new ones are added (mutated in place).
        header_font_size: Lines whose largest span font size is at least
            this value are emitted as ``###`` headings.
        paragraph_gap: A blank line is inserted when two consecutive
            elements are more than this far apart vertically.

    Returns:
        The Markdown for the page, stripped of leading and trailing
        whitespace. It starts with the page heading and ends with ``---``.
    """
    default_font_size = 10
    page = doc[0]  # only element 0 of ``doc`` is ever read
    elements = []  # tuples of (vertical sort key, text, font size)

    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # only type-0 (text) blocks carry lines
            continue
        for line in block["lines"]:
            spans = line["spans"]
            line_text = " ".join(span["text"] for span in spans).strip()
            if not line_text:
                # Skip before computing the font size: a line without
                # spans would otherwise make max() raise ValueError.
                continue
            max_font_size = max(
                span.get("size", default_font_size) for span in spans
            )
            elements.append((line["bbox"][1], line_text, max_font_size))

    # Extract each image once per document, keyed by xref.
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        if xref in seen_xrefs:
            continue  # already extracted
        seen_xrefs.add(xref)
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            ext = base_image["ext"]
            image_path = f"/tmp/imagen_p{page_index + 1}_{img_index + 1}.{ext}"
            with open(image_path, "wb") as f:
                f.write(image_bytes)
            image_paths.append(image_path)
            # Infinity sorts images after all text. The sort below is
            # stable, so images keep their extraction order.
            # NOTE(review): this placeholder is whitespace-only, so it only
            # contributes a blank line; confirm whether image markup was
            # meant to be emitted here.
            elements.append((float("inf"), "\n\n\n", default_font_size))
        except Exception as e:
            # Best effort: report the failure inline instead of aborting
            # the whole page.
            elements.append(
                (float("inf"), f"[Error imagen: {e}]", default_font_size)
            )

    # Order by vertical position.
    elements.sort(key=lambda element: element[0])

    parts = [f"\n## Página {page_index + 1}\n\n"]
    previous_y = None
    for y, text, font_size in elements:
        # Between two infinity-keyed entries the difference is NaN, which
        # compares False, so consecutive images get no extra blank line.
        if previous_y is not None and abs(y - previous_y) > paragraph_gap:
            parts.append("\n")
        if font_size >= header_font_size:
            parts.append(f"\n### {text.strip()}\n")
        else:
            parts.append(text.strip() + "\n")
        previous_y = y
    parts.append("\n---\n\n")
    return "".join(parts).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown, extracting images and OCR-ing scans.

    Pages whose embedded text is longer than 30 characters are rendered
    from that text; every other page is rasterized at 300 dpi, saved as a
    JPEG under ``/tmp`` and passed through Tesseract OCR (Spanish first,
    falling back to the default language on ``TesseractError``).

    Args:
        pdf_file: Value handed to ``fitz.open`` (presumably the uploaded
            file's path; confirm against the UI wiring).

    Returns:
        A 4-tuple ``(markdown, metadata, image_paths, markdown_path)``:
        the stripped Markdown text, an empty dict, the list of image files
        written during the conversion, and the path of the saved ``.md``
        file.
    """
    parts = []
    image_paths = []
    seen_xrefs = set()  # shared across pages so each image is saved once

    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_file) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text").strip()
            if len(text) > 30:
                page_markdown = extract_text_markdown(
                    [page], image_paths, page_num, seen_xrefs
                )
                parts.append(page_markdown + "\n")
                continue

            # Little or no embedded text: treat the page as a scan.
            parts.append(f"\n## Página {page_num + 1}\n\n")
            pix = page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
            img.save(image_path)
            image_paths.append(image_path)
            parts.append("\n")
            try:
                ocr_text = pytesseract.image_to_string(img, lang="spa")
            except pytesseract.TesseractError:
                # Retry with Tesseract's default language.
                ocr_text = pytesseract.image_to_string(img)
            ocr_text = clean_ocr_text(ocr_text)
            if ocr_text.strip():
                parts.append(ocr_text + "\n")
            # NOTE(review): separator placed in the OCR branch because the
            # text branch already ends its own output with "---"; confirm.
            parts.append("\n---\n\n")

    markdown_output = "".join(parts)

    # Save the (unstripped) Markdown as a downloadable .md file.
    markdown_path = "/tmp/resultado.md"
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(markdown_output)
    return markdown_output.strip(), {}, image_paths, markdown_path
# Build and start the web UI: one uploaded file in; four outputs, listed in
# the same order as the values returned by convert.
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
result_outputs = [
    gr.Markdown(label="Markdown estructurado"),
    gr.JSON(label="Metadata"),
    gr.Gallery(label="Imágenes extraídas", type="file"),
    gr.File(label="Descargar .md"),
]
gr.Interface(fn=convert, inputs=[pdf_input], outputs=result_outputs).launch()
|