Spaces:
Running
Running
File size: 4,157 Bytes
145d936 d4b4544 75d0452 584dc82 75d0452 4337f3a 584dc82 3920f3b c20f519 c1d7645 6e5a37b d4b4544 145d936 4f878aa d4b4544 e62d9f5 6e5a37b beb65ba d4b4544 479e852 75d0452 479e852 75d0452 6e5a37b 4f878aa 7064c41 3e3d3c7 75d0452 7b1bb08 3e3d3c7 75d0452 3e3d3c7 b3fecd4 d4b4544 e62d9f5 f79c813 9e2e286 f79c813 e62d9f5 f79c813 584dc82 f79c813 6e5a37b 9e2e286 6e5a37b 4f878aa 6e5a37b f79c813 4337f3a 4f878aa 145d936 dd29269 565985f 1de2023 565985f 4f878aa 565985f dd29269 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import spaces
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import os
def clean_ocr_text(text: str) -> str:
    """Normalize raw OCR output by trimming each line and dropping blank ones.

    Args:
        text: Raw text as returned by the OCR engine.

    Returns:
        The non-empty lines, each stripped of leading and trailing
        whitespace, joined with a single newline.
    """
    # After strip() a line is either empty or contains a non-space character,
    # so plain truthiness is enough to filter out blank lines.
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
def extract_text_markdown(doc, image_paths):
    """Render the text and embedded images of PDF pages as Markdown.

    Text lines are collected with their vertical position and largest font
    size, embedded images are written under /tmp, and all elements of a page
    are emitted sorted by vertical position. Lines whose font size is at
    least 14 become ``###`` headings, and every page ends with a ``---`` rule.

    Args:
        doc: A PyMuPDF document, or any iterable of its pages (for example
            ``[page]``).
        image_paths: List that receives the path of every image written to
            disk; it is mutated in place.

    Returns:
        The Markdown for all pages, stripped of surrounding whitespace.
    """
    markdown_parts = []
    for page_index, page in enumerate(doc):
        # Callers may pass a plain list of pages instead of the document. In
        # that case images must be extracted through the page's owning
        # document, and file names must use the page's own number rather than
        # its index in the list (otherwise every page would write to "_1_").
        if hasattr(doc, "extract_image"):
            owner = doc
        else:
            owner = getattr(page, "parent", None)
        page_number = getattr(page, "number", None)
        if page_number is None:
            page_number = page_index

        elements = []  # tuples of (y position, text, font size)

        # Regular text: one element per non-empty line of each text block.
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:  # type 0 is a text block
                continue
            for line in block["lines"]:
                spans = line["spans"]
                line_text = " ".join(span["text"] for span in spans).strip()
                # Skipping empty lines first also guarantees that max() below
                # never receives an empty sequence (a line without spans).
                if not line_text:
                    continue
                largest_font = max(span.get("size", 10) for span in spans)
                elements.append((line["bbox"][1], line_text, largest_font))

        # Embedded images, looked up by xref and saved to disk.
        for img_index, img in enumerate(page.get_images(full=True)):
            # Estimated position, only used to order images among the text.
            y_pos = 50 + img_index * 10
            try:
                base_image = owner.extract_image(img[0])
                image_bytes = base_image["image"]
                ext = base_image["ext"]
                image_path = f"/tmp/imagen_embebida_{page_number + 1}_{img_index + 1}.{ext}"
                with open(image_path, "wb") as f:
                    f.write(image_bytes)
            except Exception as e:
                # Best effort: a broken image is reported inline instead of
                # aborting the whole conversion.
                elements.append((y_pos, f"[Error imagen: {e}]", 10))
            else:
                image_paths.append(image_path)
                # NOTE(review): the placeholder text is empty, so the saved
                # image is not referenced in the Markdown itself — confirm
                # whether an image link was intended here.
                elements.append((y_pos, "", 10))

        # Emit the page top to bottom; the sort is stable, so elements that
        # share a position keep their insertion order.
        elements.sort(key=lambda element: element[0])
        previous_y = None
        for y, text, font_size in elements:
            # A vertical jump of more than 10 units starts a new paragraph.
            if previous_y is not None and abs(y - previous_y) > 10:
                markdown_parts.append("\n")
            if font_size >= 14:
                markdown_parts.append(f"\n### {text.strip()}\n")
            else:
                markdown_parts.append(text.strip() + "\n")
            previous_y = y
        markdown_parts.append("\n---\n\n")
    return "".join(markdown_parts).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown, using OCR for pages with little or no text.

    Pages whose extracted text is longer than 30 characters go through
    ``extract_text_markdown``; every other page is rendered at 300 dpi, saved
    as a JPEG under /tmp and passed to Tesseract.

    Args:
        pdf_file: Path of the PDF to open.

    Returns:
        A 4-tuple ``(markdown, metadata, image_paths, markdown_path)``: the
        stripped Markdown text, an empty dict, the list of image files
        written under /tmp, and the path of the saved ``.md`` file.
    """
    markdown_parts = []
    image_paths = []
    # Open the document as a context manager so it is closed even when
    # rendering or OCR raises part-way through.
    with fitz.open(pdf_file) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text").strip()
            if len(text) > 30:
                markdown_parts.append(
                    extract_text_markdown([page], image_paths) + "\n"
                )
                continue

            # Too little text on the page: rasterize it and run OCR instead.
            pix = page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
            img.save(image_path)
            image_paths.append(image_path)
            markdown_parts.append("\n")
            try:
                ocr_text = pytesseract.image_to_string(img, lang="spa")
            except pytesseract.TesseractError:
                # Retry with Tesseract's default language if the Spanish run
                # fails.
                ocr_text = pytesseract.image_to_string(img)
            ocr_text = clean_ocr_text(ocr_text)
            if ocr_text.strip():
                markdown_parts.append(ocr_text + "\n")
            markdown_parts.append("\n---\n\n")

    markdown_output = "".join(markdown_parts)
    # Save the result as a .md file for download.
    markdown_path = "/tmp/resultado.md"
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(markdown_output)
    return markdown_output.strip(), {}, image_paths, markdown_path
# Gradio UI: one PDF upload mapped onto the four values returned by convert().
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
result_outputs = [
    gr.Markdown(label="Markdown estructurado"),
    gr.JSON(label="Metadata"),
    # NOTE(review): confirm "file" is an accepted Gallery `type` value for
    # the installed Gradio version.
    gr.Gallery(label="Imágenes extraídas", type="file"),
    gr.File(label="Descargar .md"),
]
demo = gr.Interface(fn=convert, inputs=[pdf_input], outputs=result_outputs)
demo.launch()
|