Spaces:
Running
Running
File size: 5,094 Bytes
145d936 d4b4544 75d0452 584dc82 dd8d861 75d0452 4337f3a 8ff21c1 0a2dbbc c20f519 0a2dbbc 8ff21c1 0a2dbbc dd8d861 0a2dbbc dd8d861 0a2dbbc c1d7645 0a2dbbc e62d9f5 f79c813 9e2e286 dd8d861 f79c813 e62d9f5 f79c813 8ff21c1 f79c813 0a2dbbc f79c813 6e5a37b 9e2e286 6e5a37b dd8d861 6e5a37b 4f878aa dd8d861 6e5a37b f79c813 dd8d861 0a2dbbc 4337f3a 4f878aa 145d936 dd29269 565985f 1de2023 565985f 4f878aa 565985f dd29269 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import spaces
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import os
import numpy as np
import cv2
def clean_ocr_text(text):
    """Normalize raw OCR output by trimming each line and dropping blank ones.

    Args:
        text: Raw text as returned by the OCR engine. ``None`` or an empty
            string is accepted and yields an empty result.

    Returns:
        The non-empty lines, stripped of surrounding whitespace and joined
        with single newlines; an empty string if nothing remains.
    """
    if not text:
        return ""
    # After strip(), a whitespace-only line is the empty string, so a plain
    # truthiness test is enough to discard it.
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
    """Render one text-based PDF page as Markdown and export its images.

    Text lines are emitted top-to-bottom by their vertical position. A line
    whose largest span font size is 14 or more becomes a ``###`` heading, and
    a vertical gap of more than 10 units between consecutive lines inserts a
    blank line. Embedded images whose xref has not been seen before are
    written to ``/tmp`` and referenced after the text.

    Args:
        doc: Indexable container whose first item is the page to process.
            The page must provide ``get_text("dict")``,
            ``get_images(full=True)`` and a ``parent`` exposing
            ``extract_image(xref)``.
        image_paths: List that receives the path of every image written.
            Mutated in place.
        page_index: Zero-based page number, used for the page heading and
            for image file names.
        seen_xrefs: Set of image xrefs already exported. Mutated in place so
            an image reused across calls is only saved once.

    Returns:
        The Markdown for the page, stripped of leading/trailing whitespace
        and ending with a ``---`` separator.
    """
    page = doc[0]  # only one page is handled per call

    # Collect (y, text, largest font size) for every non-empty text line.
    text_lines = []
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # 0 marks a text block
            continue
        for line in block["lines"]:
            spans = line["spans"]
            line_text = " ".join(span["text"] for span in spans).strip()
            if not line_text:
                continue
            # default= keeps a line without spans from raising ValueError.
            largest_size = max(
                (span.get("size", 10) for span in spans), default=10
            )
            text_lines.append((line["bbox"][1], line_text, largest_size))
    # sort() is stable, so lines sharing a y keep their extraction order.
    text_lines.sort(key=lambda item: item[0])

    # Export images that have not been saved yet (deduplicated by xref).
    # They are kept in a separate list and emitted after the text, in the
    # order the page reports them.
    image_entries = []
    image_counter = 1
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        if xref in seen_xrefs:
            continue
        seen_xrefs.add(xref)
        try:
            base_image = page.parent.extract_image(xref)
            ext = base_image["ext"]
            image_path = f"/tmp/imagen_p{page_index + 1}_{img_index + 1}.{ext}"
            with open(image_path, "wb") as f:
                f.write(base_image["image"])
        except Exception as e:
            # Best effort: a single broken image must not abort the page.
            image_entries.append(f"[Error imagen: {e}]")
            continue
        image_paths.append(image_path)
        # Reference the saved file so the image actually shows up in the
        # Markdown output.
        image_entries.append(f"![Imagen {image_counter}]({image_path})")
        image_counter += 1

    parts = [f"\n## Página {page_index + 1}\n\n"]
    previous_y = None
    for y, text, font_size in text_lines:
        if previous_y is not None and abs(y - previous_y) > 10:
            parts.append("\n")  # large vertical gap -> paragraph break
        if font_size >= 14:
            parts.append(f"\n### {text}\n")
        else:
            parts.append(text + "\n")
        previous_y = y
    # Each image (or image error) goes in its own paragraph after the text.
    parts.extend(f"\n{entry}\n" for entry in image_entries)
    parts.append("\n---\n\n")
    return "".join(parts).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown, exporting page images along the way.

    Pages with more than 30 characters of extractable text are rendered via
    ``extract_text_markdown``. Any other page is rasterized at 300 dpi, saved
    as a JPEG, run through Tesseract OCR, and scanned for large dark regions
    (bounding boxes wider and taller than 50 px) that are saved as separate
    images.

    Args:
        pdf_file: Path of the PDF to convert, as passed to ``fitz.open``.

    Returns:
        A 4-tuple ``(markdown, metadata, image_paths, markdown_path)``:
        the stripped Markdown text, an empty metadata dict, the list of every
        image file written, and the path of the saved ``.md`` file.
    """
    markdown_parts = []
    image_paths = []
    seen_xrefs = set()

    # The context manager closes the document even if a page fails midway.
    with fitz.open(pdf_file) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text").strip()
            if len(text) > 30:
                # extract_text_markdown reads the page from doc[0], hence the
                # one-element list.
                markdown_parts.append(
                    extract_text_markdown(
                        [page], image_paths, page_num, seen_xrefs
                    )
                    + "\n"
                )
                continue

            # Too little extractable text: rasterize the page and OCR it.
            markdown_parts.append(f"\n## Página {page_num + 1}\n\n")
            pix = page.get_pixmap(dpi=300)
            # NOTE(review): assumes the pixmap is 3-channel RGB without
            # alpha (the get_pixmap default) -- confirm if options change.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
            img.save(image_path)
            image_paths.append(image_path)
            # Reference the rendered page so it shows up in the Markdown.
            markdown_parts.append(f"![Página {page_num + 1}]({image_path})\n")

            # OCR; a Tesseract failure degrades to "no text" for this page.
            try:
                ocr_text = pytesseract.image_to_string(img)
            except pytesseract.TesseractError:
                ocr_text = ""
            ocr_text = clean_ocr_text(ocr_text)
            if ocr_text.strip():
                markdown_parts.append(ocr_text + "\n")

            # Detect picture-like regions inside the rendered page by
            # thresholding dark pixels and taking external contours.
            try:
                img_cv = np.array(img)
                gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
                _, thresh = cv2.threshold(
                    gray, 200, 255, cv2.THRESH_BINARY_INV
                )
                contours, _ = cv2.findContours(
                    thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
                )
                for i, cnt in enumerate(contours):
                    x, y, w, h = cv2.boundingRect(cnt)
                    if w > 50 and h > 50:  # skip small marks and glyphs
                        region = img_cv[y:y + h, x:x + w]
                        detected_path = (
                            f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
                        )
                        Image.fromarray(region).save(detected_path)
                        image_paths.append(detected_path)
                        markdown_parts.append(
                            f"\n\n![Imagen detectada {i + 1}]"
                            f"({detected_path})\n\n"
                        )
            except Exception as e:
                # Best effort: report the problem in the output instead of
                # failing the whole conversion.
                markdown_parts.append(
                    f"\n\n[Error al detectar imágenes embebidas: {e}]\n"
                )
            markdown_parts.append("\n---\n\n")

    markdown_output = "".join(markdown_parts)
    markdown_path = "/tmp/resultado.md"
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(markdown_output)
    return markdown_output.strip(), {}, image_paths, markdown_path
# Build the Gradio UI: one PDF upload in, four result widgets out (listed in
# the same order as the values returned by `convert`), then start the server.
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
result_outputs = [
    gr.Markdown(label="Markdown estructurado"),
    gr.JSON(label="Metadata"),
    gr.Gallery(label="Imágenes extraídas", type="file"),
    gr.File(label="Descargar .md"),
]
demo = gr.Interface(fn=convert, inputs=[pdf_input], outputs=result_outputs)
demo.launch()
|