Spaces:
Running
Running
File size: 5,270 Bytes
145d936 d4b4544 75d0452 584dc82 dd8d861 75d0452 4337f3a 8ff21c1 0a2dbbc c20f519 0a2dbbc 3c39da9 0a2dbbc 3c39da9 0a2dbbc dd8d861 0a2dbbc c1d7645 0a2dbbc e62d9f5 f79c813 9e2e286 dd8d861 f79c813 e62d9f5 f79c813 8ff21c1 f79c813 0a2dbbc f79c813 6e5a37b 9e2e286 6e5a37b 4f878aa dd8d861 6e5a37b f79c813 dd8d861 3c39da9 dd8d861 3c39da9 dd8d861 0a2dbbc 4337f3a 4f878aa 86d7cbb 145d936 2f1a912 fe33891 7ad1608 2f1a912 86d7cbb fe33891 7ad1608 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import spaces
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
import pytesseract
import os
import numpy as np
import cv2
def clean_ocr_text(text):
    """Tidy raw OCR output.

    Every line of ``text`` is stripped of surrounding whitespace, lines that
    end up empty are dropped, and the survivors are joined with a single
    newline character.
    """
    trimmed = (raw.strip() for raw in text.splitlines())
    return "\n".join(piece for piece in trimmed if piece)
def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
    """Render one text-based PDF page as a Markdown fragment.

    Args:
        doc: Indexable container whose first item (``doc[0]``) is the page to
            render. Only index 0 is read; ``page_index`` is used solely for
            the heading and for image file names.
        image_paths: List extended in place with the path of every image
            written to disk.
        page_index: Zero-based page number.
        seen_xrefs: Set of image xrefs already handled. It is updated in
            place so an image referenced more than once is exported once.

    Returns:
        The Markdown for the page (heading, text lines, trailing ``---``
        separator) with leading and trailing whitespace stripped.
    """
    page = doc[0]

    # Each element is (vertical sort key, text, font size).
    elements = []

    for block in page.get_text("dict")["blocks"]:
        # Only blocks of type 0 are read for "lines"; other block types are
        # skipped.
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            spans = line["spans"]
            line_text = " ".join(span["text"] for span in spans).strip()
            # Check the text first: a line without spans has no text, and
            # calling max() on its empty span list would raise ValueError.
            if not line_text:
                continue
            max_font_size = max(span.get("size", 10) for span in spans)
            elements.append((line["bbox"][1], line_text, max_font_size))

    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        if xref in seen_xrefs:
            continue
        # Marked as seen before extraction, so a failing image is not retried.
        seen_xrefs.add(xref)
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            ext = base_image["ext"]
            image_path = f"/tmp/imagen_p{page_index + 1}_{img_index + 1}.{ext}"
            with open(image_path, "wb") as f:
                f.write(image_bytes)
            image_paths.append(image_path)
            # An infinite sort key places images after every text line; the
            # stable sort below keeps them in extraction order.
            # NOTE(review): the placeholder is whitespace only, so an image
            # contributes just a blank line to the output. Presumably a
            # Markdown image reference was intended here - confirm.
            elements.append((float("inf"), "\n\n\n", 10))
        except Exception as e:
            # Best effort: one broken image must not abort the whole page.
            elements.append((float("inf"), f"[Error imagen: {e}]", 10))

    elements.sort(key=lambda item: item[0])

    parts = [f"\n## Página {page_index + 1}\n\n"]
    previous_y = None
    for y, text, font_size in elements:
        # A vertical jump of more than 10 units starts a new paragraph.
        if previous_y is not None and abs(y - previous_y) > 10:
            parts.append("\n")
        # Lines whose largest font size is at least 14 become headings.
        if font_size >= 14:
            parts.append(f"\n### {text.strip()}\n")
        else:
            parts.append(text.strip() + "\n")
        previous_y = y
    parts.append("\n---\n\n")
    return "".join(parts).strip()
def _render_scanned_page(page, page_num, image_paths):
    """Rasterize a page, OCR it and return its Markdown fragment.

    The rendered page image and any large regions detected in it are saved
    under ``/tmp`` and their paths appended to ``image_paths``.
    """
    parts = [f"\n## Página {page_num + 1}\n\n"]

    pix = page.get_pixmap(dpi=300)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
    img.save(image_path)
    image_paths.append(image_path)
    # NOTE(review): only a newline is emitted for the page image; presumably
    # a Markdown image reference was intended - confirm.
    parts.append("\n")

    try:
        ocr_text = pytesseract.image_to_string(img)
    except pytesseract.TesseractError:
        # OCR failure is tolerated: the page simply gets no text.
        ocr_text = ""
    ocr_text = clean_ocr_text(ocr_text)
    if ocr_text.strip():
        parts.append(ocr_text + "\n")

    try:
        img_cv = np.array(img)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
        # Inverted binary threshold: pixels darker than 200 become foreground.
        _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
        contours, _ = cv2.findContours(
            thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        # Keep only the five contours with the largest area.
        contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
        for i, cnt in enumerate(contours):
            x, y, w, h = cv2.boundingRect(cnt)
            # Bounding boxes of 5000 px^2 or less are ignored.
            if w * h > 5000:
                region = img_cv[y:y + h, x:x + w]
                detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
                Image.fromarray(region).save(detected_path)
                image_paths.append(detected_path)
                parts.append("\n\n\n")
    except Exception as e:
        # Best effort: report the failure in the output instead of aborting.
        parts.append(f"\n\n[Error al detectar imágenes embebidas: {e}]\n")

    parts.append("\n---\n\n")
    return "".join(parts)


@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown plus a list of extracted images.

    Pages with more than 30 characters of extractable text are rendered by
    ``extract_text_markdown``; all other pages are rasterized and OCR'd.

    Args:
        pdf_file: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        A tuple ``(markdown, image_paths, markdown_path)``: the stripped
        Markdown text, the paths of every image written to ``/tmp``, and the
        path of the ``/tmp/resultado.md`` file holding the Markdown.
    """
    parts = []
    image_paths = []
    seen_xrefs = set()

    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_file) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text").strip()
            if len(text) > 30:
                # extract_text_markdown reads doc[0], hence the one-item list.
                parts.append(
                    extract_text_markdown([page], image_paths, page_num, seen_xrefs)
                    + "\n"
                )
            else:
                parts.append(_render_scanned_page(page, page_num, image_paths))

    markdown_output = "".join(parts)

    markdown_path = "/tmp/resultado.md"
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(markdown_output)

    return markdown_output.strip(), image_paths, markdown_path
# Gradio-compatible interface: upload a PDF, press the button, and receive
# the Markdown text, a gallery of extracted images and a downloadable .md.
with gr.Blocks() as demo:
    with gr.Row():
        pdf_input = gr.File(label="Sube tu PDF", type="filepath")
        submit_btn = gr.Button("Procesar PDF")
    # Refresh button removed

    markdown_output = gr.Textbox(
        label="Markdown estructurado",
        lines=25,
        interactive=True,
    )
    gallery_output = gr.Gallery(
        label="Imágenes extraídas",
        type="file",
    )
    download_md = gr.File(label="Descargar .md")

    # The three outputs map one-to-one onto convert()'s returned tuple.
    submit_btn.click(
        fn=convert,
        inputs=[pdf_input],
        outputs=[markdown_output, gallery_output, download_md],
    )

demo.launch()
|