Spaces:
Running
Running
File size: 3,605 Bytes
145d936 d4b4544 75d0452 4337f3a e62d9f5 3920f3b c20f519 c1d7645 6e5a37b d4b4544 145d936 d4b4544 e62d9f5 6e5a37b beb65ba d4b4544 479e852 75d0452 479e852 75d0452 6e5a37b beb65ba 6e5a37b beb65ba 7064c41 3e3d3c7 75d0452 7b1bb08 3e3d3c7 75d0452 3e3d3c7 b3fecd4 d4b4544 e62d9f5 f79c813 9e2e286 f79c813 e62d9f5 f79c813 9e2e286 f79c813 6e5a37b 9e2e286 6e5a37b 9e2e286 6e5a37b f79c813 4337f3a 565985f 145d936 dd29269 565985f 1de2023 565985f 17e68a7 565985f dd29269 17e68a7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import spaces
import gradio as gr
import fitz # PyMuPDF
from PIL import Image
import pytesseract
def clean_ocr_text(text: str) -> str:
    """Normalize raw OCR output by trimming lines and dropping blank ones.

    Args:
        text: Raw text as returned by the OCR engine.

    Returns:
        The non-empty lines of ``text``, each stripped of leading and
        trailing whitespace, joined with a single newline. Returns an empty
        string when ``text`` has no non-blank lines.
    """
    stripped_lines = (line.strip() for line in text.splitlines())
    # A stripped line is either empty or has visible characters, so plain
    # truthiness is enough to discard blank lines.
    return "\n".join(line for line in stripped_lines if line)
def extract_text_markdown(doc):
    """Render the text and embedded images of PDF pages as Markdown.

    Each page's text lines and image blocks are ordered by their top ``y``
    coordinate. Lines whose largest span font size is at least 14 become
    ``###`` headings, a vertical jump of more than 10 units between
    consecutive elements inserts a blank line, and every page is terminated
    by a ``---`` rule.

    Args:
        doc: An iterable of page objects exposing ``get_text("dict")`` and
            ``get_pixmap(...)`` (a PyMuPDF document or a list of its pages).

    Returns:
        The Markdown for all pages as one string, stripped of leading and
        trailing whitespace.
    """
    header_min_size = 14  # font size at or above which a line is a heading
    paragraph_gap = 10  # vertical distance that starts a new paragraph
    default_size = 10  # size assumed for spans without one and for images

    parts = []
    image_counter = 1
    for page_index, page in enumerate(doc):
        # Callers may pass a one-page list, so the enumerate index is not the
        # real page number. Prefer the page's own number so image files from
        # different pages do not overwrite each other.
        page_num = getattr(page, "number", page_index)

        elements = []
        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    spans = line["spans"]
                    line_text = " ".join(span["text"] for span in spans).strip()
                    if not line_text:
                        # Also covers lines without spans, where max() on an
                        # empty sequence would raise ValueError.
                        continue
                    max_font_size = max(
                        span.get("size", default_size) for span in spans
                    )
                    elements.append((line["bbox"][1], line_text, max_font_size))
            elif block["type"] == 1:  # image block
                top = block["bbox"][1]
                try:
                    # Rasterize the image region at 2x zoom and store it.
                    pixmap = page.get_pixmap(
                        matrix=fitz.Matrix(2, 2), clip=block["bbox"]
                    )
                    image_path = (
                        f"/tmp/imagen_embebida_{page_num + 1}_{image_counter}.jpg"
                    )
                    pixmap.save(image_path)
                except Exception as e:
                    # Best effort: one bad image must not abort the page.
                    elements.append(
                        (top, f"[Error al procesar imagen: {e}]", default_size)
                    )
                else:
                    # NOTE(review): the original placeholder was an empty
                    # string, so the saved file was never referenced. Emit a
                    # Markdown image link at the image's position instead.
                    elements.append(
                        (top, f"![Imagen {image_counter}]({image_path})", default_size)
                    )
                    image_counter += 1

        # Stable sort by top coordinate keeps reading order within a page.
        elements.sort(key=lambda element: element[0])

        previous_y = None
        for y, text, font_size in elements:
            if previous_y is not None and abs(y - previous_y) > paragraph_gap:
                parts.append("\n")
            if font_size >= header_min_size:
                parts.append(f"\n### {text.strip()}\n")
            else:
                parts.append(text.strip() + "\n")
            previous_y = y
        parts.append("\n---\n\n")

    return "".join(parts).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown, running OCR on pages without a text layer.

    Pages whose extracted text is longer than 30 characters are rendered
    with ``extract_text_markdown``. Any other page is rasterized at 300 dpi,
    saved under ``/tmp`` and passed through Tesseract (Spanish first, the
    default language as a fallback).

    Args:
        pdf_file: Filesystem path of the PDF, as supplied by the upload
            component.

    Returns:
        A 3-tuple ``(markdown, metadata, image_paths)``: the Markdown string,
        a metadata dict (currently always empty) and the list of page images
        written for the OCR pages.
    """
    min_text_chars = 30  # pages with less text than this are sent to OCR

    parts = []
    image_paths = []
    # The context manager closes the document even if a page fails midway.
    with fitz.open(pdf_file) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text").strip()
            if len(text) > min_text_chars:
                # Page with a regular text layer.
                parts.append(extract_text_markdown([page]) + "\n")
                continue

            # Empty or image-only page: rasterize it and run OCR.
            pix = page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Keep the full-page image so the caller can show it.
            image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
            img.save(image_path)
            image_paths.append(image_path)
            parts.append("\n")

            try:
                ocr_text = pytesseract.image_to_string(img, lang="spa")
            except pytesseract.TesseractError:
                # Fall back to Tesseract's default language, e.g. when the
                # Spanish language data is unavailable.
                ocr_text = pytesseract.image_to_string(img)

            ocr_text = clean_ocr_text(ocr_text)
            if ocr_text.strip():
                parts.append(ocr_text + "\n")
            parts.append("\n---\n\n")

    return "".join(parts).strip(), {}, image_paths
# Assemble the Gradio UI: a single PDF upload feeds convert(), whose three
# results are shown as rendered Markdown, a JSON panel and an image gallery.
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
result_views = [
    gr.Markdown(label="Markdown estructurado"),
    gr.JSON(label="Metadata"),
    gr.Gallery(label="Imágenes extraídas"),
]
demo = gr.Interface(fn=convert, inputs=[pdf_input], outputs=result_views)
demo.launch()
|