Spaces:
Running
Running
File size: 2,696 Bytes
145d936 d4b4544 7b1bb08 dd29269 75d0452 4337f3a e62d9f5 3920f3b c20f519 c1d7645 e62d9f5 d4b4544 145d936 d4b4544 e62d9f5 beb65ba d4b4544 479e852 75d0452 479e852 75d0452 beb65ba 75d0452 891d450 beb65ba 7064c41 479e852 3e3d3c7 75d0452 7b1bb08 3e3d3c7 75d0452 3e3d3c7 75d0452 3e3d3c7 b3fecd4 d4b4544 e62d9f5 f79c813 e62d9f5 f79c813 75d0452 f79c813 75d0452 f79c813 4337f3a f79c813 145d936 dd29269 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import spaces
import gradio as gr
import fitz # PyMuPDF
import tempfile
import os
from PIL import Image
import pytesseract
def clean_ocr_text(text):
    """Return *text* with every line trimmed and blank lines removed.

    Args:
        text: Raw text, e.g. the string produced by an OCR pass.

    Returns:
        The non-empty lines of *text*, each stripped of leading and trailing
        whitespace, joined with ``"\\n"``. An input made only of whitespace
        yields an empty string.
    """
    # After strip() a whitespace-only line is simply empty, so a truthiness
    # test is all that is needed to drop blank lines.
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
def extract_text_markdown(doc, *, header_min_size=14, paragraph_gap=10):
    """Render the pages of *doc* as lightly structured Markdown.

    For each page, text lines and image blocks are collected together with
    their vertical position (``bbox[1]``) and sorted by it. A line whose
    largest span font size is at least ``header_min_size`` is emitted as a
    ``###`` heading; an extra blank line is inserted when two consecutive
    elements are more than ``paragraph_gap`` apart vertically. Every page is
    terminated by a ``---`` rule.

    Args:
        doc: Iterable of page objects whose ``get_text("dict")`` returns a
            mapping with a ``"blocks"`` list (the layout PyMuPDF pages use).
        header_min_size: Minimum font size that turns a line into a heading.
        paragraph_gap: Vertical distance above which a blank line is
            inserted between consecutive elements.

    Returns:
        The Markdown for all pages, with leading and trailing whitespace
        stripped (so a non-empty result ends with ``---``).
    """
    parts = []
    for page in doc:
        elements = []
        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    spans = line["spans"]
                    line_text = " ".join(span["text"] for span in spans).strip()
                    # Skip blank lines before calling max(): a line without
                    # any spans would otherwise raise ValueError.
                    if not line_text:
                        continue
                    font_size = max(span.get("size", 10) for span in spans)
                    elements.append((line["bbox"][1], line_text, font_size))
            elif block["type"] == 1:  # image block
                # NOTE(review): the image placeholder text is empty here, so
                # an image only contributes an empty line to the output.
                # Presumably a numbered placeholder was intended - confirm.
                elements.append((block["bbox"][1], "", 10))

        # list.sort is stable, so elements sharing a position keep the order
        # in which they were collected.
        elements.sort(key=lambda element: element[0])

        previous_y = None
        for y, text, font_size in elements:
            if previous_y is not None and abs(y - previous_y) > paragraph_gap:
                parts.append("\n")
            if font_size >= header_min_size:
                parts.append(f"\n### {text}\n")
            else:
                parts.append(text + "\n")
            previous_y = y

        parts.append("\n---\n\n")
    return "".join(parts).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown, falling back to OCR for image-only pages.

    A page with more than 30 characters of extracted text is rendered with
    ``extract_text_markdown``; any other page is rasterized at 300 dpi and
    read with Tesseract (language code ``spa``), then tidied with
    ``clean_ocr_text``. Each page is followed by a single ``---`` rule.

    Args:
        pdf_file: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        A ``(markdown, metadata)`` tuple. ``markdown`` is the text of all
        pages with surrounding whitespace stripped; ``metadata`` is always
        an empty dict.
    """
    page_rule = "---"
    sections = []
    # The context manager closes the document even if a page fails midway.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text = page.get_text("text").strip()
            if len(text) > 30:
                # Page has a usable text layer.
                page_md = extract_text_markdown([page])
                # extract_text_markdown already terminates the page with a
                # rule; drop it so the separator added below is not doubled.
                if page_md.endswith(page_rule):
                    page_md = page_md[: -len(page_rule)].rstrip()
            else:
                # Little or no embedded text: rasterize the page and OCR it.
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                ocr_text = pytesseract.image_to_string(img, lang="spa")
                page_md = clean_ocr_text(ocr_text)
            sections.append(f"{page_md}\n\n{page_rule}\n\n")
    return "".join(sections).strip(), {}
# Web UI: one PDF upload in; the generated Markdown and a JSON metadata
# panel out. Launching starts the Gradio server for this app.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Sube tu PDF", type="filepath")],
    outputs=[
        gr.Text(label="Markdown estructurado"),
        gr.JSON(label="Metadata"),
    ],
)
demo.launch()
|