Spaces:
Running
Running
File size: 2,382 Bytes
145d936 d4b4544 beb65ba 4337f3a e62d9f5 3920f3b beb65ba c1d7645 e62d9f5 d4b4544 145d936 e62d9f5 dd21256 e62d9f5 dd21256 4337f3a dd21256 e62d9f5 dd21256 d4b4544 e62d9f5 beb65ba e62d9f5 d4b4544 e62d9f5 beb65ba e62d9f5 beb65ba d4b4544 beb65ba d4b4544 e62d9f5 4337f3a e62d9f5 145d936 e62d9f5 dd21256 e62d9f5 145d936 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import spaces
import gradio as gr
import fitz # PyMuPDF
import os
import tempfile
import ocrmypdf
def _save_page_images(doc, page, image_dir, start_index):
    """Save every image referenced by *page* as a PNG file.

    Files are written to *image_dir* as ``imagen_<index>.png``, numbering
    from *start_index*. Returns the next unused index so numbering can
    continue on the following page.
    """
    index = start_index
    for img in page.get_images(full=True):
        xref = img[0]
        pix = fitz.Pixmap(doc, xref)
        # PNG only stores grayscale/RGB (plus optional alpha). Any pixmap
        # with four or more colour components (e.g. CMYK, with or without
        # alpha) must be converted to RGB first, otherwise save() fails.
        if pix.n - pix.alpha >= 4:
            pix = fitz.Pixmap(fitz.csRGB, pix)
        pix.save(os.path.join(image_dir, f"imagen_{index}.png"))
        pix = None  # release the pixmap buffer before loading the next one
        index += 1
    return index


def _block_to_markdown(block):
    """Return the markdown for one ``get_text("dict")`` block ("" if none)."""
    if block["type"] == 0:  # text block
        line_texts = (
            " ".join(span["text"].strip() for span in line["spans"])
            for line in block["lines"]
        )
        return " ".join(line_texts).strip()
    if block["type"] == 1:  # image block
        # NOTE(review): the link target is left empty, as before; the block
        # does not appear to carry the xref needed to match a saved file.
        return "![imagen]()"
    return ""


def extract_text_markdown(doc):
    """Convert a document into a markdown string, page by page.

    For every page, the images listed by ``page.get_images`` are saved as
    PNG files under ``extracted_images/`` (created if missing). Text blocks
    become paragraphs, image blocks become ``![imagen]()`` placeholders, and
    the blocks of each page are emitted top-to-bottom by their bbox y value.

    Args:
        doc: An iterable of pages exposing ``get_text("dict")`` and
            ``get_images(full=True)`` (a PyMuPDF document in this file).

    Returns:
        The paragraphs and placeholders separated by blank lines, with
        surrounding whitespace stripped. Empty string for an empty document.
    """
    image_dir = "extracted_images"
    os.makedirs(image_dir, exist_ok=True)

    image_counter = 0
    chunks = []
    for page in doc:
        # Extract the page images and save them to disk.
        image_counter = _save_page_images(doc, page, image_dir, image_counter)

        elements = []
        for block in page.get_text("dict")["blocks"]:
            content = _block_to_markdown(block)
            if content:
                elements.append((block["bbox"][1], content))

        # Stable sort on the top y coordinate keeps reading order.
        elements.sort(key=lambda item: item[0])
        chunks.extend(content for _, content in elements)

    return "\n\n".join(chunks).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert an uploaded PDF into markdown, running OCR when needed.

    If the PDF's extractable text is shorter than 100 characters it is
    treated as a scanned document: it is OCR'd with ``ocrmypdf`` into a
    temporary file and the markdown is extracted from that result instead.

    Args:
        pdf_file: Filesystem path of the PDF to convert. A missing value
            (``None`` or empty) yields empty results instead of an error.

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently always
        an empty dict.
    """
    metadata = {}  # Placeholder: add document metadata here if needed.
    if not pdf_file:
        return "", metadata

    min_text_chars = 100

    # Context managers make sure every document handle is closed.
    with fitz.open(pdf_file) as original_doc:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        if len(plain_text.strip()) >= min_text_chars:
            return extract_text_markdown(original_doc), metadata

    # Scanned image without a text layer: apply OCR first. The temporary
    # directory (and the OCR output inside it) is removed on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ocr_path = os.path.join(tmp_dir, "ocr.pdf")
        ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
        with fitz.open(ocr_path) as ocr_doc:
            markdown = extract_text_markdown(ocr_doc)

    return markdown, metadata
# Build the web UI: one PDF upload in, raw markdown text and a JSON
# metadata panel out; then start serving it.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[
        gr.Text(label="Markdown crudo"),
        gr.JSON(label="Metadata"),
    ],
)
demo.launch()
|