File size: 5,270 Bytes
145d936
 
d4b4544
75d0452
 
584dc82
dd8d861
 
75d0452
 
 
 
 
 
 
 
 
4337f3a
8ff21c1
0a2dbbc
c20f519
0a2dbbc
 
3c39da9
0a2dbbc
 
 
 
 
3c39da9
0a2dbbc
 
 
 
 
 
 
 
 
 
 
dd8d861
0a2dbbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1d7645
0a2dbbc
e62d9f5
 
 
 
f79c813
 
9e2e286
dd8d861
f79c813
 
 
 
e62d9f5
f79c813
8ff21c1
f79c813
0a2dbbc
f79c813
 
6e5a37b
 
 
9e2e286
 
6e5a37b
 
 
4f878aa
dd8d861
 
6e5a37b
 
 
 
f79c813
dd8d861
 
 
 
 
 
3c39da9
 
dd8d861
 
3c39da9
 
dd8d861
 
 
 
 
 
 
 
0a2dbbc
4337f3a
4f878aa
 
 
 
86d7cbb
145d936
2f1a912
fe33891
 
 
 
 
7ad1608
2f1a912
86d7cbb
fe33891
 
 
 
 
 
7ad1608
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import spaces
import gradio as gr
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import os
import numpy as np
import cv2

def clean_ocr_text(text):
    """Return *text* with every line trimmed and empty lines removed.

    Args:
        text: Raw multi-line string (here, the output of the OCR step).

    Returns:
        The surviving lines joined with single newline characters; an empty
        string when no line has visible content.
    """
    trimmed = (raw.strip() for raw in text.splitlines())
    # After strip() a whitespace-only line is the empty string, so a plain
    # truthiness test is enough to discard it.
    return "\n".join(piece for piece in trimmed if piece)

def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
    """Render one PDF page as Markdown: text lines first, then embedded images.

    Args:
        doc: Indexable container whose first item is the page to render.
            Only ``doc[0]`` is read, regardless of ``page_index``.
        image_paths: List that receives the path of every image file written
            by this call; mutated in place.
        page_index: Zero-based page number, used for the page heading and for
            the image file names.
        seen_xrefs: Set of image xrefs that were already exported; mutated in
            place so that each image is written only once.

    Returns:
        The Markdown for the page with leading/trailing whitespace stripped.
    """
    page = doc[0]
    after_text = float("inf")  # sort key that places images after all text
    elements = []  # (vertical position, text, font size)

    for block in page.get_text("dict")["blocks"]:
        # Per PyMuPDF's dict layout, type 0 marks a text block; other block
        # types are skipped here.
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            spans = line["spans"]
            line_text = " ".join(span["text"] for span in spans).strip()
            # Skip empty lines before calling max(): a line without spans
            # would otherwise make max() raise ValueError.
            if not line_text:
                continue
            max_font_size = max(span.get("size", 10) for span in spans)
            elements.append((line["bbox"][1], line_text, max_font_size))

    image_counter = 1
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        if xref in seen_xrefs:
            continue
        seen_xrefs.add(xref)
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            ext = base_image["ext"]
            image_path = f"/tmp/imagen_p{page_index + 1}_{img_index + 1}.{ext}"
            with open(image_path, "wb") as f:
                f.write(image_bytes)
        except Exception as e:
            # Best effort: a broken image is reported inline instead of
            # aborting the whole page.
            elements.append((after_text, f"[Error imagen: {e}]", 10))
        else:
            image_paths.append(image_path)
            elements.append(
                (after_text, f"\n\n![imagen_{image_counter}]({image_path})\n", 10)
            )
            image_counter += 1

    # The sort is stable, so images (all sharing the infinite key) keep their
    # discovery order after the text lines.
    elements.sort(key=lambda item: item[0])

    parts = [f"\n## Página {page_index + 1}\n\n"]
    previous_y = None
    for y, text, font_size in elements:
        # A vertical gap of more than 10 units starts a new paragraph.
        if previous_y is not None and abs(y - previous_y) > 10:
            parts.append("\n")
        body = text.strip()
        # Lines whose largest span is at least 14 in size become headings.
        if font_size >= 14:
            parts.append(f"\n### {body}\n")
        else:
            parts.append(body + "\n")
        previous_y = y

    parts.append("\n---\n\n")
    return "".join(parts).strip()

@spaces.GPU
def convert(pdf_file):
    """Convert a PDF into Markdown and collect the images produced on the way.

    Pages with more than 30 characters of extractable text are rendered with
    ``extract_text_markdown``. Every other page is rasterised at 300 dpi,
    saved as a JPEG, run through Tesseract OCR, and scanned with OpenCV for
    large regions that are cropped and saved as separate images.

    Args:
        pdf_file: Value handed to ``fitz.open`` (the Gradio file input is
            configured to supply a file path).

    Returns:
        A tuple ``(markdown, image_paths, markdown_path)``: the stripped
        Markdown text, the list of image files written under ``/tmp``, and
        the path of the Markdown file written to ``/tmp/resultado.md``.
    """
    chunks = []
    image_paths = []
    seen_xrefs = set()

    # The context manager closes the document even if a page raises.
    with fitz.open(pdf_file) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text").strip()

            if len(text) > 30:
                chunks.append(
                    extract_text_markdown([page], image_paths, page_num, seen_xrefs)
                    + "\n"
                )
                continue

            # Little or no text layer: treat the page as a scan.
            chunks.append(f"\n## Página {page_num + 1}\n\n")
            pix = page.get_pixmap(dpi=300)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
            img.save(image_path)
            image_paths.append(image_path)

            chunks.append(f"![imagen_pagina_{page_num + 1}]({image_path})\n")

            try:
                ocr_text = pytesseract.image_to_string(img)
            except pytesseract.TesseractError:
                # OCR failure is not fatal; the page image is still emitted.
                ocr_text = ""

            ocr_text = clean_ocr_text(ocr_text)
            if ocr_text.strip():
                chunks.append(ocr_text + "\n")

            try:
                img_cv = np.array(img)
                gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
                # Inverted threshold: pixels that are not near-white become
                # foreground for the contour search.
                _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
                contours, _ = cv2.findContours(
                    thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
                )

                # Only the five largest contours are considered.
                contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

                for i, cnt in enumerate(contours):
                    x, y, w, h = cv2.boundingRect(cnt)
                    # Ignore regions whose bounding box is 5000 px^2 or less.
                    if w * h > 5000:
                        region = img_cv[y:y + h, x:x + w]
                        detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
                        Image.fromarray(region).save(detected_path)
                        image_paths.append(detected_path)
                        chunks.append(f"\n\n![imagen_detectada]({detected_path})\n")
            except Exception as e:
                # Best effort: report the failure inline and keep going.
                chunks.append(f"\n\n[Error al detectar imágenes embebidas: {e}]\n")

            chunks.append("\n---\n\n")

    markdown_output = "".join(chunks)

    markdown_path = "/tmp/resultado.md"
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(markdown_output)

    return markdown_output.strip(), image_paths, markdown_path

# Gradio-compatible interface: one PDF upload and a button that runs
# `convert`, whose three return values feed the three output components.
with gr.Blocks() as demo:
    # Input row: the uploaded file is passed to `convert` as a path.
    with gr.Row():
        pdf_input = gr.File(label="Sube tu PDF", type="filepath")
        submit_btn = gr.Button("Procesar PDF")

    # Refresh button removed

    # Outputs, in the same order as the tuple returned by `convert`.
    markdown_output = gr.Textbox(label="Markdown estructurado", lines=25, interactive=True)
    gallery_output = gr.Gallery(label="Imágenes extraídas", type="file")
    download_md = gr.File(label="Descargar .md")

    submit_btn.click(fn=convert, inputs=[pdf_input], outputs=[markdown_output, gallery_output, download_md])

# Starts the app as soon as the module is executed.
demo.launch()