Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -21,12 +21,13 @@ def extract_text_markdown(doc):
|
|
21 |
markdown_output = ""
|
22 |
image_counter = 1
|
23 |
|
24 |
-
for page in doc:
|
25 |
blocks = page.get_text("dict")["blocks"]
|
26 |
elements = []
|
27 |
|
28 |
for b in blocks:
|
29 |
y = b["bbox"][1]
|
|
|
30 |
if b["type"] == 0: # Texto
|
31 |
for line in b["lines"]:
|
32 |
line_y = line["bbox"][1]
|
@@ -34,14 +35,19 @@ def extract_text_markdown(doc):
|
|
34 |
max_font_size = max([span.get("size", 10) for span in line["spans"]])
|
35 |
if line_text:
|
36 |
elements.append((line_y, line_text, max_font_size))
|
|
|
37 |
elif b["type"] == 1: # Imagen
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
elements.sort(key=lambda x: x[0])
|
42 |
-
|
43 |
previous_y = None
|
44 |
-
previous_font = None
|
45 |
|
46 |
for y, text, font_size in elements:
|
47 |
is_header = font_size >= 14
|
@@ -55,7 +61,6 @@ def extract_text_markdown(doc):
|
|
55 |
markdown_output += text.strip() + "\n"
|
56 |
|
57 |
previous_y = y
|
58 |
-
previous_font = font_size
|
59 |
|
60 |
markdown_output += "\n---\n\n"
|
61 |
|
@@ -75,11 +80,23 @@ def convert(pdf_file):
|
|
75 |
# Página con texto normal
|
76 |
markdown_output += extract_text_markdown([page]) + "\n"
|
77 |
else:
|
78 |
-
# Página
|
79 |
pix = page.get_pixmap(dpi=300)
|
80 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
markdown_output += "\n---\n\n"
|
85 |
|
@@ -90,3 +107,4 @@ gr.Interface(
|
|
90 |
inputs=[gr.File(label="Sube tu PDF", type="filepath")],
|
91 |
outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
|
92 |
).launch()
|
|
|
|
21 |
markdown_output = ""
|
22 |
image_counter = 1
|
23 |
|
24 |
+
for page_num, page in enumerate(doc):
|
25 |
blocks = page.get_text("dict")["blocks"]
|
26 |
elements = []
|
27 |
|
28 |
for b in blocks:
|
29 |
y = b["bbox"][1]
|
30 |
+
|
31 |
if b["type"] == 0: # Texto
|
32 |
for line in b["lines"]:
|
33 |
line_y = line["bbox"][1]
|
|
|
35 |
max_font_size = max([span.get("size", 10) for span in line["spans"]])
|
36 |
if line_text:
|
37 |
elements.append((line_y, line_text, max_font_size))
|
38 |
+
|
39 |
elif b["type"] == 1: # Imagen
|
40 |
+
try:
|
41 |
+
image = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=b["bbox"])
|
42 |
+
image_path = f"/tmp/imagen_embebida_{page_num + 1}_{image_counter}.jpg"
|
43 |
+
image.save(image_path)
|
44 |
+
elements.append((y, f"", 10))
|
45 |
+
image_counter += 1
|
46 |
+
except Exception as e:
|
47 |
+
elements.append((y, f"[Error al procesar imagen: {e}]", 10))
|
48 |
|
49 |
elements.sort(key=lambda x: x[0])
|
|
|
50 |
previous_y = None
|
|
|
51 |
|
52 |
for y, text, font_size in elements:
|
53 |
is_header = font_size >= 14
|
|
|
61 |
markdown_output += text.strip() + "\n"
|
62 |
|
63 |
previous_y = y
|
|
|
64 |
|
65 |
markdown_output += "\n---\n\n"
|
66 |
|
|
|
80 |
# Página con texto normal
|
81 |
markdown_output += extract_text_markdown([page]) + "\n"
|
82 |
else:
|
83 |
+
# Página vacía o imagen: hacer OCR
|
84 |
pix = page.get_pixmap(dpi=300)
|
85 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
86 |
+
|
87 |
+
# Guardar imagen escaneada completa
|
88 |
+
image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
|
89 |
+
img.save(image_path)
|
90 |
+
markdown_output += f"\n"
|
91 |
+
|
92 |
+
try:
|
93 |
+
ocr_text = pytesseract.image_to_string(img, lang="spa")
|
94 |
+
except pytesseract.TesseractError:
|
95 |
+
ocr_text = pytesseract.image_to_string(img) # fallback sin lang
|
96 |
+
|
97 |
+
ocr_text = clean_ocr_text(ocr_text)
|
98 |
+
if ocr_text.strip():
|
99 |
+
markdown_output += ocr_text + "\n"
|
100 |
|
101 |
markdown_output += "\n---\n\n"
|
102 |
|
|
|
107 |
inputs=[gr.File(label="Sube tu PDF", type="filepath")],
|
108 |
outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
|
109 |
).launch()
|
110 |
+
|