Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -17,15 +17,14 @@ def clean_ocr_text(text):
|
|
17 |
def extract_text_markdown(doc, image_paths):
|
18 |
markdown_output = ""
|
19 |
image_counter = 1
|
|
|
20 |
|
21 |
for page_num, page in enumerate(doc):
|
22 |
blocks = page.get_text("dict")["blocks"]
|
23 |
elements = []
|
24 |
|
25 |
-
# 🔁 Añadir texto normal (bloques)
|
26 |
for b in blocks:
|
27 |
y = b["bbox"][1]
|
28 |
-
|
29 |
if b["type"] == 0: # Texto
|
30 |
for line in b["lines"]:
|
31 |
line_y = line["bbox"][1]
|
@@ -34,30 +33,29 @@ def extract_text_markdown(doc, image_paths):
|
|
34 |
if line_text:
|
35 |
elements.append((line_y, line_text, max_font_size))
|
36 |
|
37 |
-
#
|
38 |
images_on_page = page.get_images(full=True)
|
39 |
for img_index, img in enumerate(images_on_page):
|
40 |
xref = img[0]
|
|
|
|
|
|
|
41 |
try:
|
42 |
base_image = page.parent.extract_image(xref)
|
43 |
image_bytes = base_image["image"]
|
44 |
ext = base_image["ext"]
|
45 |
image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
|
46 |
-
|
47 |
with open(image_path, "wb") as f:
|
48 |
f.write(image_bytes)
|
49 |
-
|
50 |
image_paths.append(image_path)
|
51 |
-
y_pos = 50 + img_index * 10
|
52 |
elements.append((y_pos, f"", 10))
|
53 |
image_counter += 1
|
54 |
except Exception as e:
|
55 |
elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
|
56 |
|
57 |
-
# Ordenar y construir Markdown
|
58 |
elements.sort(key=lambda x: x[0])
|
59 |
previous_y = None
|
60 |
-
|
61 |
for y, text, font_size in elements:
|
62 |
is_header = font_size >= 14
|
63 |
if previous_y is not None and abs(y - previous_y) > 10:
|
@@ -72,6 +70,7 @@ def extract_text_markdown(doc, image_paths):
|
|
72 |
|
73 |
return markdown_output.strip()
|
74 |
|
|
|
75 |
@spaces.GPU
|
76 |
def convert(pdf_file):
|
77 |
doc = fitz.open(pdf_file)
|
|
|
17 |
def extract_text_markdown(doc, image_paths):
|
18 |
markdown_output = ""
|
19 |
image_counter = 1
|
20 |
+
seen_xrefs = set()
|
21 |
|
22 |
for page_num, page in enumerate(doc):
|
23 |
blocks = page.get_text("dict")["blocks"]
|
24 |
elements = []
|
25 |
|
|
|
26 |
for b in blocks:
|
27 |
y = b["bbox"][1]
|
|
|
28 |
if b["type"] == 0: # Texto
|
29 |
for line in b["lines"]:
|
30 |
line_y = line["bbox"][1]
|
|
|
33 |
if line_text:
|
34 |
elements.append((line_y, line_text, max_font_size))
|
35 |
|
36 |
+
# Extraer imágenes únicas por xref
|
37 |
images_on_page = page.get_images(full=True)
|
38 |
for img_index, img in enumerate(images_on_page):
|
39 |
xref = img[0]
|
40 |
+
if xref in seen_xrefs:
|
41 |
+
continue # Saltar si ya se extrajo
|
42 |
+
seen_xrefs.add(xref)
|
43 |
try:
|
44 |
base_image = page.parent.extract_image(xref)
|
45 |
image_bytes = base_image["image"]
|
46 |
ext = base_image["ext"]
|
47 |
image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
|
|
|
48 |
with open(image_path, "wb") as f:
|
49 |
f.write(image_bytes)
|
|
|
50 |
image_paths.append(image_path)
|
51 |
+
y_pos = 50 + img_index * 10
|
52 |
elements.append((y_pos, f"", 10))
|
53 |
image_counter += 1
|
54 |
except Exception as e:
|
55 |
elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
|
56 |
|
|
|
57 |
elements.sort(key=lambda x: x[0])
|
58 |
previous_y = None
|
|
|
59 |
for y, text, font_size in elements:
|
60 |
is_header = font_size >= 14
|
61 |
if previous_y is not None and abs(y - previous_y) > 10:
|
|
|
70 |
|
71 |
return markdown_output.strip()
|
72 |
|
73 |
+
|
74 |
@spaces.GPU
|
75 |
def convert(pdf_file):
|
76 |
doc = fitz.open(pdf_file)
|