Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -14,63 +14,64 @@ def clean_ocr_text(text):
|
|
14 |
cleaned_lines.append(line)
|
15 |
return "\n".join(cleaned_lines)
|
16 |
|
17 |
-
def extract_text_markdown(doc, image_paths):
|
18 |
-
markdown_output = ""
|
19 |
image_counter = 1
|
20 |
seen_xrefs = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
|
23 |
-
blocks = page.get_text("dict")["blocks"]
|
24 |
-
elements = []
|
25 |
-
|
26 |
-
for b in blocks:
|
27 |
-
y = b["bbox"][1]
|
28 |
-
if b["type"] == 0: # Texto
|
29 |
-
for line in b["lines"]:
|
30 |
-
line_y = line["bbox"][1]
|
31 |
-
line_text = " ".join([span["text"] for span in line["spans"]]).strip()
|
32 |
-
max_font_size = max([span.get("size", 10) for span in line["spans"]])
|
33 |
-
if line_text:
|
34 |
-
elements.append((line_y, line_text, max_font_size))
|
35 |
-
|
36 |
-
# Extraer imágenes únicas por xref
|
37 |
-
images_on_page = page.get_images(full=True)
|
38 |
-
for img_index, img in enumerate(images_on_page):
|
39 |
-
xref = img[0]
|
40 |
-
if xref in seen_xrefs:
|
41 |
-
continue # Saltar si ya se extrajo
|
42 |
-
seen_xrefs.add(xref)
|
43 |
-
try:
|
44 |
-
base_image = page.parent.extract_image(xref)
|
45 |
-
image_bytes = base_image["image"]
|
46 |
-
ext = base_image["ext"]
|
47 |
-
image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
|
48 |
-
with open(image_path, "wb") as f:
|
49 |
-
f.write(image_bytes)
|
50 |
-
image_paths.append(image_path)
|
51 |
-
y_pos = 50 + img_index * 10
|
52 |
-
elements.append((y_pos, f"", 10))
|
53 |
-
image_counter += 1
|
54 |
-
except Exception as e:
|
55 |
-
elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
|
56 |
-
|
57 |
-
elements.sort(key=lambda x: x[0])
|
58 |
-
previous_y = None
|
59 |
-
for y, text, font_size in elements:
|
60 |
-
is_header = font_size >= 14
|
61 |
-
if previous_y is not None and abs(y - previous_y) > 10:
|
62 |
-
markdown_output += "\n"
|
63 |
-
if is_header:
|
64 |
-
markdown_output += f"\n### {text.strip()}\n"
|
65 |
-
else:
|
66 |
-
markdown_output += text.strip() + "\n"
|
67 |
-
previous_y = y
|
68 |
-
|
69 |
-
markdown_output += "\n---\n\n"
|
70 |
-
|
71 |
return markdown_output.strip()
|
72 |
|
73 |
-
|
74 |
@spaces.GPU
|
75 |
def convert(pdf_file):
|
76 |
doc = fitz.open(pdf_file)
|
@@ -82,8 +83,9 @@ def convert(pdf_file):
|
|
82 |
text = page.get_text("text").strip()
|
83 |
|
84 |
if len(text) > 30:
|
85 |
-
markdown_output += extract_text_markdown([page], image_paths) + "\n"
|
86 |
else:
|
|
|
87 |
pix = page.get_pixmap(dpi=300)
|
88 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
89 |
|
@@ -102,7 +104,7 @@ def convert(pdf_file):
|
|
102 |
if ocr_text.strip():
|
103 |
markdown_output += ocr_text + "\n"
|
104 |
|
105 |
-
|
106 |
|
107 |
# Guardar como archivo .md
|
108 |
markdown_path = "/tmp/resultado.md"
|
|
|
14 |
cleaned_lines.append(line)
|
15 |
return "\n".join(cleaned_lines)
|
16 |
|
17 |
+
def extract_text_markdown(doc, image_paths, page_index):
    """Render the first page of *doc* as a Markdown section.

    The page's text lines are emitted top-to-bottom (by the y coordinate of
    each line's bounding box); lines whose largest span font size is at least
    14 become ``###`` headings, and a blank line is inserted whenever two
    consecutive lines are more than 10 units apart vertically.  Embedded
    images are written to ``/tmp`` and referenced after the text.

    Args:
        doc: Indexable container whose element 0 is the page to process
            (the visible caller passes ``[page]``).  The page must provide
            ``get_text("dict")``, ``get_images(full=True)`` and
            ``parent.extract_image(xref)``.
        image_paths: List that is extended in place with the path of every
            image file written by this call.
        page_index: Zero-based page number; used (plus one) in the section
            title and in the image file names.

    Returns:
        The Markdown for the page, stripped of leading/trailing whitespace
        and ending with a ``---`` separator.
    """
    header_min_size = 14  # font size from which a line is treated as a heading
    paragraph_gap = 10    # vertical distance that triggers a blank line

    page = doc[0]  # only one page is processed per call

    # Collect (y, text, largest font size) for every non-empty text line.
    text_lines = []
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # 0 == text block
            continue
        for line in block["lines"]:
            spans = line["spans"]
            line_text = " ".join(span["text"] for span in spans).strip()
            if not line_text:
                continue
            # default= keeps a span-less line from raising ValueError.
            font_size = max((span.get("size", 10) for span in spans), default=10)
            text_lines.append((line["bbox"][1], line_text, font_size))
    text_lines.sort(key=lambda item: item[0])

    # Extract each distinct image once.  Images are kept in their own list
    # and appended after the text, instead of being given an infinite sort
    # key (inf - n is still inf, and inf - inf is nan, which silently
    # disabled the gap check between image entries).
    image_entries = []
    seen_xrefs = set()
    image_counter = 1
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        if xref in seen_xrefs:
            continue
        seen_xrefs.add(xref)
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            ext = base_image["ext"]
            image_path = f"/tmp/imagen_p{page_index + 1}_{img_index + 1}.{ext}"
            with open(image_path, "wb") as f:
                f.write(image_bytes)
        except Exception as e:
            # Deliberately best-effort: a bad image must not abort the page,
            # so the failure is reported inline in the output instead.
            image_entries.append(f"[Error imagen: {e}]")
            continue
        image_paths.append(image_path)
        # Reference the saved file so the image actually appears in the
        # Markdown (previously only whitespace was emitted here).
        image_entries.append(f"![Imagen {image_counter}]({image_path})")
        image_counter += 1

    parts = [f"\n## Página {page_index + 1}\n\n"]

    previous_y = None
    for y, text, font_size in text_lines:
        if previous_y is not None and abs(y - previous_y) > paragraph_gap:
            parts.append("\n")
        if font_size >= header_min_size:
            parts.append(f"\n### {text}\n")
        else:
            parts.append(text + "\n")
        previous_y = y

    # Each image (or image error) goes in its own paragraph after the text.
    needs_gap = bool(text_lines)
    for entry in image_entries:
        if needs_gap:
            parts.append("\n")
        parts.append(entry + "\n")
        needs_gap = True

    parts.append("\n---\n\n")
    return "".join(parts).strip()
|
74 |
|
|
|
75 |
@spaces.GPU
|
76 |
def convert(pdf_file):
|
77 |
doc = fitz.open(pdf_file)
|
|
|
83 |
text = page.get_text("text").strip()
|
84 |
|
85 |
if len(text) > 30:
|
86 |
+
markdown_output += extract_text_markdown([page], image_paths, page_num) + "\n"
|
87 |
else:
|
88 |
+
markdown_output += f"\n## Página {page_num + 1}\n\n"
|
89 |
pix = page.get_pixmap(dpi=300)
|
90 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
91 |
|
|
|
104 |
if ocr_text.strip():
|
105 |
markdown_output += ocr_text + "\n"
|
106 |
|
107 |
+
markdown_output += "\n---\n\n"
|
108 |
|
109 |
# Guardar como archivo .md
|
110 |
markdown_path = "/tmp/resultado.md"
|