Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -14,13 +14,12 @@ def clean_ocr_text(text):
|
|
14 |
cleaned_lines.append(line)
|
15 |
return "\n".join(cleaned_lines)
|
16 |
|
17 |
-
def extract_text_markdown(doc, image_paths, page_index):
|
18 |
markdown_output = f"\n## Página {page_index + 1}\n\n"
|
19 |
image_counter = 1
|
20 |
-
seen_xrefs = set()
|
21 |
elements = []
|
22 |
|
23 |
-
page = doc[0] #
|
24 |
|
25 |
blocks = page.get_text("dict")["blocks"]
|
26 |
|
@@ -34,12 +33,12 @@ def extract_text_markdown(doc, image_paths, page_index):
|
|
34 |
if line_text:
|
35 |
elements.append((line_y, line_text, max_font_size))
|
36 |
|
37 |
-
# Extraer imágenes únicas
|
38 |
images_on_page = page.get_images(full=True)
|
39 |
for img_index, img in enumerate(images_on_page):
|
40 |
xref = img[0]
|
41 |
if xref in seen_xrefs:
|
42 |
-
continue
|
43 |
seen_xrefs.add(xref)
|
44 |
try:
|
45 |
base_image = page.parent.extract_image(xref)
|
@@ -49,7 +48,6 @@ def extract_text_markdown(doc, image_paths, page_index):
|
|
49 |
with open(image_path, "wb") as f:
|
50 |
f.write(image_bytes)
|
51 |
image_paths.append(image_path)
|
52 |
-
# Usar posición alta para insertar al final del Markdown
|
53 |
elements.append((float("inf") - img_index, f"\n\n\n", 10))
|
54 |
image_counter += 1
|
55 |
except Exception as e:
|
@@ -77,13 +75,14 @@ def convert(pdf_file):
|
|
77 |
doc = fitz.open(pdf_file)
|
78 |
markdown_output = ""
|
79 |
image_paths = []
|
|
|
80 |
|
81 |
for page_num in range(len(doc)):
|
82 |
page = doc[page_num]
|
83 |
text = page.get_text("text").strip()
|
84 |
|
85 |
if len(text) > 30:
|
86 |
-
markdown_output += extract_text_markdown([page], image_paths, page_num) + "\n"
|
87 |
else:
|
88 |
markdown_output += f"\n## Página {page_num + 1}\n\n"
|
89 |
pix = page.get_pixmap(dpi=300)
|
|
|
14 |
cleaned_lines.append(line)
|
15 |
return "\n".join(cleaned_lines)
|
16 |
|
17 |
+
def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
|
18 |
markdown_output = f"\n## Página {page_index + 1}\n\n"
|
19 |
image_counter = 1
|
|
|
20 |
elements = []
|
21 |
|
22 |
+
page = doc[0] # solo una página en cada llamada
|
23 |
|
24 |
blocks = page.get_text("dict")["blocks"]
|
25 |
|
|
|
33 |
if line_text:
|
34 |
elements.append((line_y, line_text, max_font_size))
|
35 |
|
36 |
+
# Extraer imágenes únicas (por xref, global)
|
37 |
images_on_page = page.get_images(full=True)
|
38 |
for img_index, img in enumerate(images_on_page):
|
39 |
xref = img[0]
|
40 |
if xref in seen_xrefs:
|
41 |
+
continue # ya extraída
|
42 |
seen_xrefs.add(xref)
|
43 |
try:
|
44 |
base_image = page.parent.extract_image(xref)
|
|
|
48 |
with open(image_path, "wb") as f:
|
49 |
f.write(image_bytes)
|
50 |
image_paths.append(image_path)
|
|
|
51 |
elements.append((float("inf") - img_index, f"\n\n\n", 10))
|
52 |
image_counter += 1
|
53 |
except Exception as e:
|
|
|
75 |
doc = fitz.open(pdf_file)
|
76 |
markdown_output = ""
|
77 |
image_paths = []
|
78 |
+
seen_xrefs = set() # <<-- GLOBAL para todo el PDF
|
79 |
|
80 |
for page_num in range(len(doc)):
|
81 |
page = doc[page_num]
|
82 |
text = page.get_text("text").strip()
|
83 |
|
84 |
if len(text) > 30:
|
85 |
+
markdown_output += extract_text_markdown([page], image_paths, page_num, seen_xrefs) + "\n"
|
86 |
else:
|
87 |
markdown_output += f"\n## Página {page_num + 1}\n\n"
|
88 |
pix = page.get_pixmap(dpi=300)
|