pdf-to-markdown

Running

App Files Community

Biifruu committed 12 days ago

Commit

650dd50

verified ·

1 Parent(s): 0ef1375

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -8

app.py CHANGED Viewed

@@ -17,15 +17,14 @@ def clean_ocr_text(text):
 def extract_text_markdown(doc, image_paths):
     markdown_output = ""
     image_counter = 1
     for page_num, page in enumerate(doc):
         blocks = page.get_text("dict")["blocks"]
         elements = []
-        # 🔁 Añadir texto normal (bloques)
         for b in blocks:
             y = b["bbox"][1]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
@@ -34,30 +33,29 @@ def extract_text_markdown(doc, image_paths):
                     if line_text:
                         elements.append((line_y, line_text, max_font_size))
-        # 🖼️ Extraer imágenes reales de la página (xref)
         images_on_page = page.get_images(full=True)
         for img_index, img in enumerate(images_on_page):
             xref = img[0]
             try:
                 base_image = page.parent.extract_image(xref)
                 image_bytes = base_image["image"]
                 ext = base_image["ext"]
                 image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
                 with open(image_path, "wb") as f:
                     f.write(image_bytes)
                 image_paths.append(image_path)
-                y_pos = 50 + img_index * 10  # Posición estimada para ordenar
                 elements.append((y_pos, f"![imagen_{image_counter}]({image_path})", 10))
                 image_counter += 1
             except Exception as e:
                 elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
-        # Ordenar y construir Markdown
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, text, font_size in elements:
             is_header = font_size >= 14
             if previous_y is not None and abs(y - previous_y) > 10:
@@ -72,6 +70,7 @@ def extract_text_markdown(doc, image_paths):
     return markdown_output.strip()
 @spaces.GPU
 def convert(pdf_file):
     doc = fitz.open(pdf_file)

 def extract_text_markdown(doc, image_paths):
     markdown_output = ""
     image_counter = 1
+    seen_xrefs = set()
     for page_num, page in enumerate(doc):
         blocks = page.get_text("dict")["blocks"]
         elements = []
         for b in blocks:
             y = b["bbox"][1]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
                     if line_text:
                         elements.append((line_y, line_text, max_font_size))
+        # Extraer imágenes únicas por xref
         images_on_page = page.get_images(full=True)
         for img_index, img in enumerate(images_on_page):
             xref = img[0]
+            if xref in seen_xrefs:
+                continue  # Saltar si ya se extrajo
+            seen_xrefs.add(xref)
             try:
                 base_image = page.parent.extract_image(xref)
                 image_bytes = base_image["image"]
                 ext = base_image["ext"]
                 image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
                 with open(image_path, "wb") as f:
                     f.write(image_bytes)
                 image_paths.append(image_path)
+                y_pos = 50 + img_index * 10
                 elements.append((y_pos, f"![imagen_{image_counter}]({image_path})", 10))
                 image_counter += 1
             except Exception as e:
                 elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, text, font_size in elements:
             is_header = font_size >= 14
             if previous_y is not None and abs(y - previous_y) > 10:
     return markdown_output.strip()
 @spaces.GPU
 def convert(pdf_file):
     doc = fitz.open(pdf_file)