pdf-to-markdown

Running

App Files Community

Biifruu committed 11 days ago

Commit

0a2dbbc

verified ·

1 Parent(s): 650dd50

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -54

app.py CHANGED Viewed

@@ -14,63 +14,64 @@ def clean_ocr_text(text):
             cleaned_lines.append(line)
     return "\n".join(cleaned_lines)
-def extract_text_markdown(doc, image_paths):
-    markdown_output = ""
     image_counter = 1
     seen_xrefs = set()
-    for page_num, page in enumerate(doc):
-        blocks = page.get_text("dict")["blocks"]
-        elements = []
-        for b in blocks:
-            y = b["bbox"][1]
-            if b["type"] == 0:  # Texto
-                for line in b["lines"]:
-                    line_y = line["bbox"][1]
-                    line_text = " ".join([span["text"] for span in line["spans"]]).strip()
-                    max_font_size = max([span.get("size", 10) for span in line["spans"]])
-                    if line_text:
-                        elements.append((line_y, line_text, max_font_size))
-        # Extraer imágenes únicas por xref
-        images_on_page = page.get_images(full=True)
-        for img_index, img in enumerate(images_on_page):
-            xref = img[0]
-            if xref in seen_xrefs:
-                continue  # Saltar si ya se extrajo
-            seen_xrefs.add(xref)
-            try:
-                base_image = page.parent.extract_image(xref)
-                image_bytes = base_image["image"]
-                ext = base_image["ext"]
-                image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
-                with open(image_path, "wb") as f:
-                    f.write(image_bytes)
-                image_paths.append(image_path)
-                y_pos = 50 + img_index * 10
-                elements.append((y_pos, f"![imagen_{image_counter}]({image_path})", 10))
-                image_counter += 1
-            except Exception as e:
-                elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
-        elements.sort(key=lambda x: x[0])
-        previous_y = None
-        for y, text, font_size in elements:
-            is_header = font_size >= 14
-            if previous_y is not None and abs(y - previous_y) > 10:
-                markdown_output += "\n"
-            if is_header:
-                markdown_output += f"\n### {text.strip()}\n"
-            else:
-                markdown_output += text.strip() + "\n"
-            previous_y = y
-        markdown_output += "\n---\n\n"
     return markdown_output.strip()
 @spaces.GPU
 def convert(pdf_file):
     doc = fitz.open(pdf_file)
@@ -82,8 +83,9 @@ def convert(pdf_file):
         text = page.get_text("text").strip()
         if len(text) > 30:
-            markdown_output += extract_text_markdown([page], image_paths) + "\n"
         else:
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
@@ -102,7 +104,7 @@ def convert(pdf_file):
             if ocr_text.strip():
                 markdown_output += ocr_text + "\n"
-        markdown_output += "\n---\n\n"
     # Guardar como archivo .md
     markdown_path = "/tmp/resultado.md"

             cleaned_lines.append(line)
     return "\n".join(cleaned_lines)
+def extract_text_markdown(doc, image_paths, page_index):
+    markdown_output = f"\n## Página {page_index + 1}\n\n"
     image_counter = 1
     seen_xrefs = set()
+    elements = []
+    page = doc[0]  # Solo se procesa una página cada vez
+    blocks = page.get_text("dict")["blocks"]
+    for b in blocks:
+        y = b["bbox"][1]
+        if b["type"] == 0:  # Texto
+            for line in b["lines"]:
+                line_y = line["bbox"][1]
+                line_text = " ".join([span["text"] for span in line["spans"]]).strip()
+                max_font_size = max([span.get("size", 10) for span in line["spans"]])
+                if line_text:
+                    elements.append((line_y, line_text, max_font_size))
+    # Extraer imágenes únicas
+    images_on_page = page.get_images(full=True)
+    for img_index, img in enumerate(images_on_page):
+        xref = img[0]
+        if xref in seen_xrefs:
+            continue
+        seen_xrefs.add(xref)
+        try:
+            base_image = page.parent.extract_image(xref)
+            image_bytes = base_image["image"]
+            ext = base_image["ext"]
+            image_path = f"/tmp/imagen_p{page_index + 1}_{img_index + 1}.{ext}"
+            with open(image_path, "wb") as f:
+                f.write(image_bytes)
+            image_paths.append(image_path)
+            # Usar posición alta para insertar al final del Markdown
+            elements.append((float("inf") - img_index, f"\n\n![imagen_{image_counter}]({image_path})\n", 10))
+            image_counter += 1
+        except Exception as e:
+            elements.append((float("inf"), f"[Error imagen: {e}]", 10))
+    # Ordenar por posición
+    elements.sort(key=lambda x: x[0])
+    previous_y = None
+    for y, text, font_size in elements:
+        is_header = font_size >= 14
+        if previous_y is not None and abs(y - previous_y) > 10:
+            markdown_output += "\n"
+        if is_header:
+            markdown_output += f"\n### {text.strip()}\n"
+        else:
+            markdown_output += text.strip() + "\n"
+        previous_y = y
+    markdown_output += "\n---\n\n"
     return markdown_output.strip()
 @spaces.GPU
 def convert(pdf_file):
     doc = fitz.open(pdf_file)
         text = page.get_text("text").strip()
         if len(text) > 30:
+            markdown_output += extract_text_markdown([page], image_paths, page_num) + "\n"
         else:
+            markdown_output += f"\n## Página {page_num + 1}\n\n"
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             if ocr_text.strip():
                 markdown_output += ocr_text + "\n"
+            markdown_output += "\n---\n\n"
     # Guardar como archivo .md
     markdown_path = "/tmp/resultado.md"