pdf-to-markdown

Running

App Files Files Community

Biifruu committed on 19 days ago

Commit

7064c41

verified ·

1 Parent(s): dd29269

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -9

app.py CHANGED Viewed

@@ -7,29 +7,49 @@ import os
 def extract_text_markdown(doc):
     markdown_output = ""
-    image_counter = 1  # Contador de imágenes
     for page in doc:
         blocks = page.get_text("dict")["blocks"]
         elements = []
         for b in blocks:
             y = b["bbox"][1]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
-                    line_y = line["bbox"][1]
-                    line_text = " ".join([span["text"] for span in line["spans"]]).strip()
-                    if line_text:
-                        elements.append((line_y, line_text))
             elif b["type"] == 1:  # Imagen
-                # Añade un enlace con nombre único
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
-        # Ordenar por posición vertical
-        elements.sort(key=lambda x: x[0])
-        # Reconstrucción con saltos lógicos
         previous_y = None
         for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10:

 def extract_text_markdown(doc):
     markdown_output = ""
+    image_counter = 1
     for page in doc:
         blocks = page.get_text("dict")["blocks"]
+        lines_group = []
         elements = []
         for b in blocks:
             y = b["bbox"][1]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
+                    spans = line["spans"]
+                    cells = []
+                    for span in spans:
+                        cells.append((span["bbox"][0], span["text"]))  # (x, texto)
+                    if len(cells) > 1:
+                        lines_group.append((line["bbox"][1], cells))  # (y, columnas)
+                    else:
+                        line_text = " ".join(span["text"] for span in spans).strip()
+                        if line_text:
+                            elements.append((line["bbox"][1], line_text))
             elif b["type"] == 1:  # Imagen
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
+        # Detectar tablas rudimentarias
+        if len(lines_group) >= 2:
+            # Ordenar por coordenada vertical
+            lines_group.sort(key=lambda x: x[0])
+            header_cells = [cell[1].strip() for cell in lines_group[0][1]]
+            markdown_output += "| " + " | ".join(header_cells) + " |\n"
+            markdown_output += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"
+            for y, row in lines_group[1:]:
+                row_cells = [cell[1].strip() for cell in row]
+                markdown_output += "| " + " | ".join(row_cells) + " |\n"
+            markdown_output += "\n"
+        # Agregar líneas de texto sueltas
+        elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10: