pdf-to-markdown

Running

App Files Files Community

Biifruu committed 21 days ago

Commit

14d9ac1

verified ·

1 Parent(s): 7064c41

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -19

app.py CHANGED Viewed

@@ -11,44 +11,47 @@ def extract_text_markdown(doc):
     for page in doc:
         blocks = page.get_text("dict")["blocks"]
-        lines_group = []
         elements = []
         for b in blocks:
             y = b["bbox"][1]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     spans = line["spans"]
-                    cells = []
-                    for span in spans:
-                        cells.append((span["bbox"][0], span["text"]))  # (x, texto)
                     if len(cells) > 1:
-                        lines_group.append((line["bbox"][1], cells))  # (y, columnas)
                     else:
                         line_text = " ".join(span["text"] for span in spans).strip()
                         if line_text:
                             elements.append((line["bbox"][1], line_text))
             elif b["type"] == 1:  # Imagen
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
-        # Detectar tablas rudimentarias
-        if len(lines_group) >= 2:
-            # Ordenar por coordenada vertical
-            lines_group.sort(key=lambda x: x[0])
-            header_cells = [cell[1].strip() for cell in lines_group[0][1]]
-            markdown_output += "| " + " | ".join(header_cells) + " |\n"
-            markdown_output += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"
-            for y, row in lines_group[1:]:
-                row_cells = [cell[1].strip() for cell in row]
-                markdown_output += "| " + " | ".join(row_cells) + " |\n"
-            markdown_output += "\n"
-        # Agregar líneas de texto sueltas
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, content in elements:
@@ -61,6 +64,22 @@ def extract_text_markdown(doc):
     return markdown_output.strip()
 @spaces.GPU
 def convert(pdf_file):
     original_doc = fitz.open(pdf_file)

     for page in doc:
         blocks = page.get_text("dict")["blocks"]
+        raw_lines = []
         elements = []
         for b in blocks:
             y = b["bbox"][1]
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                     spans = line["spans"]
+                    cells = [(span["bbox"][0], span["text"].strip()) for span in spans if span["text"].strip()]
                     if len(cells) > 1:
+                        raw_lines.append((line["bbox"][1], cells))  # posibles líneas de tabla
                     else:
                         line_text = " ".join(span["text"] for span in spans).strip()
                         if line_text:
                             elements.append((line["bbox"][1], line_text))
             elif b["type"] == 1:  # Imagen
                 elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
+        # Procesar posibles tablas
+        table_block = []
+        previous_cell_count = None
+        for y, cells in raw_lines + [(None, None)]:  # Agregar None para forzar el cierre al final
+            cell_count = len(cells) if cells else None
+            if cell_count == previous_cell_count:
+                table_block.append((y, cells))
+            else:
+                if len(table_block) >= 3:  # Sólo si hay suficientes filas similares
+                    markdown_output += convert_table_block_to_markdown(table_block)
+                    markdown_output += "\n"
+                else:
+                    # Agregar como texto normal si no cumple
+                    for _, row_cells in table_block:
+                        markdown_output += " ".join([c[1] for c in row_cells]) + "\n"
+                table_block = [(y, cells)] if cells else []
+                previous_cell_count = cell_count
+        # Procesar elementos normales
         elements.sort(key=lambda x: x[0])
         previous_y = None
         for y, content in elements:
     return markdown_output.strip()
def convert_table_block_to_markdown(block):
    """Render a block of tabular rows as a Markdown table.

    Args:
        block: Sequence of ``(y, cells)`` pairs. ``cells`` is a sequence
            whose items carry the cell text at index 1 (the caller builds
            them as ``(x, text)`` tuples). The ``y`` value is ignored here.

    Returns:
        The rows formatted as ``| a | b |`` lines joined by newlines, with a
        trailing newline. When the block has more than one row, the first
        row is treated as the header and a ``| --- | ... |`` separator row
        is inserted right after it. An empty block yields ``"\\n"``.
    """
    # Escape literal pipes so cell text cannot be mistaken for a column
    # delimiter when the Markdown is rendered.
    rows = [
        [str(cell[1]).replace("|", "\\|") for cell in cells]
        for _, cells in block
    ]
    lines = ["| " + " | ".join(row) + " |" for row in rows]

    if len(lines) > 1:
        # Derive the column count from the header's cells rather than by
        # counting "|" characters in the rendered line, which over-counts
        # when a header cell itself contains a pipe. A header with no cells
        # still renders as a single empty column ("|  |").
        num_cols = max(len(rows[0]), 1)
        separator = "| " + " | ".join(["---"] * num_cols) + " |"
        lines.insert(1, separator)

    return "\n".join(lines) + "\n"
 @spaces.GPU
 def convert(pdf_file):
     original_doc = fitz.open(pdf_file)