import spaces
import gradio as gr
import fitz  # PyMuPDF
import ocrmypdf
import tempfile
import os

def _runs_of_equal_width(rows):
    """Split ``rows`` into runs of consecutive rows with the same cell count.

    Each row is a ``(y, cells)`` tuple; the input order is preserved.
    """
    runs = []
    for row in rows:
        if runs and len(runs[-1][-1][1]) == len(row[1]):
            runs[-1].append(row)
        else:
            runs.append([row])
    return runs


def extract_text_markdown(doc, *, paragraph_gap=10, min_table_rows=3):
    """Render every page of ``doc`` as Markdown text.

    For each page, text lines and image placeholders are emitted from top to
    bottom. Lines made of several non-empty spans are treated as candidate
    table rows: a run of at least ``min_table_rows`` consecutive rows with the
    same number of cells is rendered as a Markdown table, anything shorter is
    emitted as a plain line with its cells joined by spaces. Tables and plain
    lines are merged into one flow ordered by vertical position, so a table
    appears where it sits on the page instead of ahead of all other text.

    Args:
        doc: Iterable of pages; each page must provide ``get_text("dict")``
            returning a mapping with a ``"blocks"`` list.
        paragraph_gap: Vertical distance above which a blank line is inserted
            between two consecutive elements.
        min_table_rows: Minimum number of consecutive rows with an equal cell
            count required to render them as a table.

    Returns:
        The Markdown for all pages, each page followed by a ``---`` rule, with
        leading and trailing whitespace stripped.
    """
    chunks = []
    image_counter = 1  # numbering continues across pages

    for page in doc:
        # Every item is (top_y, bottom_y, markdown_text).
        items = []
        candidate_rows = []

        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    line_y = line["bbox"][1]
                    spans = line["spans"]
                    cells = [
                        (span["bbox"][0], span["text"].strip())
                        for span in spans
                        if span["text"].strip()
                    ]
                    if len(cells) > 1:
                        # Several non-empty spans: possible table row.
                        candidate_rows.append((line_y, cells))
                        continue
                    line_text = " ".join(span["text"] for span in spans).strip()
                    if line_text:
                        items.append((line_y, line_y, line_text))
            elif block["type"] == 1:  # image block
                image_y = block["bbox"][1]
                items.append((image_y, image_y, f"[imagen_{image_counter}]()"))
                image_counter += 1

        for run in _runs_of_equal_width(candidate_rows):
            if len(run) >= min_table_rows:
                row_ys = [row_y for row_y, _ in run]
                # The table text already ends with a newline, so the newline
                # appended on output leaves a blank line after the table.
                items.append(
                    (min(row_ys), max(row_ys), convert_table_block_to_markdown(run))
                )
            else:
                # Too few similar rows: emit them as ordinary text lines.
                items.extend(
                    (row_y, row_y, " ".join(text for _, text in cells))
                    for row_y, cells in run
                )

        # Stable sort on the top coordinate only, keeping extraction order for ties.
        items.sort(key=lambda item: item[0])

        previous_bottom = None
        for top, bottom, text in items:
            if previous_bottom is not None and abs(top - previous_bottom) > paragraph_gap:
                chunks.append("\n")
            chunks.append(text + "\n")
            previous_bottom = bottom

        chunks.append("\n---\n\n")

    return "".join(chunks).strip()

def convert_table_block_to_markdown(block):
    """Convert a block of table-like rows into a Markdown table.

    Args:
        block: Sequence of ``(y, cells)`` rows, where ``cells`` is a sequence
            of ``(x, text)`` tuples. Only the cell texts are used.

    Returns:
        One ``| a | b |`` line per row, newline-terminated. When there is more
        than one row, a ``| --- | --- |`` separator is inserted after the
        first row so that it is rendered as the table header. An empty block
        yields a single newline.
    """
    # A literal pipe inside a cell would otherwise be read as a column
    # delimiter, so escape it.
    rows = [[text.replace("|", "\\|") for _, text in cells] for _, cells in block]
    lines = ["| " + " | ".join(row) + " |" for row in rows]

    if len(lines) > 1:
        # Count columns from the header cells themselves rather than from the
        # pipes in the rendered line, which miscounts when a cell contains "|".
        num_cols = max(1, len(rows[0]))
        separator = "| " + " | ".join(["---"] * num_cols) + " |"
        lines.insert(1, separator)

    return "\n".join(lines) + "\n"

@spaces.GPU
def convert(pdf_file):
    """Convert a PDF file into structured Markdown.

    The PDF's embedded text is read first. If fewer than 100 non-whitespace-
    trimmed characters are found, the file is assumed to be scanned and is run
    through OCR into a temporary PDF, which is then used for extraction.

    Args:
        pdf_file: Filesystem path of the PDF to convert. A falsy value (no
            file supplied) produces an empty result.

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently always an
        empty dict.
    """
    metadata = {}  # Placeholder: populate here if metadata is ever needed.

    if not pdf_file:
        # Nothing to convert; avoid handing None to the PDF/OCR libraries.
        return "", metadata

    original_doc = fitz.open(pdf_file)
    try:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        # Enough embedded text: no OCR needed.
        if len(plain_text.strip()) >= 100:
            return extract_text_markdown(original_doc), metadata
    finally:
        original_doc.close()

    # OCR path: write the OCR result to a temp file that is always removed.
    fd, ocr_temp_path = tempfile.mkstemp(suffix=".pdf")
    os.close(fd)  # only the path is needed; the OCR step writes the file itself
    try:
        ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
        ocr_doc = fitz.open(ocr_temp_path)
        try:
            return extract_text_markdown(ocr_doc), metadata
        finally:
            ocr_doc.close()
    finally:
        if os.path.exists(ocr_temp_path):
            os.remove(ocr_temp_path)

# Build the UI: one PDF upload as input, the Markdown text and a JSON
# metadata panel as outputs, all wired to `convert`; then start the app.
_pdf_input = gr.File(label="Sube tu PDF", type="filepath")
_markdown_output = gr.Text(label="Markdown estructurado")
_metadata_output = gr.JSON(label="Metadata")

gr.Interface(
    fn=convert,
    inputs=[_pdf_input],
    outputs=[_markdown_output, _metadata_output],
).launch()