Biifruu committed on
Commit
58c62dc
·
verified ·
1 Parent(s): 564947a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -20
app.py CHANGED
@@ -17,13 +17,12 @@ def extract_text_markdown(doc):
17
  if b["type"] == 0: # Texto
18
  for line in b["lines"]:
19
  line_y = line["bbox"][1]
20
- line_spans = line["spans"]
21
 
22
- # Si el texto tiene múltiples columnas o alineaciones → tabla simple
23
- if len(line_spans) > 1:
24
- line_text = " | ".join([span["text"].strip() for span in line_spans])
25
  else:
26
- line_text = " ".join([span["text"].strip() for span in line_spans])
27
 
28
  if line_text:
29
  elements.append((line_y, line_text))
@@ -31,10 +30,8 @@ def extract_text_markdown(doc):
31
  elements.append((y, f"[imagen_{image_counter}]()"))
32
  image_counter += 1
33
 
34
- # Ordenar por posición vertical
35
  elements.sort(key=lambda x: x[0])
36
 
37
- # Reconstrucción
38
  previous_y = None
39
  for y, content in elements:
40
  if previous_y is not None and abs(y - previous_y) > 10:
@@ -59,20 +56,23 @@ def convert(pdf_file):
59
  doc = original_doc
60
 
61
  markdown = extract_text_markdown(doc)
62
- metadata = {} # Si deseas, aquí puedes agregar metadatos
63
  return markdown, metadata
64
 
65
- # Interfaz Gradio con botón copiar
66
- markdown_output = gr.Textbox(label="Markdown estructurado", lines=20)
67
- metadata_output = gr.JSON(label="Metadata")
68
 
69
- copy_button_html = """
70
- <button onclick="navigator.clipboard.writeText(document.querySelector('textarea').value)">📋 Copiar al portapapeles</button>
71
- """
72
 
73
- gr.Interface(
74
- fn=convert,
75
- inputs=gr.File(label="Sube tu PDF", type="filepath"),
76
- outputs=[markdown_output, metadata_output, gr.HTML(copy_button_html)],
77
- title="Extractor PDF → Markdown con imágenes como enlaces y soporte de tablas",
78
- ).launch()
 
 
 
 
17
  if b["type"] == 0: # Texto
18
  for line in b["lines"]:
19
  line_y = line["bbox"][1]
20
+ spans = line["spans"]
21
 
22
+ if len(spans) > 1:
23
+ line_text = " | ".join(span["text"].strip() for span in spans)
 
24
  else:
25
+ line_text = " ".join(span["text"].strip() for span in spans)
26
 
27
  if line_text:
28
  elements.append((line_y, line_text))
 
30
  elements.append((y, f"[imagen_{image_counter}]()"))
31
  image_counter += 1
32
 
 
33
  elements.sort(key=lambda x: x[0])
34
 
 
35
  previous_y = None
36
  for y, content in elements:
37
  if previous_y is not None and abs(y - previous_y) > 10:
 
56
  doc = original_doc
57
 
58
  markdown = extract_text_markdown(doc)
59
+ metadata = {} # Añade metadatos si quieres
60
  return markdown, metadata
61
 
62
+ # Gradio Interface
63
+ with gr.Blocks(title="Extractor PDF a Markdown") as demo:
64
+ gr.Markdown("### PDF → Markdown con imágenes como enlaces y botón de copiar")
65
 
66
+ pdf_input = gr.File(label="Sube tu PDF", type="filepath")
67
+ markdown_output = gr.Textbox(label="Markdown generado", lines=25, elem_id="md_output")
68
+ metadata_output = gr.JSON(label="Metadata")
69
 
70
+ with gr.Row():
71
+ convert_btn = gr.Button("Convertir PDF")
72
+ copy_btn = gr.HTML("""
73
+ <button onclick="navigator.clipboard.writeText(document.getElementById('md_output').value)">📋 Copiar Markdown</button>
74
+ """)
75
+
76
+ convert_btn.click(fn=convert, inputs=pdf_input, outputs=[markdown_output, metadata_output])
77
+
78
+ demo.launch()