Biifruu committed on
Commit
564947a
·
verified ·
1 Parent(s): 891d450

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -9
app.py CHANGED
@@ -3,11 +3,10 @@ import gradio as gr
3
  import fitz # PyMuPDF
4
  import ocrmypdf
5
  import tempfile
6
- import os
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
- image_counter = 1 # Contador de imágenes
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
@@ -18,18 +17,24 @@ def extract_text_markdown(doc):
18
  if b["type"] == 0: # Texto
19
  for line in b["lines"]:
20
  line_y = line["bbox"][1]
21
- line_text = " ".join([span["text"] for span in line["spans"]]).strip()
 
 
 
 
 
 
 
22
  if line_text:
23
  elements.append((line_y, line_text))
24
  elif b["type"] == 1: # Imagen
25
- # Añade un enlace con nombre único
26
  elements.append((y, f"[imagen_{image_counter}]()"))
27
  image_counter += 1
28
 
29
  # Ordenar por posición vertical
30
  elements.sort(key=lambda x: x[0])
31
 
32
- # Reconstrucción con saltos lógicos
33
  previous_y = None
34
  for y, content in elements:
35
  if previous_y is not None and abs(y - previous_y) > 10:
@@ -46,7 +51,6 @@ def convert(pdf_file):
46
  original_doc = fitz.open(pdf_file)
47
  plain_text = "\n".join([page.get_text() for page in original_doc])
48
 
49
- # Aplicar OCR solo si el PDF no tiene texto
50
  if len(plain_text.strip()) < 100:
51
  ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
52
  ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
@@ -55,11 +59,20 @@ def convert(pdf_file):
55
  doc = original_doc
56
 
57
  markdown = extract_text_markdown(doc)
58
- metadata = {} # Si necesitas metadatos, se pueden agregar aquí
59
  return markdown, metadata
60
 
 
 
 
 
 
 
 
 
61
  gr.Interface(
62
  fn=convert,
63
- inputs=[gr.File(label="Sube tu PDF", type="filepath")],
64
- outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
 
65
  ).launch()
 
3
  import fitz # PyMuPDF
4
  import ocrmypdf
5
  import tempfile
 
6
 
7
  def extract_text_markdown(doc):
8
  markdown_output = ""
9
+ image_counter = 1
10
 
11
  for page in doc:
12
  blocks = page.get_text("dict")["blocks"]
 
17
  if b["type"] == 0: # Texto
18
  for line in b["lines"]:
19
  line_y = line["bbox"][1]
20
+ line_spans = line["spans"]
21
+
22
+ # Si el texto tiene múltiples columnas o alineaciones → tabla simple
23
+ if len(line_spans) > 1:
24
+ line_text = " | ".join([span["text"].strip() for span in line_spans])
25
+ else:
26
+ line_text = " ".join([span["text"].strip() for span in line_spans])
27
+
28
  if line_text:
29
  elements.append((line_y, line_text))
30
  elif b["type"] == 1: # Imagen
 
31
  elements.append((y, f"[imagen_{image_counter}]()"))
32
  image_counter += 1
33
 
34
  # Ordenar por posición vertical
35
  elements.sort(key=lambda x: x[0])
36
 
37
+ # Reconstrucción
38
  previous_y = None
39
  for y, content in elements:
40
  if previous_y is not None and abs(y - previous_y) > 10:
 
51
  original_doc = fitz.open(pdf_file)
52
  plain_text = "\n".join([page.get_text() for page in original_doc])
53
 
 
54
  if len(plain_text.strip()) < 100:
55
  ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
56
  ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
 
59
  doc = original_doc
60
 
61
  markdown = extract_text_markdown(doc)
62
+ metadata = {} # Si deseas, aquí puedes agregar metadatos
63
  return markdown, metadata
64
 
65
+ # Interfaz Gradio con botón copiar
66
+ markdown_output = gr.Textbox(label="Markdown estructurado", lines=20)
67
+ metadata_output = gr.JSON(label="Metadata")
68
+
69
+ copy_button_html = """
70
+ <button onclick="navigator.clipboard.writeText(document.querySelector('textarea').value)">📋 Copiar al portapapeles</button>
71
+ """
72
+
73
  gr.Interface(
74
  fn=convert,
75
+ inputs=gr.File(label="Sube tu PDF", type="filepath"),
76
+ outputs=[markdown_output, metadata_output, gr.HTML(copy_button_html)],
77
+ title="Extractor PDF → Markdown con imágenes como enlaces y soporte de tablas",
78
  ).launch()