Biifruu committed on
Commit
dd29269
·
verified ·
1 Parent(s): f824c8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -33
app.py CHANGED
@@ -3,10 +3,11 @@ import gradio as gr
3
  import fitz # PyMuPDF
4
  import ocrmypdf
5
  import tempfile
 
6
 
7
  def extract_text_markdown(doc):
8
  markdown_output = ""
9
- image_counter = 1
10
 
11
  for page in doc:
12
  blocks = page.get_text("dict")["blocks"]
@@ -17,16 +18,18 @@ def extract_text_markdown(doc):
17
  if b["type"] == 0: # Texto
18
  for line in b["lines"]:
19
  line_y = line["bbox"][1]
20
- spans = line["spans"]
21
- line_text = " ".join(span["text"].strip() for span in spans)
22
  if line_text:
23
  elements.append((line_y, line_text))
24
  elif b["type"] == 1: # Imagen
 
25
  elements.append((y, f"[imagen_{image_counter}]()"))
26
  image_counter += 1
27
 
 
28
  elements.sort(key=lambda x: x[0])
29
 
 
30
  previous_y = None
31
  for y, content in elements:
32
  if previous_y is not None and abs(y - previous_y) > 10:
@@ -43,6 +46,7 @@ def convert(pdf_file):
43
  original_doc = fitz.open(pdf_file)
44
  plain_text = "\n".join([page.get_text() for page in original_doc])
45
 
 
46
  if len(plain_text.strip()) < 100:
47
  ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
48
  ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
@@ -51,35 +55,11 @@ def convert(pdf_file):
51
  doc = original_doc
52
 
53
  markdown = extract_text_markdown(doc)
54
- metadata = {} # Puedes agregar metadata si quieres
55
  return markdown, metadata
56
 
57
- # Gradio Interface
58
- with gr.Blocks(title="PDF → Markdown") as demo:
59
- gr.Markdown("### PDF → Markdown con enlaces de imagen y botón copiar")
60
-
61
- pdf_input = gr.File(label="Sube tu PDF", type="filepath")
62
- markdown_output = gr.Textbox(label="Markdown generado", lines=25, elem_id="markdown-textbox")
63
- metadata_output = gr.JSON(label="Metadata")
64
-
65
- convert_btn = gr.Button("Convertir PDF")
66
-
67
- # Botón copiar usando JS válido y accediendo al DOM real
68
- gr.HTML("""
69
- <button onclick="copyMarkdown()">📋 Copiar Markdown</button>
70
- <script>
71
- function copyMarkdown() {
72
- const textarea = document.querySelector('#markdown-textbox textarea');
73
- if (textarea) {
74
- textarea.select();
75
- document.execCommand('copy');
76
- } else {
77
- alert('No se pudo copiar el texto');
78
- }
79
- }
80
- </script>
81
- """)
82
-
83
- convert_btn.click(fn=convert, inputs=pdf_input, outputs=[markdown_output, metadata_output])
84
-
85
- demo.launch()
 
3
  import fitz # PyMuPDF
4
  import ocrmypdf
5
  import tempfile
6
+ import os
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
+ image_counter = 1 # Contador de imágenes
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
 
18
  if b["type"] == 0: # Texto
19
  for line in b["lines"]:
20
  line_y = line["bbox"][1]
21
+ line_text = " ".join([span["text"] for span in line["spans"]]).strip()
 
22
  if line_text:
23
  elements.append((line_y, line_text))
24
  elif b["type"] == 1: # Imagen
25
+ # Añade un enlace con nombre único
26
  elements.append((y, f"[imagen_{image_counter}]()"))
27
  image_counter += 1
28
 
29
+ # Ordenar por posición vertical
30
  elements.sort(key=lambda x: x[0])
31
 
32
+ # Reconstrucción con saltos lógicos
33
  previous_y = None
34
  for y, content in elements:
35
  if previous_y is not None and abs(y - previous_y) > 10:
 
46
  original_doc = fitz.open(pdf_file)
47
  plain_text = "\n".join([page.get_text() for page in original_doc])
48
 
49
+ # Aplicar OCR solo si el PDF no tiene texto
50
  if len(plain_text.strip()) < 100:
51
  ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
52
  ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
 
55
  doc = original_doc
56
 
57
  markdown = extract_text_markdown(doc)
58
+ metadata = {} # Si necesitas metadatos, se pueden agregar aquí
59
  return markdown, metadata
60
 
61
+ gr.Interface(
62
+ fn=convert,
63
+ inputs=[gr.File(label="Sube tu PDF", type="filepath")],
64
+ outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
65
+ ).launch()