Biifruu committed on
Commit
4337f3a
·
verified ·
1 Parent(s): f3b7c90

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -6
app.py CHANGED
@@ -2,10 +2,38 @@ import spaces
2
  import gradio as gr
3
  import fitz # PyMuPDF
4
  import os
 
 
 
 
 
 
 
 
 
 
5
 
6
  @spaces.GPU
7
  def convert(pdf_file):
 
8
  doc = fitz.open(pdf_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  markdown_output = ""
10
  image_dir = "extracted_images"
11
  os.makedirs(image_dir, exist_ok=True)
@@ -15,7 +43,7 @@ def convert(pdf_file):
15
  blocks = page.get_text("dict")["blocks"]
16
  elements = []
17
 
18
- # Extraemos lista de imágenes con sus xrefs
19
  image_list = page.get_images(full=True)
20
  xref_to_image_path = {}
21
 
@@ -24,7 +52,7 @@ def convert(pdf_file):
24
  pix = fitz.Pixmap(doc, xref)
25
  img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
26
 
27
- if pix.n > 4: # CMYK -> RGB
28
  pix = fitz.Pixmap(fitz.csRGB, pix)
29
  pix.save(img_path)
30
  pix = None
@@ -32,7 +60,7 @@ def convert(pdf_file):
32
  xref_to_image_path[xref] = img_path
33
  image_counter += 1
34
 
35
- # Procesar bloques y ordenar por coordenada vertical
36
  for b in blocks:
37
  if b["type"] == 0: # Texto
38
  for line in b["lines"]:
@@ -43,9 +71,10 @@ def convert(pdf_file):
43
  elif b["type"] == 1: # Imagen
44
  y = b["bbox"][1]
45
  xref = b.get("image", None)
 
46
  if xref and xref in xref_to_image_path:
47
- img_path = xref_to_image_path[xref]
48
- elements.append((y, f"![imagen]({img_path})"))
49
  else:
50
  elements.append((y, "[imagen]()"))
51
 
@@ -54,7 +83,10 @@ def convert(pdf_file):
54
  for _, content in elements:
55
  markdown_output += content + "\n\n"
56
 
57
- return markdown_output.strip(), {}
 
 
 
58
 
59
  gr.Interface(
60
  convert,
 
2
  import gradio as gr
3
  import fitz # PyMuPDF
4
  import os
5
+ import tempfile
6
+ import ocrmypdf
7
+
8
def extract_text_from_pdf(doc):
    """Return the concatenated plain text of every page in *doc*.

    Args:
        doc: An iterable of page objects that each expose ``get_text()``
            (``convert`` passes the document returned by ``fitz.open``).

    Returns:
        The non-empty page texts separated by a blank line, with leading
        and trailing whitespace stripped. Returns ``""`` when no page
        yields any text.
    """
    page_texts = (page.get_text() for page in doc)
    # Skip pages without text so they do not add empty separators, and
    # join once instead of growing a string page by page.
    return "\n\n".join(text for text in page_texts if text).strip()
15
 
16
  @spaces.GPU
17
  def convert(pdf_file):
18
+ # Abrimos el PDF original
19
  doc = fitz.open(pdf_file)
20
+
21
+ # Extraemos texto
22
+ full_text = extract_text_from_pdf(doc)
23
+
24
+ # Si texto es muy corto, aplicamos OCR
25
+ if len(full_text) < 100:
26
+ # Creamos archivo temporal para PDF OCR
27
+ temp_ocr_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
28
+ temp_ocr_pdf.close()
29
+
30
+ # Aplicar OCR (forzamos OCR en todas las páginas)
31
+ ocrmypdf.ocr(pdf_file, temp_ocr_pdf.name, force_ocr=True)
32
+
33
+ # Abrimos PDF OCR
34
+ doc = fitz.open(temp_ocr_pdf.name)
35
+ full_text = extract_text_from_pdf(doc)
36
+
37
  markdown_output = ""
38
  image_dir = "extracted_images"
39
  os.makedirs(image_dir, exist_ok=True)
 
43
  blocks = page.get_text("dict")["blocks"]
44
  elements = []
45
 
46
+ # Extraemos todas las imágenes con sus xrefs
47
  image_list = page.get_images(full=True)
48
  xref_to_image_path = {}
49
 
 
52
  pix = fitz.Pixmap(doc, xref)
53
  img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
54
 
55
+ if pix.n > 4:
56
  pix = fitz.Pixmap(fitz.csRGB, pix)
57
  pix.save(img_path)
58
  pix = None
 
60
  xref_to_image_path[xref] = img_path
61
  image_counter += 1
62
 
63
+ # Procesamos bloques en orden vertical (y)
64
  for b in blocks:
65
  if b["type"] == 0: # Texto
66
  for line in b["lines"]:
 
71
  elif b["type"] == 1: # Imagen
72
  y = b["bbox"][1]
73
  xref = b.get("image", None)
74
+ # Insertamos link vacío en markdown para la imagen
75
  if xref and xref in xref_to_image_path:
76
+ # Aquí ponemos link vacío (sin destino) como pide
77
+ elements.append((y, f"![imagen]()"))
78
  else:
79
  elements.append((y, "[imagen]()"))
80
 
 
83
  for _, content in elements:
84
  markdown_output += content + "\n\n"
85
 
86
+ # Metadata vacío o puedes agregar si quieres
87
+ metadata = {}
88
+
89
+ return markdown_output.strip(), metadata
90
 
91
  gr.Interface(
92
  convert,