Biifruu committed on
Commit
e62d9f5
·
verified ·
1 Parent(s): 4337f3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -51
app.py CHANGED
@@ -5,47 +5,19 @@ import os
5
  import tempfile
6
  import ocrmypdf
7
 
8
- def extract_text_from_pdf(doc):
9
- full_text = ""
10
- for page in doc:
11
- text = page.get_text()
12
- if text:
13
- full_text += text + "\n\n"
14
- return full_text.strip()
15
-
16
- @spaces.GPU
17
- def convert(pdf_file):
18
- # Abrimos el PDF original
19
- doc = fitz.open(pdf_file)
20
-
21
- # Extraemos texto
22
- full_text = extract_text_from_pdf(doc)
23
-
24
- # Si texto es muy corto, aplicamos OCR
25
- if len(full_text) < 100:
26
- # Creamos archivo temporal para PDF OCR
27
- temp_ocr_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
28
- temp_ocr_pdf.close()
29
-
30
- # Aplicar OCR (forzamos OCR en todas las páginas)
31
- ocrmypdf.ocr(pdf_file, temp_ocr_pdf.name, force_ocr=True)
32
-
33
- # Abrimos PDF OCR
34
- doc = fitz.open(temp_ocr_pdf.name)
35
- full_text = extract_text_from_pdf(doc)
36
-
37
  markdown_output = ""
38
  image_dir = "extracted_images"
39
  os.makedirs(image_dir, exist_ok=True)
40
  image_counter = 0
41
 
42
- for page_number, page in enumerate(doc):
43
  blocks = page.get_text("dict")["blocks"]
44
  elements = []
45
 
46
- # Extraemos todas las imágenes con sus xrefs
47
  image_list = page.get_images(full=True)
48
- xref_to_image_path = {}
49
 
50
  for img in image_list:
51
  xref = img[0]
@@ -57,39 +29,49 @@ def convert(pdf_file):
57
  pix.save(img_path)
58
  pix = None
59
 
60
- xref_to_image_path[xref] = img_path
61
  image_counter += 1
62
 
63
- # Procesamos bloques en orden vertical (y)
64
  for b in blocks:
 
65
  if b["type"] == 0: # Texto
 
66
  for line in b["lines"]:
67
- for span in line["spans"]:
68
- y = span["bbox"][1]
69
- text = span["text"]
70
- elements.append((y, text.strip()))
 
71
  elif b["type"] == 1: # Imagen
72
- y = b["bbox"][1]
73
- xref = b.get("image", None)
74
- # Insertamos link vacío en markdown para la imagen
75
- if xref and xref in xref_to_image_path:
76
- # Aquí ponemos link vacío (sin destino) como pide
77
- elements.append((y, f"![imagen]()"))
78
- else:
79
- elements.append((y, "[imagen]()"))
80
 
81
  elements.sort(key=lambda x: x[0])
82
 
83
  for _, content in elements:
84
  markdown_output += content + "\n\n"
85
 
86
- # Metadata vacío o puedes agregar si quieres
87
- metadata = {}
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- return markdown_output.strip(), metadata
 
 
90
 
91
  gr.Interface(
92
- convert,
93
  inputs=[gr.File(label="Upload PDF", type="filepath")],
94
- outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
95
  ).launch()
 
5
  import tempfile
6
  import ocrmypdf
7
 
8
+ def extract_text_markdown(doc):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  markdown_output = ""
10
  image_dir = "extracted_images"
11
  os.makedirs(image_dir, exist_ok=True)
12
  image_counter = 0
13
 
14
+ for page in doc:
15
  blocks = page.get_text("dict")["blocks"]
16
  elements = []
17
 
18
+ # Extraer imágenes y guardar para asignar link
19
  image_list = page.get_images(full=True)
20
+ xref_to_placeholder = {}
21
 
22
  for img in image_list:
23
  xref = img[0]
 
29
  pix.save(img_path)
30
  pix = None
31
 
32
+ xref_to_placeholder[xref] = f"![imagen]()"
33
  image_counter += 1
34
 
 
35
  for b in blocks:
36
+ y = b["bbox"][1]
37
  if b["type"] == 0: # Texto
38
+ paragraph = ""
39
  for line in b["lines"]:
40
+ line_text = " ".join([span["text"].strip() for span in line["spans"]])
41
+ paragraph += line_text + " "
42
+ paragraph = paragraph.strip()
43
+ if paragraph:
44
+ elements.append((y, paragraph))
45
  elif b["type"] == 1: # Imagen
46
+ xref = b.get("image")
47
+ elements.append((y, "![imagen]()"))
 
 
 
 
 
 
48
 
49
  elements.sort(key=lambda x: x[0])
50
 
51
  for _, content in elements:
52
  markdown_output += content + "\n\n"
53
 
54
+ return markdown_output.strip()
55
+
56
+ @spaces.GPU
57
+ def convert(pdf_file):
58
+ original_doc = fitz.open(pdf_file)
59
+ plain_text = "\n".join([page.get_text() for page in original_doc])
60
+
61
+ # Si es imagen escaneada sin texto, aplicamos OCR
62
+ if len(plain_text.strip()) < 100:
63
+ ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
64
+ ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
65
+ doc = fitz.open(ocr_temp_path)
66
+ else:
67
+ doc = original_doc
68
 
69
+ markdown = extract_text_markdown(doc)
70
+ metadata = {} # Puedes agregar metadatos si quieres
71
+ return markdown, metadata
72
 
73
  gr.Interface(
74
+ fn=convert,
75
  inputs=[gr.File(label="Upload PDF", type="filepath")],
76
+ outputs=[gr.Text(label="Markdown crudo"), gr.JSON(label="Metadata")],
77
  ).launch()