Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,10 +2,38 @@ import spaces
|
|
2 |
import gradio as gr
|
3 |
import fitz # PyMuPDF
|
4 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
@spaces.GPU
|
7 |
def convert(pdf_file):
|
|
|
8 |
doc = fitz.open(pdf_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
markdown_output = ""
|
10 |
image_dir = "extracted_images"
|
11 |
os.makedirs(image_dir, exist_ok=True)
|
@@ -15,7 +43,7 @@ def convert(pdf_file):
|
|
15 |
blocks = page.get_text("dict")["blocks"]
|
16 |
elements = []
|
17 |
|
18 |
-
# Extraemos
|
19 |
image_list = page.get_images(full=True)
|
20 |
xref_to_image_path = {}
|
21 |
|
@@ -24,7 +52,7 @@ def convert(pdf_file):
|
|
24 |
pix = fitz.Pixmap(doc, xref)
|
25 |
img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
|
26 |
|
27 |
-
if pix.n > 4:
|
28 |
pix = fitz.Pixmap(fitz.csRGB, pix)
|
29 |
pix.save(img_path)
|
30 |
pix = None
|
@@ -32,7 +60,7 @@ def convert(pdf_file):
|
|
32 |
xref_to_image_path[xref] = img_path
|
33 |
image_counter += 1
|
34 |
|
35 |
-
#
|
36 |
for b in blocks:
|
37 |
if b["type"] == 0: # Texto
|
38 |
for line in b["lines"]:
|
@@ -43,9 +71,10 @@ def convert(pdf_file):
|
|
43 |
elif b["type"] == 1: # Imagen
|
44 |
y = b["bbox"][1]
|
45 |
xref = b.get("image", None)
|
|
|
46 |
if xref and xref in xref_to_image_path:
|
47 |
-
|
48 |
-
elements.append((y, f""))
|
51 |
|
@@ -54,7 +83,10 @@ def convert(pdf_file):
|
|
54 |
for _, content in elements:
|
55 |
markdown_output += content + "\n\n"
|
56 |
|
57 |
-
|
|
|
|
|
|
|
58 |
|
59 |
gr.Interface(
|
60 |
convert,
|
|
|
2 |
import gradio as gr
|
3 |
import fitz # PyMuPDF
|
4 |
import os
|
5 |
+
import tempfile
|
6 |
+
import ocrmypdf
|
7 |
+
|
8 |
+
def extract_text_from_pdf(doc) -> str:
    """Return the text of every page in *doc*, joined by blank lines.

    Args:
        doc: An iterable of page objects, each exposing a ``get_text()``
            method (the caller passes the result of ``fitz.open(...)``).

    Returns:
        The non-empty page texts separated by a blank line, with leading
        and trailing whitespace removed. An empty string is returned when
        no page yields any text.
    """
    # Collect the per-page texts first and join once, instead of growing a
    # string with ``+=`` on every page.
    page_texts = []
    for page in doc:
        text = page.get_text()
        # Pages without extractable text give a falsy value; skip them so
        # they do not add extra separators.
        if text:
            page_texts.append(text)
    return "\n\n".join(page_texts).strip()
|
15 |
|
16 |
@spaces.GPU
|
17 |
def convert(pdf_file):
|
18 |
+
# Abrimos el PDF original
|
19 |
doc = fitz.open(pdf_file)
|
20 |
+
|
21 |
+
# Extraemos texto
|
22 |
+
full_text = extract_text_from_pdf(doc)
|
23 |
+
|
24 |
+
# Si texto es muy corto, aplicamos OCR
|
25 |
+
if len(full_text) < 100:
|
26 |
+
# Creamos archivo temporal para PDF OCR
|
27 |
+
temp_ocr_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
|
28 |
+
temp_ocr_pdf.close()
|
29 |
+
|
30 |
+
# Aplicar OCR (forzamos OCR en todas las páginas)
|
31 |
+
ocrmypdf.ocr(pdf_file, temp_ocr_pdf.name, force_ocr=True)
|
32 |
+
|
33 |
+
# Abrimos PDF OCR
|
34 |
+
doc = fitz.open(temp_ocr_pdf.name)
|
35 |
+
full_text = extract_text_from_pdf(doc)
|
36 |
+
|
37 |
markdown_output = ""
|
38 |
image_dir = "extracted_images"
|
39 |
os.makedirs(image_dir, exist_ok=True)
|
|
|
43 |
blocks = page.get_text("dict")["blocks"]
|
44 |
elements = []
|
45 |
|
46 |
+
# Extraemos todas las imágenes con sus xrefs
|
47 |
image_list = page.get_images(full=True)
|
48 |
xref_to_image_path = {}
|
49 |
|
|
|
52 |
pix = fitz.Pixmap(doc, xref)
|
53 |
img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
|
54 |
|
55 |
+
if pix.n > 4:
|
56 |
pix = fitz.Pixmap(fitz.csRGB, pix)
|
57 |
pix.save(img_path)
|
58 |
pix = None
|
|
|
60 |
xref_to_image_path[xref] = img_path
|
61 |
image_counter += 1
|
62 |
|
63 |
+
# Procesamos bloques en orden vertical (y)
|
64 |
for b in blocks:
|
65 |
if b["type"] == 0: # Texto
|
66 |
for line in b["lines"]:
|
|
|
71 |
elif b["type"] == 1: # Imagen
|
72 |
y = b["bbox"][1]
|
73 |
xref = b.get("image", None)
|
74 |
+
# Insertamos link vacío en markdown para la imagen
|
75 |
if xref and xref in xref_to_image_path:
|
76 |
+
# Aquí ponemos link vacío (sin destino) como pide
|
77 |
+
elements.append((y, f"![imagen]()"))
|
78 |
else:
|
79 |
elements.append((y, "[imagen]()"))
|
80 |
|
|
|
83 |
for _, content in elements:
|
84 |
markdown_output += content + "\n\n"
|
85 |
|
86 |
+
# Metadata vacío o puedes agregar si quieres
|
87 |
+
metadata = {}
|
88 |
+
|
89 |
+
return markdown_output.strip(), metadata
|
90 |
|
91 |
gr.Interface(
|
92 |
convert,
|