Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ import os
|
|
7 |
|
8 |
def extract_text_markdown(doc):
|
9 |
markdown_output = ""
|
10 |
-
image_counter = 1
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
@@ -22,14 +22,11 @@ def extract_text_markdown(doc):
|
|
22 |
if line_text:
|
23 |
elements.append((line_y, line_text))
|
24 |
elif b["type"] == 1: # Imagen
|
25 |
-
# Añade un enlace con nombre único
|
26 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
27 |
image_counter += 1
|
28 |
|
29 |
-
# Ordenar por posición vertical
|
30 |
elements.sort(key=lambda x: x[0])
|
31 |
|
32 |
-
# Reconstrucción con saltos lógicos
|
33 |
previous_y = None
|
34 |
for y, content in elements:
|
35 |
if previous_y is not None and abs(y - previous_y) > 10:
|
@@ -41,21 +38,28 @@ def extract_text_markdown(doc):
|
|
41 |
|
42 |
return markdown_output.strip()
|
43 |
|
|
|
|
|
|
|
|
|
|
|
44 |
@spaces.GPU
|
45 |
def convert(pdf_file):
|
46 |
original_doc = fitz.open(pdf_file)
|
47 |
-
plain_text = "\n".join([page.get_text() for page in original_doc])
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
54 |
else:
|
55 |
doc = original_doc
|
56 |
|
57 |
markdown = extract_text_markdown(doc)
|
58 |
-
metadata = {} #
|
59 |
return markdown, metadata
|
60 |
|
61 |
gr.Interface(
|
|
|
7 |
|
8 |
def extract_text_markdown(doc):
|
9 |
markdown_output = ""
|
10 |
+
image_counter = 1
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
|
|
22 |
if line_text:
|
23 |
elements.append((line_y, line_text))
|
24 |
elif b["type"] == 1: # Imagen
|
|
|
25 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
26 |
image_counter += 1
|
27 |
|
|
|
28 |
elements.sort(key=lambda x: x[0])
|
29 |
|
|
|
30 |
previous_y = None
|
31 |
for y, content in elements:
|
32 |
if previous_y is not None and abs(y - previous_y) > 10:
|
|
|
38 |
|
39 |
return markdown_output.strip()
|
40 |
|
41 |
+
def needs_ocr(doc):
|
42 |
+
text_length = sum(len(page.get_text().strip()) for page in doc)
|
43 |
+
image_count = sum(len(page.get_images(full=True)) for page in doc)
|
44 |
+
return text_length < 500 or image_count > 0
|
45 |
+
|
46 |
@spaces.GPU
|
47 |
def convert(pdf_file):
|
48 |
original_doc = fitz.open(pdf_file)
|
|
|
49 |
|
50 |
+
if needs_ocr(original_doc):
|
51 |
+
try:
|
52 |
+
ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
|
53 |
+
ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
|
54 |
+
doc = fitz.open(ocr_temp_path)
|
55 |
+
os.remove(ocr_temp_path)
|
56 |
+
except Exception as e:
|
57 |
+
return f"Error al aplicar OCR: {e}", {}
|
58 |
else:
|
59 |
doc = original_doc
|
60 |
|
61 |
markdown = extract_text_markdown(doc)
|
62 |
+
metadata = {} # Puedes agregar metadatos aquí si lo necesitas
|
63 |
return markdown, metadata
|
64 |
|
65 |
gr.Interface(
|