Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,20 +1,16 @@
|
|
1 |
import spaces
|
2 |
import gradio as gr
|
3 |
import fitz # PyMuPDF
|
4 |
-
import tempfile
|
5 |
-
import os
|
6 |
from PIL import Image
|
7 |
import pytesseract
|
8 |
|
9 |
def clean_ocr_text(text):
|
10 |
lines = text.splitlines()
|
11 |
cleaned_lines = []
|
12 |
-
|
13 |
for line in lines:
|
14 |
line = line.strip()
|
15 |
if line and not line.isspace():
|
16 |
cleaned_lines.append(line)
|
17 |
-
|
18 |
return "\n".join(cleaned_lines)
|
19 |
|
20 |
def extract_text_markdown(doc):
|
@@ -68,10 +64,10 @@ def extract_text_markdown(doc):
|
|
68 |
|
69 |
@spaces.GPU
|
70 |
def convert(pdf_file):
|
71 |
-
image_paths.append(image_path)
|
72 |
doc = fitz.open(pdf_file)
|
73 |
markdown_output = ""
|
74 |
image_counter = 1
|
|
|
75 |
|
76 |
for page_num in range(len(doc)):
|
77 |
page = doc[page_num]
|
@@ -81,13 +77,15 @@ def convert(pdf_file):
|
|
81 |
# Página con texto normal
|
82 |
markdown_output += extract_text_markdown([page]) + "\n"
|
83 |
else:
|
84 |
-
# Página vacía o imagen: hacer OCR
|
85 |
pix = page.get_pixmap(dpi=300)
|
86 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
87 |
|
88 |
-
# Guardar imagen
|
89 |
image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
|
90 |
img.save(image_path)
|
|
|
|
|
91 |
markdown_output += f"\n"
|
92 |
|
93 |
try:
|
@@ -112,4 +110,3 @@ gr.Interface(
|
|
112 |
gr.Gallery(label="Imágenes extraídas").style(grid=[2], height="auto")
|
113 |
],
|
114 |
).launch()
|
115 |
-
|
|
|
1 |
import spaces
|
2 |
import gradio as gr
|
3 |
import fitz # PyMuPDF
|
|
|
|
|
4 |
from PIL import Image
|
5 |
import pytesseract
|
6 |
|
7 |
def clean_ocr_text(text):
|
8 |
lines = text.splitlines()
|
9 |
cleaned_lines = []
|
|
|
10 |
for line in lines:
|
11 |
line = line.strip()
|
12 |
if line and not line.isspace():
|
13 |
cleaned_lines.append(line)
|
|
|
14 |
return "\n".join(cleaned_lines)
|
15 |
|
16 |
def extract_text_markdown(doc):
|
|
|
64 |
|
65 |
@spaces.GPU
|
66 |
def convert(pdf_file):
|
|
|
67 |
doc = fitz.open(pdf_file)
|
68 |
markdown_output = ""
|
69 |
image_counter = 1
|
70 |
+
image_paths = []
|
71 |
|
72 |
for page_num in range(len(doc)):
|
73 |
page = doc[page_num]
|
|
|
77 |
# Página con texto normal
|
78 |
markdown_output += extract_text_markdown([page]) + "\n"
|
79 |
else:
|
80 |
+
# Página vacía o con imagen: hacer OCR
|
81 |
pix = page.get_pixmap(dpi=300)
|
82 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
83 |
|
84 |
+
# Guardar imagen completa
|
85 |
image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
|
86 |
img.save(image_path)
|
87 |
+
image_paths.append(image_path)
|
88 |
+
|
89 |
markdown_output += f"\n"
|
90 |
|
91 |
try:
|
|
|
110 |
gr.Gallery(label="Imágenes extraídas").style(grid=[2], height="auto")
|
111 |
],
|
112 |
).launch()
|
|