Biifruu committed on
Commit
d4b4544
verified
1 Parent(s): 204916f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -15
app.py CHANGED
@@ -1,27 +1,32 @@
1
  import spaces
2
  import gradio as gr
3
- from pdf2image import convert_from_path
4
- import pytesseract
5
- from PIL import Image
6
- import os
7
 
8
  @spaces.GPU
9
  def convert(pdf_file):
10
- pages = convert_from_path(pdf_file)
11
  markdown_output = ""
12
- metadata = {} # Opcional: puedes extraer metadata con PyMuPDF si lo deseas
13
 
14
- for idx, page_image in enumerate(pages):
15
- # Realizar OCR
16
- text = pytesseract.image_to_string(page_image)
17
 
18
- if text.strip() == "":
19
- # Si no hay texto, insertar un enlace vacío
20
- markdown_output += f"[imagen]()\n\n"
21
- else:
22
- markdown_output += text.strip() + "\n\n"
 
 
 
23
 
24
- return markdown_output.strip(), metadata
 
 
 
 
 
 
25
 
26
  gr.Interface(
27
  convert,
 
1
  import spaces
2
  import gradio as gr
3
+ import fitz # PyMuPDF
 
 
 
4
 
5
  @spaces.GPU
6
  def convert(pdf_file):
7
+ doc = fitz.open(pdf_file)
8
  markdown_output = ""
 
9
 
10
+ for page in doc:
11
+ blocks = page.get_text("dict")["blocks"]
12
+ elements = []
13
 
14
+ for b in blocks:
15
+ if b["type"] == 0: # texto
16
+ for line in b["lines"]:
17
+ for span in line["spans"]:
18
+ elements.append((span["bbox"][1], span["text"])) # y, texto
19
+ elif b["type"] == 1: # imagen
20
+ y_pos = b["bbox"][1]
21
+ elements.append((y_pos, "[imagen]()"))
22
 
23
+ # Ordenar por posici贸n vertical
24
+ elements.sort(key=lambda x: x[0])
25
+
26
+ for _, content in elements:
27
+ markdown_output += content.strip() + "\n\n"
28
+
29
+ return markdown_output.strip(), {}
30
 
31
  gr.Interface(
32
  convert,