Biifruu committed on
Commit
8363732
·
verified ·
1 Parent(s): f2073aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -15
app.py CHANGED
@@ -9,11 +9,7 @@ import cv2
9
 
10
  def clean_ocr_text(text):
11
  lines = text.splitlines()
12
- cleaned_lines = []
13
- for line in lines:
14
- line = line.strip()
15
- if line and not line.isspace():
16
- cleaned_lines.append(line)
17
  return "\n".join(cleaned_lines)
18
 
19
  def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
@@ -70,14 +66,12 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
70
  return markdown_output.strip()
71
 
72
  @spaces.GPU
73
- def convert(inputs):
74
- # <- inputs es una lista, aunque solo tenga 1 archivo
75
- pdf_file = inputs[0]
76
- print(f"Entradas recibidas: {inputs}")
77
- if not inputs or not os.path.exists(inputs[0]):
78
- return "Archivo no encontrado o inválido.", [], ""
79
-
80
- doc = fitz.open(pdf_file)
81
  markdown_output = ""
82
  image_paths = []
83
  seen_xrefs = set()
@@ -136,10 +130,9 @@ def convert(inputs):
136
 
137
  return markdown_output.strip(), image_paths, markdown_path
138
 
139
- # Interfaz Gradio
140
  with gr.Blocks() as demo:
141
  with gr.Row():
142
- pdf_input = gr.File(label="Sube tu PDF", type="filepath")
143
  submit_btn = gr.Button("Procesar PDF")
144
 
145
  markdown_output = gr.Textbox(label="Markdown estructurado", lines=25, interactive=True)
 
9
 
10
  def clean_ocr_text(text):
11
  lines = text.splitlines()
12
+ cleaned_lines = [line.strip() for line in lines if line.strip()]
 
 
 
 
13
  return "\n".join(cleaned_lines)
14
 
15
  def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
 
66
  return markdown_output.strip()
67
 
68
  @spaces.GPU
69
+ def convert(pdf_bytes):
70
+ temp_pdf_path = "/tmp/uploaded_file.pdf"
71
+ with open(temp_pdf_path, "wb") as f:
72
+ f.write(pdf_bytes)
73
+
74
+ doc = fitz.open(temp_pdf_path)
 
 
75
  markdown_output = ""
76
  image_paths = []
77
  seen_xrefs = set()
 
130
 
131
  return markdown_output.strip(), image_paths, markdown_path
132
 
 
133
  with gr.Blocks() as demo:
134
  with gr.Row():
135
+ pdf_input = gr.File(label="Sube tu PDF", type="binary") # <-- Aquí el cambio importante
136
  submit_btn = gr.Button("Procesar PDF")
137
 
138
  markdown_output = gr.Textbox(label="Markdown estructurado", lines=25, interactive=True)