Biifruu committed on
Commit
02bbba1
·
verified ·
1 Parent(s): c283ac8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -5
app.py CHANGED
@@ -6,6 +6,7 @@ import pytesseract
6
  import os
7
  import numpy as np
8
  import cv2
 
9
 
10
  def clean_ocr_text(text):
11
  lines = text.splitlines()
@@ -67,6 +68,16 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
67
 
68
  @spaces.GPU
69
  def convert(pdf_bytes):
 
 
 
 
 
 
 
 
 
 
70
  temp_pdf_path = "/tmp/uploaded_file.pdf"
71
  with open(temp_pdf_path, "wb") as f:
72
  f.write(pdf_bytes)
@@ -86,18 +97,15 @@ def convert(pdf_bytes):
86
  markdown_output += f"\n## Página {page_num + 1}\n\n"
87
  pix = page.get_pixmap(dpi=300)
88
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
89
-
90
  image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
91
  img.save(image_path)
92
  image_paths.append(image_path)
93
-
94
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
95
 
96
  try:
97
  ocr_text = pytesseract.image_to_string(img)
98
  except pytesseract.TesseractError:
99
  ocr_text = ""
100
-
101
  ocr_text = clean_ocr_text(ocr_text)
102
  if ocr_text.strip():
103
  markdown_output += ocr_text + "\n"
@@ -107,9 +115,7 @@ def convert(pdf_bytes):
107
  gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
108
  _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
109
  contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
110
-
111
  contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
112
-
113
  for i, cnt in enumerate(contours):
114
  x, y, w, h = cv2.boundingRect(cnt)
115
  area = w * h
 
6
  import os
7
  import numpy as np
8
  import cv2
9
+ import base64
10
 
11
  def clean_ocr_text(text):
12
  lines = text.splitlines()
 
68
 
69
  @spaces.GPU
70
  def convert(pdf_bytes):
71
+ # Si no se recibe ningún PDF, usamos uno por defecto.
72
+ if not pdf_bytes:
73
+ default_pdf_base64 = (
74
+ "JVBERi0xLjQKMSAwIG9iago8PC9UeXBlIC9DYXRhbG9nIC9QYWdlcyAyIDAgUgovT3V0cHV0cyA8PC9Qcm9jU2V0Wy9QREZdPj4+"
75
+ "CmVuZG9iagoKMiAwIG9iago8PC9UeXBlIC9QYWdlcyAvS2lkcyBbMyAwIFJdIC9Db3VudCAxPj4KZW5kb2JqCgozIDAgb2JqCjw8"
76
+ "L1R5cGUgL1BhZ2UgL1BhZ2VzIDIgMCBSIC9NZWRpYUJveCBbMCAwIDMwMCAxNDNdIC9Db250ZW50cyA0IDAgUiA+PgplbmRvYmoK"
77
+ "CgoKZW5kb2JqCnhyZWYKMC45NgolJUVPRgo="
78
+ )
79
+ pdf_bytes = base64.b64decode(default_pdf_base64)
80
+
81
  temp_pdf_path = "/tmp/uploaded_file.pdf"
82
  with open(temp_pdf_path, "wb") as f:
83
  f.write(pdf_bytes)
 
97
  markdown_output += f"\n## Página {page_num + 1}\n\n"
98
  pix = page.get_pixmap(dpi=300)
99
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
 
100
  image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
101
  img.save(image_path)
102
  image_paths.append(image_path)
 
103
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
104
 
105
  try:
106
  ocr_text = pytesseract.image_to_string(img)
107
  except pytesseract.TesseractError:
108
  ocr_text = ""
 
109
  ocr_text = clean_ocr_text(ocr_text)
110
  if ocr_text.strip():
111
  markdown_output += ocr_text + "\n"
 
115
  gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
116
  _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
117
  contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 
118
  contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
 
119
  for i, cnt in enumerate(contours):
120
  x, y, w, h = cv2.boundingRect(cnt)
121
  area = w * h