Biifruu committed on
Commit
c20f519
verified
1 Parent(s): 479e852

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -11
app.py CHANGED
@@ -7,7 +7,7 @@ import os
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
- image_counter = 1 # Contador de imágenes
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
@@ -22,14 +22,11 @@ def extract_text_markdown(doc):
22
  if line_text:
23
  elements.append((line_y, line_text))
24
  elif b["type"] == 1: # Imagen
25
- # Añade un enlace con nombre único
26
  elements.append((y, f"[imagen_{image_counter}]()"))
27
  image_counter += 1
28
 
29
- # Ordenar por posición vertical
30
  elements.sort(key=lambda x: x[0])
31
 
32
- # Reconstrucción con saltos lógicos
33
  previous_y = None
34
  for y, content in elements:
35
  if previous_y is not None and abs(y - previous_y) > 10:
@@ -41,21 +38,28 @@ def extract_text_markdown(doc):
41
 
42
  return markdown_output.strip()
43
 
 
 
 
 
 
44
  @spaces.GPU
45
  def convert(pdf_file):
46
  original_doc = fitz.open(pdf_file)
47
- plain_text = "\n".join([page.get_text() for page in original_doc])
48
 
49
- # Aplicar OCR solo si el PDF no tiene texto
50
- if len(plain_text.strip()) < 100:
51
- ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
52
- ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
53
- doc = fitz.open(ocr_temp_path)
 
 
 
54
  else:
55
  doc = original_doc
56
 
57
  markdown = extract_text_markdown(doc)
58
- metadata = {} # Si necesitas metadatos, se pueden agregar aquí
59
  return markdown, metadata
60
 
61
  gr.Interface(
 
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
+ image_counter = 1
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
 
22
  if line_text:
23
  elements.append((line_y, line_text))
24
  elif b["type"] == 1: # Imagen
 
25
  elements.append((y, f"[imagen_{image_counter}]()"))
26
  image_counter += 1
27
 
 
28
  elements.sort(key=lambda x: x[0])
29
 
 
30
  previous_y = None
31
  for y, content in elements:
32
  if previous_y is not None and abs(y - previous_y) > 10:
 
38
 
39
  return markdown_output.strip()
40
 
41
+ def needs_ocr(doc):
42
+ text_length = sum(len(page.get_text().strip()) for page in doc)
43
+ image_count = sum(len(page.get_images(full=True)) for page in doc)
44
+ return text_length < 500 or image_count > 0
45
+
46
  @spaces.GPU
47
  def convert(pdf_file):
48
  original_doc = fitz.open(pdf_file)
 
49
 
50
+ if needs_ocr(original_doc):
51
+ try:
52
+ ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
53
+ ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
54
+ doc = fitz.open(ocr_temp_path)
55
+ os.remove(ocr_temp_path)
56
+ except Exception as e:
57
+ return f"Error al aplicar OCR: {e}", {}
58
  else:
59
  doc = original_doc
60
 
61
  markdown = extract_text_markdown(doc)
62
+ metadata = {} # Puedes agregar metadatos aquí si lo necesitas
63
  return markdown, metadata
64
 
65
  gr.Interface(