Biifruu committed on
Commit
75d0452
·
verified ·
1 Parent(s): 158a59d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -14
app.py CHANGED
@@ -1,11 +1,21 @@
1
- from PIL import Image
2
- import pytesseract
3
  import spaces
4
  import gradio as gr
5
  import fitz # PyMuPDF
6
- import ocrmypdf
7
  import tempfile
8
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def extract_text_markdown(doc):
11
  markdown_output = ""
@@ -21,30 +31,36 @@ def extract_text_markdown(doc):
21
  for line in b["lines"]:
22
  line_y = line["bbox"][1]
23
  line_text = " ".join([span["text"] for span in line["spans"]]).strip()
 
24
  if line_text:
25
- elements.append((line_y, line_text))
26
  elif b["type"] == 1: # Imagen
27
- elements.append((y, f"[imagen_{image_counter}]()"))
28
  image_counter += 1
29
 
30
  elements.sort(key=lambda x: x[0])
31
 
32
  previous_y = None
33
- for y, content in elements:
 
 
 
 
34
  if previous_y is not None and abs(y - previous_y) > 10:
35
  markdown_output += "\n"
36
- markdown_output += content + "\n"
 
 
 
 
 
37
  previous_y = y
 
38
 
39
  markdown_output += "\n---\n\n"
40
 
41
  return markdown_output.strip()
42
 
43
- def needs_ocr(doc):
44
- text_length = sum(len(page.get_text().strip()) for page in doc)
45
- image_count = sum(len(page.get_images(full=True)) for page in doc)
46
- return text_length < 500 or image_count > 0
47
-
48
  @spaces.GPU
49
  def convert(pdf_file):
50
  doc = fitz.open(pdf_file)
@@ -59,11 +75,11 @@ def convert(pdf_file):
59
  # Página con texto normal
60
  markdown_output += extract_text_markdown([page]) + "\n"
61
  else:
62
- # Página sin texto: usar OCR por imagen
63
  pix = page.get_pixmap(dpi=300)
64
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
65
  ocr_text = pytesseract.image_to_string(img, lang="spa")
66
- markdown_output += ocr_text.strip() + "\n"
67
 
68
  markdown_output += "\n---\n\n"
69
 
 
 
 
1
  import spaces
2
  import gradio as gr
3
  import fitz # PyMuPDF
 
4
  import tempfile
5
  import os
6
+ from PIL import Image
7
+ import pytesseract
8
+
9
def clean_ocr_text(text):
    """Return OCR text with every line trimmed and blank lines removed.

    Args:
        text: Raw multi-line string (for example, the output of an OCR pass).

    Returns:
        A string made of the non-empty lines of ``text``, each stripped of
        leading and trailing whitespace, joined with single newline
        characters. Returns an empty string when no line has content.
    """
    # str.strip() removes all surrounding whitespace, so a stripped line that
    # is still truthy can never be whitespace-only; no isspace() check needed.
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
19
 
20
  def extract_text_markdown(doc):
21
  markdown_output = ""
 
31
  for line in b["lines"]:
32
  line_y = line["bbox"][1]
33
  line_text = " ".join([span["text"] for span in line["spans"]]).strip()
34
+ max_font_size = max([span.get("size", 10) for span in line["spans"]])
35
  if line_text:
36
+ elements.append((line_y, line_text, max_font_size))
37
  elif b["type"] == 1: # Imagen
38
+ elements.append((y, f"![imagen_{image_counter}](#)", 10))
39
  image_counter += 1
40
 
41
  elements.sort(key=lambda x: x[0])
42
 
43
  previous_y = None
44
+ previous_font = None
45
+
46
+ for y, text, font_size in elements:
47
+ is_header = font_size >= 14
48
+
49
  if previous_y is not None and abs(y - previous_y) > 10:
50
  markdown_output += "\n"
51
+
52
+ if is_header:
53
+ markdown_output += f"\n### {text.strip()}\n"
54
+ else:
55
+ markdown_output += text.strip() + "\n"
56
+
57
  previous_y = y
58
+ previous_font = font_size
59
 
60
  markdown_output += "\n---\n\n"
61
 
62
  return markdown_output.strip()
63
 
 
 
 
 
 
64
  @spaces.GPU
65
  def convert(pdf_file):
66
  doc = fitz.open(pdf_file)
 
75
  # Página con texto normal
76
  markdown_output += extract_text_markdown([page]) + "\n"
77
  else:
78
+ # Página sin texto: usar OCR
79
  pix = page.get_pixmap(dpi=300)
80
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
81
  ocr_text = pytesseract.image_to_string(img, lang="spa")
82
+ markdown_output += clean_ocr_text(ocr_text) + "\n"
83
 
84
  markdown_output += "\n---\n\n"
85