Biifruu committed on
Commit
9e2e286
·
verified ·
1 Parent(s): 565985f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -8
app.py CHANGED
@@ -1,20 +1,16 @@
1
  import spaces
2
  import gradio as gr
3
  import fitz # PyMuPDF
4
- import tempfile
5
- import os
6
  from PIL import Image
7
  import pytesseract
8
 
9
  def clean_ocr_text(text):
10
  lines = text.splitlines()
11
  cleaned_lines = []
12
-
13
  for line in lines:
14
  line = line.strip()
15
  if line and not line.isspace():
16
  cleaned_lines.append(line)
17
-
18
  return "\n".join(cleaned_lines)
19
 
20
  def extract_text_markdown(doc):
@@ -68,10 +64,10 @@ def extract_text_markdown(doc):
68
 
69
  @spaces.GPU
70
  def convert(pdf_file):
71
- image_paths.append(image_path)
72
  doc = fitz.open(pdf_file)
73
  markdown_output = ""
74
  image_counter = 1
 
75
 
76
  for page_num in range(len(doc)):
77
  page = doc[page_num]
@@ -81,13 +77,15 @@ def convert(pdf_file):
81
  # Página con texto normal
82
  markdown_output += extract_text_markdown([page]) + "\n"
83
  else:
84
- # Página vacía o imagen: hacer OCR
85
  pix = page.get_pixmap(dpi=300)
86
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
87
 
88
- # Guardar imagen escaneada completa
89
  image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
90
  img.save(image_path)
 
 
91
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
92
 
93
  try:
@@ -112,4 +110,3 @@ gr.Interface(
112
  gr.Gallery(label="Imágenes extraídas").style(grid=[2], height="auto")
113
  ],
114
  ).launch()
115
-
 
1
  import spaces
2
  import gradio as gr
3
  import fitz # PyMuPDF
 
 
4
  from PIL import Image
5
  import pytesseract
6
 
7
  def clean_ocr_text(text):
8
  lines = text.splitlines()
9
  cleaned_lines = []
 
10
  for line in lines:
11
  line = line.strip()
12
  if line and not line.isspace():
13
  cleaned_lines.append(line)
 
14
  return "\n".join(cleaned_lines)
15
 
16
  def extract_text_markdown(doc):
 
64
 
65
  @spaces.GPU
66
  def convert(pdf_file):
 
67
  doc = fitz.open(pdf_file)
68
  markdown_output = ""
69
  image_counter = 1
70
+ image_paths = []
71
 
72
  for page_num in range(len(doc)):
73
  page = doc[page_num]
 
77
  # Página con texto normal
78
  markdown_output += extract_text_markdown([page]) + "\n"
79
  else:
80
+ # Página vacía o con imagen: hacer OCR
81
  pix = page.get_pixmap(dpi=300)
82
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
83
 
84
+ # Guardar imagen completa
85
  image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
86
  img.save(image_path)
87
+ image_paths.append(image_path)
88
+
89
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
90
 
91
  try:
 
110
  gr.Gallery(label="Imágenes extraídas").style(grid=[2], height="auto")
111
  ],
112
  ).launch()