Biifruu committed on
Commit
3920f3b
·
verified ·
1 Parent(s): b697ac0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -40
app.py CHANGED
@@ -1,51 +1,27 @@
1
  import spaces
2
  import gradio as gr
3
- from pypdf import PdfReader
4
- import ocrmypdf
5
-
6
-
7
- def extract_text_from_pdf(reader):
8
- full_text = ""
9
- for idx, page in enumerate(reader.pages):
10
- text = page.extract_text()
11
- if len(text) > 0:
12
- full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
13
-
14
- return full_text.strip()
15
-
16
 
17
  @spaces.GPU
18
  def convert(pdf_file):
19
- reader = PdfReader(pdf_file)
20
-
21
- # Extract metadata
22
- metadata = {
23
- "author": reader.metadata.author,
24
- "creator": reader.metadata.creator,
25
- "producer": reader.metadata.producer,
26
- "subject": reader.metadata.subject,
27
- "title": reader.metadata.title,
28
- }
29
-
30
- # Extract text
31
- full_text = extract_text_from_pdf(reader)
32
-
33
- # Check if there are any images
34
- image_count = 0
35
- for page in reader.pages:
36
- image_count += len(page.images)
37
-
38
- # If there are images and not much content, perform OCR on the document
39
- if image_count > 0 and len(full_text) < 1000:
40
- out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
41
- ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
42
 
43
- # Re-extract text
44
- reader = PdfReader(pdf_file)
45
- full_text = extract_text_from_pdf(reader)
46
 
47
- return full_text, metadata
 
 
 
 
48
 
 
49
 
50
  gr.Interface(
51
  convert,
 
1
  import spaces
2
  import gradio as gr
3
+ from pdf2image import convert_from_path
4
+ import pytesseract
5
+ from PIL import Image
6
+ import os
 
 
 
 
 
 
 
 
 
7
 
8
  @spaces.GPU
9
  def convert(pdf_file):
10
+ pages = convert_from_path(pdf_file)
11
+ markdown_output = ""
12
+ metadata = {} # Opcional: puedes extraer metadata con PyMuPDF si lo deseas
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ for idx, page_image in enumerate(pages):
15
+ # Realizar OCR
16
+ text = pytesseract.image_to_string(page_image)
17
 
18
+ if text.strip() == "":
19
+ # Si no hay texto, insertar un enlace vacío
20
+ markdown_output += f"[imagen]()\n\n"
21
+ else:
22
+ markdown_output += text.strip() + "\n\n"
23
 
24
+ return markdown_output.strip(), metadata
25
 
26
  gr.Interface(
27
  convert,