Biifruu committed on
Commit
f79c813
·
verified ·
1 Parent(s): d7cc8b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -14
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import spaces
2
  import gradio as gr
3
  import fitz # PyMuPDF
@@ -45,22 +47,27 @@ def needs_ocr(doc):
45
 
46
  @spaces.GPU
47
  def convert(pdf_file):
48
- original_doc = fitz.open(pdf_file)
 
 
 
 
 
 
49
 
50
- if needs_ocr(original_doc):
51
- try:
52
- ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
53
- ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
54
- doc = fitz.open(ocr_temp_path)
55
- os.remove(ocr_temp_path)
56
- except Exception as e:
57
- return f"Error al aplicar OCR: {e}", {}
58
- else:
59
- doc = original_doc
 
60
 
61
- markdown = extract_text_markdown(doc)
62
- metadata = {} # Puedes agregar metadatos aquí si lo necesitas
63
- return markdown, metadata
64
 
65
  gr.Interface(
66
  fn=convert,
 
1
+ from PIL import Image
2
+ import pytesseract
3
  import spaces
4
  import gradio as gr
5
  import fitz # PyMuPDF
 
47
 
48
  @spaces.GPU
49
  def convert(pdf_file):
50
+ doc = fitz.open(pdf_file)
51
+ markdown_output = ""
52
+ image_counter = 1
53
+
54
+ for page_num in range(len(doc)):
55
+ page = doc[page_num]
56
+ text = page.get_text("text").strip()
57
 
58
+ if len(text) > 30:
59
+ # Página con texto normal
60
+ markdown_output += extract_text_markdown([page]) + "\n"
61
+ else:
62
+ # Página sin texto: usar OCR por imagen
63
+ pix = page.get_pixmap(dpi=300)
64
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
65
+ ocr_text = pytesseract.image_to_string(img, lang="spa")
66
+ markdown_output += ocr_text.strip() + "\n"
67
+
68
+ markdown_output += "\n---\n\n"
69
 
70
+ return markdown_output.strip(), {}
 
 
71
 
72
  gr.Interface(
73
  fn=convert,