Biifruu committed on
Commit
7b1bb08
verified
1 Parent(s): 3e3d3c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -13
app.py CHANGED
@@ -1,15 +1,12 @@
1
  import spaces
2
  import gradio as gr
3
  import fitz # PyMuPDF
4
- import os
5
- import tempfile
6
  import ocrmypdf
 
 
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
- image_dir = "extracted_images"
11
- os.makedirs(image_dir, exist_ok=True)
12
- image_counter = 0
13
 
14
  for page in doc:
15
  blocks = page.get_text("dict")["blocks"]
@@ -26,18 +23,18 @@ def extract_text_markdown(doc):
26
  elif b["type"] == 1: # Imagen
27
  elements.append((y, "![imagen]()"))
28
 
29
- # Ordenar elementos por posición vertical
30
  elements.sort(key=lambda x: x[0])
31
 
32
- # Reconstruir markdown respetando el espaciado visual
33
  previous_y = None
34
  for y, content in elements:
35
- if previous_y is not None and abs(y - previous_y) > 10: # si hay espacio entre líneas, añadir salto
36
  markdown_output += "\n"
37
  markdown_output += content + "\n"
38
  previous_y = y
39
 
40
- markdown_output += "\n---\n\n" # separador entre páginas
41
 
42
  return markdown_output.strip()
43
 
@@ -46,7 +43,7 @@ def convert(pdf_file):
46
  original_doc = fitz.open(pdf_file)
47
  plain_text = "\n".join([page.get_text() for page in original_doc])
48
 
49
- # Si es imagen escaneada sin texto, aplicamos OCR
50
  if len(plain_text.strip()) < 100:
51
  ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
52
  ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
@@ -55,11 +52,11 @@ def convert(pdf_file):
55
  doc = original_doc
56
 
57
  markdown = extract_text_markdown(doc)
58
- metadata = {} # Puedes agregar metadatos si quieres
59
  return markdown, metadata
60
 
61
  gr.Interface(
62
  fn=convert,
63
- inputs=[gr.File(label="Upload PDF", type="filepath")],
64
- outputs=[gr.Text(label="Markdown crudo"), gr.JSON(label="Metadata")],
65
  ).launch()
 
1
  import spaces
2
  import gradio as gr
3
  import fitz # PyMuPDF
 
 
4
  import ocrmypdf
5
+ import tempfile
6
+ import os
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
 
 
 
10
 
11
  for page in doc:
12
  blocks = page.get_text("dict")["blocks"]
 
23
  elif b["type"] == 1: # Imagen
24
  elements.append((y, "![imagen]()"))
25
 
26
+ # Ordenar por posición vertical
27
  elements.sort(key=lambda x: x[0])
28
 
29
+ # Reconstrucción respetando saltos
30
  previous_y = None
31
  for y, content in elements:
32
+ if previous_y is not None and abs(y - previous_y) > 10:
33
  markdown_output += "\n"
34
  markdown_output += content + "\n"
35
  previous_y = y
36
 
37
+ markdown_output += "\n---\n\n" # Separador de páginas
38
 
39
  return markdown_output.strip()
40
 
 
43
  original_doc = fitz.open(pdf_file)
44
  plain_text = "\n".join([page.get_text() for page in original_doc])
45
 
46
+ # Aplicar OCR solo si el PDF no tiene texto
47
  if len(plain_text.strip()) < 100:
48
  ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
49
  ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
 
52
  doc = original_doc
53
 
54
  markdown = extract_text_markdown(doc)
55
+ metadata = {} # Si necesitas metadatos, se pueden agregar aquí
56
  return markdown, metadata
57
 
58
  gr.Interface(
59
  fn=convert,
60
+ inputs=[gr.File(label="Sube tu PDF", type="filepath")],
61
+ outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
62
  ).launch()