AkashDataScience committed on
Commit
1402288
·
1 Parent(s): f449995

Execution changes

Browse files
Files changed (1) hide show
  1. app.py +15 -13
app.py CHANGED
@@ -35,12 +35,19 @@ def get_pdf_page_count(pdf_path):
35
  reader = PdfReader(pdf_path)
36
  return len(reader.pages)
37
 
 
 
 
 
 
38
  def get_docling_ocr(pdf_path, page_num):
39
  result = converter.convert(pdf_path, page_range=(page_num, page_num))
40
  markdown_text_docling = result.document.export_to_markdown()
41
  return markdown_text_docling
42
 
43
- def get_paddle_ocr(page_image):
 
 
44
  output = pipeline.predict(input=np.array(page_image))
45
 
46
  markdown_list = []
@@ -52,7 +59,8 @@ def get_paddle_ocr(page_image):
52
  markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
53
  return markdown_text_paddleOCR
54
 
55
- def get_smoldocling_ocr(page_image):
 
56
  image = load_image(page_image)
57
 
58
  # Create input messages
@@ -85,16 +93,6 @@ def get_smoldocling_ocr(page_image):
85
 
86
  markdown_text_smoldocling = doc.export_to_markdown()
87
  return markdown_text_smoldocling
88
-
89
-
90
- def inference(pdf_path, page_num):
91
- docling_ocr = get_docling_ocr(pdf_path, page_num)
92
- # Extract the first page as an image
93
- images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
94
- page_image = images[0]
95
- paddle_ocr = get_paddle_ocr(page_image)
96
- smoldocling_ocr = get_smoldocling_ocr(page_image)
97
- return page_image, docling_ocr, paddle_ocr, smoldocling_ocr
98
 
99
  title = "OCR Arena"
100
  description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
@@ -102,6 +100,7 @@ examples = [["data/amazon-10-k-2024.pdf"],
102
  ["data/goog-10-k-2023.pdf"]]
103
 
104
  with gr.Blocks(theme=gr.themes.Glass()) as demo:
 
105
  with gr.Row():
106
  with gr.Column():
107
  pdf = gr.File(label="Input PDFs", file_types=[".pdf"])
@@ -118,7 +117,10 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
118
  clear_btn = gr.ClearButton(components=[pdf, page_num])
119
  submit_btn = gr.Button("Submit", variant='primary')
120
 
121
- submit_btn.click(inference, inputs=[pdf, page_num], outputs=[original, docling_ocr_out, paddle_ocr_out, smoldocling_ocr_out])
 
 
 
122
 
123
  with gr.Column():
124
  original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
 
35
  reader = PdfReader(pdf_path)
36
  return len(reader.pages)
37
 
38
+ def get_page_image(pdf_path, page_num):
39
+ images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
40
+ page_image = images[0]
41
+ return page_image
42
+
43
  def get_docling_ocr(pdf_path, page_num):
44
  result = converter.convert(pdf_path, page_range=(page_num, page_num))
45
  markdown_text_docling = result.document.export_to_markdown()
46
  return markdown_text_docling
47
 
48
+ def get_paddle_ocr(pdf_path, page_num):
49
+ page_image = get_page_image(pdf_path, page_num)
50
+
51
  output = pipeline.predict(input=np.array(page_image))
52
 
53
  markdown_list = []
 
59
  markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
60
  return markdown_text_paddleOCR
61
 
62
+ def get_smoldocling_ocr(pdf_path, page_num):
63
+ page_image = get_page_image(pdf_path, page_num)
64
  image = load_image(page_image)
65
 
66
  # Create input messages
 
93
 
94
  markdown_text_smoldocling = doc.export_to_markdown()
95
  return markdown_text_smoldocling
 
 
 
 
 
 
 
 
 
 
96
 
97
  title = "OCR Arena"
98
  description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
 
100
  ["data/goog-10-k-2023.pdf"]]
101
 
102
  with gr.Blocks(theme=gr.themes.Glass()) as demo:
103
+ gr.Markdown(f"# {title}\n{description}")
104
  with gr.Row():
105
  with gr.Column():
106
  pdf = gr.File(label="Input PDFs", file_types=[".pdf"])
 
117
  clear_btn = gr.ClearButton(components=[pdf, page_num])
118
  submit_btn = gr.Button("Submit", variant='primary')
119
 
120
+ submit_btn.click(get_page_image, inputs=[pdf, page_num], outputs=original).then(
121
+ get_docling_ocr, inputs=[pdf, page_num], outputs=docling_ocr_out).then(
122
+ get_paddle_ocr, inputs=[pdf, page_num], outputs=paddle_ocr_out).then(
123
+ get_smoldocling_ocr, inputs=[pdf, page_num], outputs=smoldocling_ocr_out)
124
 
125
  with gr.Column():
126
  original = gr.Image(width=640, height=640, label="Original Page", interactive=False)