xiaoyao9184 committed on
Commit
c5cf1e9
·
verified ·
1 Parent(s): 7a5dd25

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (2) hide show
  1. gradio_app.py +1 -48
  2. requirements.txt +1 -1
gradio_app.py CHANGED
@@ -22,7 +22,6 @@ from surya.model.recognition.model import load_model as load_rec_model
22
  from surya.model.recognition.processor import load_processor as load_rec_processor
23
  from surya.model.table_rec.model import load_model as load_table_model
24
  from surya.model.table_rec.processor import load_processor as load_table_processor
25
- from surya.model.ocr_error.model import load_model as load_ocr_error_model, load_tokenizer as load_ocr_error_processor
26
  from surya.postprocessing.heatmap import draw_polys_on_image, draw_bboxes_on_image
27
  from surya.ocr import run_ocr
28
  from surya.postprocessing.text import draw_text_on_image
@@ -32,9 +31,7 @@ from surya.input.langs import replace_lang_with_code
32
  from surya.schema import OCRResult, TextDetectionResult, LayoutResult, TableResult
33
  from surya.settings import settings
34
  from surya.tables import batch_table_recognition
35
- from surya.postprocessing.util import rescale_bbox
36
- from pdftext.extraction import plain_text_output
37
- from surya.ocr_error import batch_ocr_error_detection
38
 
39
 
40
  def load_det_cached():
@@ -49,34 +46,6 @@ def load_layout_cached():
49
  def load_table_cached():
50
  return load_table_model(), load_table_processor()
51
 
52
- def load_ocr_error_cached():
53
- return load_ocr_error_model(), load_ocr_error_processor()
54
-
55
-
56
- def run_ocr_errors(pdf_file, page_count, sample_len=512, max_samples=10, max_pages=15):
57
- # Sample the text from the middle of the PDF
58
- page_middle = page_count // 2
59
- page_range = range(max(page_middle - max_pages, 0), min(page_middle + max_pages, page_count))
60
- text = plain_text_output(pdf_file, page_range=page_range)
61
-
62
- sample_gap = len(text) // max_samples
63
- if len(text) == 0 or sample_gap == 0:
64
- return "This PDF has no text or very little text", ["no text"]
65
-
66
- if sample_gap < sample_len:
67
- sample_gap = sample_len
68
-
69
- # Split the text into samples for the model
70
- samples = []
71
- for i in range(0, len(text), sample_gap):
72
- samples.append(text[i:i + sample_len])
73
-
74
- results = batch_ocr_error_detection(samples, ocr_error_model, ocr_error_processor)
75
- label = "This PDF has good text."
76
- if results.labels.count("bad") / len(results.labels) > .2:
77
- label = "This PDF may have garbled or bad OCR text."
78
- return label, results.labels
79
-
80
 
81
  def text_detection(img) -> (Image.Image, TextDetectionResult):
82
  pred = batch_text_detection([img], det_model, det_processor)[0]
@@ -179,7 +148,6 @@ det_model, det_processor = load_det_cached()
179
  rec_model, rec_processor = load_rec_cached()
180
  layout_model, layout_processor = load_layout_cached()
181
  table_model, table_processor = load_table_cached()
182
- ocr_error_model, ocr_error_processor = load_ocr_error_cached()
183
 
184
  with gr.Blocks(title="Surya") as demo:
185
  gr.Markdown("""
@@ -211,8 +179,6 @@ with gr.Blocks(title="Surya") as demo:
211
  use_pdf_boxes_ckb = gr.Checkbox(label="Use PDF table boxes", value=True, info="Table recognition only: Use the bounding boxes from the PDF file vs text detection model.")
212
  skip_table_detection_ckb = gr.Checkbox(label="Skip table detection", value=False, info="Table recognition only: Skip table detection and treat the whole image/page as a table.")
213
  table_rec_btn = gr.Button("Run Table Rec")
214
-
215
- ocr_errors_btn = gr.Button("Run bad PDF text detection")
216
  with gr.Column():
217
  result_img = gr.Image(label="Result image")
218
  result_json = gr.JSON(label="Result json")
@@ -284,18 +250,5 @@ with gr.Blocks(title="Surya") as demo:
284
  inputs=[in_img, in_file, in_num, use_pdf_boxes_ckb, skip_table_detection_ckb],
285
  outputs=[result_img, result_json]
286
  )
287
- # Run bad PDF text detection
288
- def ocr_errors_pdf(file, page_count, sample_len=512, max_samples=10, max_pages=15):
289
- if file.endswith('.pdf'):
290
- count = count_pdf(file)
291
- else:
292
- raise gr.Error("This feature only works with PDFs.", duration=5)
293
- label, results = run_ocr_errors(file, count)
294
- return gr.update(label="Result json:" + label, value=results)
295
- ocr_errors_btn.click(
296
- fn=ocr_errors_pdf,
297
- inputs=[in_file, in_num, use_pdf_boxes_ckb, skip_table_detection_ckb],
298
- outputs=[result_json]
299
- )
300
 
301
  demo.launch()
 
22
  from surya.model.recognition.processor import load_processor as load_rec_processor
23
  from surya.model.table_rec.model import load_model as load_table_model
24
  from surya.model.table_rec.processor import load_processor as load_table_processor
 
25
  from surya.postprocessing.heatmap import draw_polys_on_image, draw_bboxes_on_image
26
  from surya.ocr import run_ocr
27
  from surya.postprocessing.text import draw_text_on_image
 
31
  from surya.schema import OCRResult, TextDetectionResult, LayoutResult, TableResult
32
  from surya.settings import settings
33
  from surya.tables import batch_table_recognition
34
+ from surya.postprocessing.util import rescale_bboxes, rescale_bbox
 
 
35
 
36
 
37
  def load_det_cached():
 
46
  def load_table_cached():
47
  return load_table_model(), load_table_processor()
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  def text_detection(img) -> (Image.Image, TextDetectionResult):
51
  pred = batch_text_detection([img], det_model, det_processor)[0]
 
148
  rec_model, rec_processor = load_rec_cached()
149
  layout_model, layout_processor = load_layout_cached()
150
  table_model, table_processor = load_table_cached()
 
151
 
152
  with gr.Blocks(title="Surya") as demo:
153
  gr.Markdown("""
 
179
  use_pdf_boxes_ckb = gr.Checkbox(label="Use PDF table boxes", value=True, info="Table recognition only: Use the bounding boxes from the PDF file vs text detection model.")
180
  skip_table_detection_ckb = gr.Checkbox(label="Skip table detection", value=False, info="Table recognition only: Skip table detection and treat the whole image/page as a table.")
181
  table_rec_btn = gr.Button("Run Table Rec")
 
 
182
  with gr.Column():
183
  result_img = gr.Image(label="Result image")
184
  result_json = gr.JSON(label="Result json")
 
250
  inputs=[in_img, in_file, in_num, use_pdf_boxes_ckb, skip_table_detection_ckb],
251
  outputs=[result_img, result_json]
252
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
  demo.launch()
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  torch==2.5.1
2
- surya-ocr==0.8.1
3
  gradio==5.8.0
4
  huggingface-hub==0.26.3
 
1
  torch==2.5.1
2
+ surya-ocr==0.7.0
3
  gradio==5.8.0
4
  huggingface-hub==0.26.3