yasserrmd committed on
Commit
ee63712
·
verified ·
1 Parent(s): 351fbad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -6
app.py CHANGED
@@ -1,30 +1,38 @@
1
  import gradio as gr
2
- from docling.document_converter import DocumentConverter
 
3
  from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
 
4
  import spaces
5
 
6
  # GPU decorator not really required for Docling OCR, but kept if you want
7
  @spaces.GPU
8
  def convert_document(file, output_format):
 
9
  pdf_opts = PdfPipelineOptions(
10
  do_ocr=True,
11
- ocr_options=TesseractCliOcrOptions(lang=["eng"]),
12
  )
13
 
14
- converter = DocumentConverter(pipeline_options=pdf_opts)
 
 
 
 
 
 
 
15
  result = converter.convert(file.name)
16
 
17
  # Choose output format safely
18
  if output_format == "Markdown":
19
  converted_text = result.document.export_to_markdown()
20
  elif output_format == "JSON":
21
- # JSON needs to be dumped into a string for the Textbox
22
- import json
23
  converted_text = json.dumps(result.document.export_to_json(), indent=2)
24
  else:
25
  converted_text = "⚠️ Unsupported format"
26
 
27
- # Metadata always JSON-friendly
28
  metadata = {"Available Attributes": dir(result.document)}
29
 
30
  return converted_text, metadata
 
1
  import gradio as gr
2
+ import json
3
+ from docling.document_converter import DocumentConverter, PdfFormatOption
4
  from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
5
+ from docling.datamodel.base_models import InputFormat
6
  import spaces
7
 
8
  # GPU decorator not really required for Docling OCR, but kept if you want
9
  @spaces.GPU
10
  def convert_document(file, output_format):
11
+ # Configure OCR pipeline
12
  pdf_opts = PdfPipelineOptions(
13
  do_ocr=True,
14
+ ocr_options=TesseractCliOcrOptions(lang=["eng"]) # or ["eng","ara"] if needed
15
  )
16
 
17
+ # Correct way: pass options via format_options
18
+ converter = DocumentConverter(
19
+ format_options={
20
+ InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)
21
+ }
22
+ )
23
+
24
+ # Convert document
25
  result = converter.convert(file.name)
26
 
27
  # Choose output format safely
28
  if output_format == "Markdown":
29
  converted_text = result.document.export_to_markdown()
30
  elif output_format == "JSON":
 
 
31
  converted_text = json.dumps(result.document.export_to_json(), indent=2)
32
  else:
33
  converted_text = "⚠️ Unsupported format"
34
 
35
+ # Metadata as JSON-friendly dict
36
  metadata = {"Available Attributes": dir(result.document)}
37
 
38
  return converted_text, metadata