hellorahulk committed on
Commit
cca0a5d
·
1 Parent(s): 070e4b3

Fix file handling with filepath type and better error handling

Browse files
Files changed (1) hide show
  1. app.py +23 -29
app.py CHANGED
@@ -26,41 +26,35 @@ Made with ❤️ using Docling and Gradio
26
  # Initialize the document parser
27
  parser = DocumentParser()
28
 
29
- def get_file_extension(file_type):
30
- """Get file extension based on MIME type"""
31
- extensions = {
32
- 'application/pdf': '.pdf',
33
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
34
- 'text/plain': '.txt',
35
- 'text/html': '.html',
36
- 'text/markdown': '.md'
37
- }
38
- return extensions.get(file_type, '.tmp')
39
-
40
  def process_document(file_obj):
41
  """Process uploaded document and return structured information"""
 
 
 
 
 
 
 
 
 
42
  temp_path = None
43
  try:
44
- # Handle file upload based on type
45
- if isinstance(file_obj, dict):
46
- # Get file data and original name
47
- file_data = file_obj['data']
48
- original_name = file_obj.get('name', 'uploaded_file')
49
- file_type = file_obj.get('mime_type', mimetypes.guess_type(original_name)[0])
50
- extension = os.path.splitext(original_name)[1] or get_file_extension(file_type)
51
- else:
52
- # Handle binary data directly
53
- file_data = file_obj
54
- extension = '.pdf' # Default to PDF for binary uploads
55
-
56
- # Create temporary file
57
  with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
58
- if isinstance(file_data, bytes):
59
- tmp_file.write(file_data)
 
 
60
  else:
61
- tmp_file.write(file_data.read())
62
  temp_path = tmp_file.name
63
-
64
  # Parse the document
65
  result = parser.parse(temp_path)
66
 
@@ -121,7 +115,7 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
121
  file_input = gr.File(
122
  label="Upload Document",
123
  file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
124
- type="binary"
125
  )
126
  submit_btn = gr.Button("Process Document", variant="primary")
127
 
 
26
  # Initialize the document parser
27
  parser = DocumentParser()
28
 
 
 
 
 
 
 
 
 
 
 
 
29
  def process_document(file_obj):
30
  """Process uploaded document and return structured information"""
31
+ if file_obj is None:
32
+ return (
33
+ "Error: No file uploaded",
34
+ pd.DataFrame(),
35
+ "No sections available",
36
+ "No entities available",
37
+ "Confidence Score: 0.0"
38
+ )
39
+
40
  temp_path = None
41
  try:
42
+ # Create temporary file with appropriate extension
43
+ original_filename = file_obj.name if hasattr(file_obj, 'name') else "uploaded_file.pdf"
44
+ extension = os.path.splitext(original_filename)[1].lower()
45
+ if not extension:
46
+ extension = '.pdf' # Default to PDF if no extension
47
+
48
+ # Create temporary file and write content
 
 
 
 
 
 
49
  with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
50
+ # Write the content
51
+ content = file_obj.read() if hasattr(file_obj, 'read') else file_obj
52
+ if isinstance(content, bytes):
53
+ tmp_file.write(content)
54
  else:
55
+ tmp_file.write(content.encode('utf-8'))
56
  temp_path = tmp_file.name
57
+
58
  # Parse the document
59
  result = parser.parse(temp_path)
60
 
 
115
  file_input = gr.File(
116
  label="Upload Document",
117
  file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
118
+ type="filepath" # Changed from binary to filepath
119
  )
120
  submit_btn = gr.Button("Process Document", variant="primary")
121