Spaces:

hellorahulk
/

docling_free

Running

App Files Community

hellorahulk committed on Jan 23

Commit

cca0a5d

1 Parent(s): 070e4b3

Fix file handling with filepath type and better error handling

Browse files

Files changed (1) hide show

app.py +23 -29

app.py CHANGED Viewed

@@ -26,41 +26,35 @@ Made with ❤️ using Docling and Gradio
 # Initialize the document parser
 parser = DocumentParser()
-def get_file_extension(file_type):
-    """Get file extension based on MIME type"""
-    extensions = {
-        'application/pdf': '.pdf',
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
-        'text/plain': '.txt',
-        'text/html': '.html',
-        'text/markdown': '.md'
-    }
-    return extensions.get(file_type, '.tmp')
 def process_document(file_obj):
     """Process uploaded document and return structured information"""
     temp_path = None
     try:
-        # Handle file upload based on type
-        if isinstance(file_obj, dict):
-            # Get file data and original name
-            file_data = file_obj['data']
-            original_name = file_obj.get('name', 'uploaded_file')
-            file_type = file_obj.get('mime_type', mimetypes.guess_type(original_name)[0])
-            extension = os.path.splitext(original_name)[1] or get_file_extension(file_type)
-        else:
-            # Handle binary data directly
-            file_data = file_obj
-            extension = '.pdf'  # Default to PDF for binary uploads
-        # Create temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
-            if isinstance(file_data, bytes):
-                tmp_file.write(file_data)
             else:
-                tmp_file.write(file_data.read())
             temp_path = tmp_file.name
         # Parse the document
         result = parser.parse(temp_path)
@@ -121,7 +115,7 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
             file_input = gr.File(
                 label="Upload Document",
                 file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
-                type="binary"
             )
             submit_btn = gr.Button("Process Document", variant="primary")

 # Initialize the document parser
 parser = DocumentParser()
 def process_document(file_obj):
     """Process uploaded document and return structured information"""
+    if file_obj is None:
+        return (
+            "Error: No file uploaded",
+            pd.DataFrame(),
+            "No sections available",
+            "No entities available",
+            "Confidence Score: 0.0"
+        )
     temp_path = None
     try:
+        # Create temporary file with appropriate extension
+        original_filename = file_obj.name if hasattr(file_obj, 'name') else "uploaded_file.pdf"
+        extension = os.path.splitext(original_filename)[1].lower()
+        if not extension:
+            extension = '.pdf'  # Default to PDF if no extension
+        # Create temporary file and write content
         with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
+            # Write the content
+            content = file_obj.read() if hasattr(file_obj, 'read') else file_obj
+            if isinstance(content, bytes):
+                tmp_file.write(content)
             else:
+                tmp_file.write(content.encode('utf-8'))
             temp_path = tmp_file.name
         # Parse the document
         result = parser.parse(temp_path)
             file_input = gr.File(
                 label="Upload Document",
                 file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
+                type="filepath"  # Changed from binary to filepath
             )
             submit_btn = gr.Button("Process Document", variant="primary")