Spaces:
Running
Running
Commit
·
cca0a5d
1
Parent(s):
070e4b3
Fix file handling with filepath type and better error handling
Browse files
app.py
CHANGED
@@ -26,41 +26,35 @@ Made with ❤️ using Docling and Gradio
|
|
26 |
# Initialize the document parser
|
27 |
parser = DocumentParser()
|
28 |
|
29 |
-
def get_file_extension(file_type):
|
30 |
-
"""Get file extension based on MIME type"""
|
31 |
-
extensions = {
|
32 |
-
'application/pdf': '.pdf',
|
33 |
-
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
34 |
-
'text/plain': '.txt',
|
35 |
-
'text/html': '.html',
|
36 |
-
'text/markdown': '.md'
|
37 |
-
}
|
38 |
-
return extensions.get(file_type, '.tmp')
|
39 |
-
|
40 |
def process_document(file_obj):
|
41 |
"""Process uploaded document and return structured information"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
temp_path = None
|
43 |
try:
|
44 |
-
#
|
45 |
-
if
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
else:
|
52 |
-
# Handle binary data directly
|
53 |
-
file_data = file_obj
|
54 |
-
extension = '.pdf' # Default to PDF for binary uploads
|
55 |
-
|
56 |
-
# Create temporary file
|
57 |
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
|
58 |
-
|
59 |
-
|
|
|
|
|
60 |
else:
|
61 |
-
tmp_file.write(
|
62 |
temp_path = tmp_file.name
|
63 |
-
|
64 |
# Parse the document
|
65 |
result = parser.parse(temp_path)
|
66 |
|
@@ -121,7 +115,7 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
|
|
121 |
file_input = gr.File(
|
122 |
label="Upload Document",
|
123 |
file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
|
124 |
-
type="binary"
|
125 |
)
|
126 |
submit_btn = gr.Button("Process Document", variant="primary")
|
127 |
|
|
|
26 |
# Initialize the document parser
|
27 |
parser = DocumentParser()
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
def process_document(file_obj):
|
30 |
"""Process uploaded document and return structured information"""
|
31 |
+
if file_obj is None:
|
32 |
+
return (
|
33 |
+
"Error: No file uploaded",
|
34 |
+
pd.DataFrame(),
|
35 |
+
"No sections available",
|
36 |
+
"No entities available",
|
37 |
+
"Confidence Score: 0.0"
|
38 |
+
)
|
39 |
+
|
40 |
temp_path = None
|
41 |
try:
|
42 |
+
# Create temporary file with appropriate extension
|
43 |
+
original_filename = file_obj.name if hasattr(file_obj, 'name') else "uploaded_file.pdf"
|
44 |
+
extension = os.path.splitext(original_filename)[1].lower()
|
45 |
+
if not extension:
|
46 |
+
extension = '.pdf' # Default to PDF if no extension
|
47 |
+
|
48 |
+
# Create temporary file and write content
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
|
50 |
+
# Write the content
|
51 |
+
content = file_obj.read() if hasattr(file_obj, 'read') else file_obj
|
52 |
+
if isinstance(content, bytes):
|
53 |
+
tmp_file.write(content)
|
54 |
else:
|
55 |
+
tmp_file.write(content.encode('utf-8'))
|
56 |
temp_path = tmp_file.name
|
57 |
+
|
58 |
# Parse the document
|
59 |
result = parser.parse(temp_path)
|
60 |
|
|
|
115 |
file_input = gr.File(
|
116 |
label="Upload Document",
|
117 |
file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
|
118 |
+
type="filepath" # Changed from binary to filepath
|
119 |
)
|
120 |
submit_btn = gr.Button("Process Document", variant="primary")
|
121 |
|