Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ import mimetypes
|
|
7 |
import zipfile
|
8 |
import tempfile
|
9 |
from datetime import datetime
|
10 |
-
from typing import List, Dict, Optional
|
11 |
from pathlib import Path
|
12 |
import requests
|
13 |
import validators
|
@@ -94,7 +94,7 @@ class URLProcessor:
|
|
94 |
try:
|
95 |
file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
|
96 |
if not file_id:
|
97 |
-
logger.error(f"Invalid Google Drive URL: {
|
98 |
return None
|
99 |
|
100 |
direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
|
@@ -165,7 +165,7 @@ class FileProcessor:
|
|
165 |
except Exception:
|
166 |
return False
|
167 |
|
168 |
-
def process_files(self, files) -> List[Dict]:
|
169 |
"""Process multiple uploaded files and return a single JSON extraction"""
|
170 |
if not files:
|
171 |
return []
|
@@ -173,15 +173,17 @@ class FileProcessor:
|
|
173 |
combined_data = []
|
174 |
try:
|
175 |
for file in files:
|
176 |
-
|
|
|
|
|
177 |
if file_size > self.max_file_size:
|
178 |
logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
|
179 |
continue # Skip this file
|
180 |
|
181 |
-
if zipfile.is_zipfile(
|
182 |
-
combined_data.extend(self._process_zip_file(
|
183 |
else:
|
184 |
-
combined_data.extend(self._process_single_file(
|
185 |
|
186 |
except Exception as e:
|
187 |
logger.error(f"Error processing files: {str(e)}")
|
@@ -213,17 +215,17 @@ class FileProcessor:
|
|
213 |
logger.error(f"Error reading file {filename}: {str(e)}")
|
214 |
return results
|
215 |
|
216 |
-
def _process_single_file(self,
|
217 |
try:
|
218 |
-
file_stat = os.stat(
|
219 |
-
with open(
|
220 |
content = f.read()
|
221 |
|
222 |
return [{
|
223 |
'source': 'file',
|
224 |
-
'filename': os.path.basename(
|
225 |
'file_size': file_stat.st_size,
|
226 |
-
'mime_type': mimetypes.guess_type(
|
227 |
'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
|
228 |
'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
|
229 |
'content': content,
|
@@ -286,7 +288,8 @@ def create_interface():
|
|
286 |
with gr.Tab("File Input"):
|
287 |
file_input = gr.File(
|
288 |
label="Upload text files or ZIP archives",
|
289 |
-
file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
|
|
|
290 |
)
|
291 |
|
292 |
with gr.Tab("Text Input"):
|
@@ -326,7 +329,7 @@ def create_interface():
|
|
326 |
|
327 |
# Process URLs
|
328 |
if urls:
|
329 |
-
url_list = re.split(r'[,\n]', urls)
|
330 |
url_list = [url.strip() for url in url_list if url.strip()]
|
331 |
|
332 |
for url in url_list:
|
@@ -344,7 +347,7 @@ def create_interface():
|
|
344 |
# Process files
|
345 |
if files:
|
346 |
combined_data = file_processor.process_files(files)
|
347 |
-
results.extend
|
348 |
|
349 |
# Process text input
|
350 |
if text:
|
@@ -402,7 +405,7 @@ def create_interface():
|
|
402 |
gr.Markdown("""
|
403 |
### Usage Guidelines
|
404 |
- **URL Processing**: Enter valid HTTP/HTTPS URLs
|
405 |
-
- **File Input**: Upload text files or ZIP archives
|
406 |
- **Text Input**: Direct text processing
|
407 |
- **Chat**: Load your JSON data and ask questions about it
|
408 |
- Advanced cleaning and validation included
|
|
|
7 |
import zipfile
|
8 |
import tempfile
|
9 |
from datetime import datetime
|
10 |
+
from typing import List, Dict, Optional, Union, Any
|
11 |
from pathlib import Path
|
12 |
import requests
|
13 |
import validators
|
|
|
94 |
try:
|
95 |
file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
|
96 |
if not file_id:
|
97 |
+
logger.error(f"Invalid Google Drive URL: {url}")
|
98 |
return None
|
99 |
|
100 |
direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
|
|
|
165 |
except Exception:
|
166 |
return False
|
167 |
|
168 |
+
def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
|
169 |
"""Process multiple uploaded files and return a single JSON extraction"""
|
170 |
if not files:
|
171 |
return []
|
|
|
173 |
combined_data = []
|
174 |
try:
|
175 |
for file in files:
|
176 |
+
# Check if the file is a Gradio File object or a string path
|
177 |
+
file_name = file.name if isinstance(file, gr.File) else file
|
178 |
+
file_size = os.path.getsize(file_name)
|
179 |
if file_size > self.max_file_size:
|
180 |
logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
|
181 |
continue # Skip this file
|
182 |
|
183 |
+
if zipfile.is_zipfile(file_name):
|
184 |
+
combined_data.extend(self._process_zip_file(file_name))
|
185 |
else:
|
186 |
+
combined_data.extend(self._process_single_file(file_name))
|
187 |
|
188 |
except Exception as e:
|
189 |
logger.error(f"Error processing files: {str(e)}")
|
|
|
215 |
logger.error(f"Error reading file {filename}: {str(e)}")
|
216 |
return results
|
217 |
|
218 |
+
def _process_single_file(self, file_path: str) -> List[Dict]:
|
219 |
try:
|
220 |
+
file_stat = os.stat(file_path)
|
221 |
+
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
222 |
content = f.read()
|
223 |
|
224 |
return [{
|
225 |
'source': 'file',
|
226 |
+
'filename': os.path.basename(file_path),
|
227 |
'file_size': file_stat.st_size,
|
228 |
+
'mime_type': mimetypes.guess_type(file_path)[0],
|
229 |
'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
|
230 |
'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
|
231 |
'content': content,
|
|
|
288 |
with gr.Tab("File Input"):
|
289 |
file_input = gr.File(
|
290 |
label="Upload text files or ZIP archives",
|
291 |
+
file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"],
|
292 |
+
multiple=True # Allow multiple file uploads
|
293 |
)
|
294 |
|
295 |
with gr.Tab("Text Input"):
|
|
|
329 |
|
330 |
# Process URLs
|
331 |
if urls:
|
332 |
+
url_list = re.split(r'[,\n]', urls)
|
333 |
url_list = [url.strip() for url in url_list if url.strip()]
|
334 |
|
335 |
for url in url_list:
|
|
|
347 |
# Process files
|
348 |
if files:
|
349 |
combined_data = file_processor.process_files(files)
|
350 |
+
results.extend(combined_data)
|
351 |
|
352 |
# Process text input
|
353 |
if text:
|
|
|
405 |
gr.Markdown("""
|
406 |
### Usage Guidelines
|
407 |
- **URL Processing**: Enter valid HTTP/HTTPS URLs
|
408 |
+
- **File Input**: Upload multiple text files or ZIP archives
|
409 |
- **Text Input**: Direct text processing
|
410 |
- **Chat**: Load your JSON data and ask questions about it
|
411 |
- Advanced cleaning and validation included
|