Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -15,9 +15,6 @@ import gradio as gr
|
|
15 |
from bs4 import BeautifulSoup
|
16 |
from fake_useragent import UserAgent
|
17 |
from cleantext import clean
|
18 |
-
from starlette.applications import Starlette
|
19 |
-
from starlette.responses import JSONResponse
|
20 |
-
from starlette.routing import Route
|
21 |
|
22 |
# Setup logging with detailed configuration
|
23 |
logging.basicConfig(
|
@@ -154,12 +151,19 @@ class URLProcessor:
|
|
154 |
|
155 |
class FileProcessor:
|
156 |
"""Class to handle file processing"""
|
157 |
-
|
158 |
def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
|
159 |
self.max_file_size = max_file_size
|
160 |
self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
|
161 |
-
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
|
165 |
"""Process multiple uploaded files and return a single JSON extraction"""
|
@@ -167,76 +171,70 @@ class FileProcessor:
|
|
167 |
return []
|
168 |
|
169 |
combined_data = []
|
170 |
-
self.processed_zip_count = 0
|
171 |
-
|
172 |
try:
|
173 |
for file in files:
|
174 |
# Check if the file is a Gradio File object or a string path
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
if os.path.isdir(file_path):
|
182 |
-
logger.warning(f"Skipping directory: {file_path}")
|
183 |
-
continue
|
184 |
-
|
185 |
-
# Skip if file doesn't exist
|
186 |
-
if not os.path.exists(file_path):
|
187 |
-
logger.warning(f"File does not exist: {file_path}")
|
188 |
-
continue
|
189 |
-
|
190 |
-
# Check file size
|
191 |
-
file_size = os.path.getsize(file_path)
|
192 |
-
if file_size > self.max_file_size:
|
193 |
-
logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
|
194 |
-
continue # Skip this file
|
195 |
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
continue
|
201 |
-
self.processed_zip_count += 1
|
202 |
-
zip_results = self._process_zip_file(file_path)
|
203 |
-
combined_data.extend(zip_results)
|
204 |
else:
|
205 |
-
|
206 |
-
combined_data.extend(file_results)
|
207 |
|
208 |
except Exception as e:
|
209 |
logger.error(f"Error processing files: {str(e)}")
|
210 |
-
|
211 |
-
return combined_data
|
212 |
|
213 |
def _process_zip_file(self, zip_path: str) -> List[Dict]:
|
214 |
-
"""Process ZIP file contents
|
215 |
results = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
try:
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
"filename": filename,
|
232 |
-
"content": content,
|
233 |
-
"timestamp": datetime.now().isoformat()
|
234 |
-
})
|
235 |
-
except Exception as e:
|
236 |
-
logger.error(f"Error reading file {filename} from zip: {str(e)}")
|
237 |
except Exception as e:
|
238 |
-
logger.error(f"
|
239 |
-
|
240 |
|
241 |
class Chatbot:
|
242 |
"""Simple chatbot that uses provided JSON data for responses."""
|
@@ -418,10 +416,10 @@ def create_interface():
|
|
418 |
def main():
|
419 |
# Configure system settings
|
420 |
mimetypes.init()
|
421 |
-
|
422 |
# Create and launch interface
|
423 |
interface = create_interface()
|
424 |
-
|
425 |
# Launch with proper configuration
|
426 |
interface.launch(
|
427 |
server_name="0.0.0.0",
|
|
|
15 |
from bs4 import BeautifulSoup
|
16 |
from fake_useragent import UserAgent
|
17 |
from cleantext import clean
|
|
|
|
|
|
|
18 |
|
19 |
# Setup logging with detailed configuration
|
20 |
logging.basicConfig(
|
|
|
151 |
|
152 |
class FileProcessor:
|
153 |
"""Class to handle file processing"""
|
154 |
+
|
155 |
def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):
    """Configure the processor.

    Args:
        max_file_size: Upper bound, in bytes, stored on the instance for
            size checks. Defaults to 2 GiB (2 * 1024 ** 3 bytes).
    """
    # Extensions treated as text even when the MIME guess says otherwise.
    text_suffixes = ('.txt', '.md', '.csv', '.json', '.xml')
    self.max_file_size = max_file_size
    self.supported_text_extensions = set(text_suffixes)
|
158 |
+
|
159 |
+
def is_text_file(self, filepath: str) -> bool:
    """Decide from the file name alone whether a path looks like text.

    A path qualifies when its guessed MIME type starts with ``text/`` or
    when its lower-cased extension is listed in
    ``self.supported_text_extensions``. The file contents are never read.

    Args:
        filepath: Path or file name to classify.

    Returns:
        True when the name looks like a text file; False otherwise,
        including when classification itself raises.
    """
    try:
        guessed_type = mimetypes.guess_type(filepath)[0]
        if guessed_type and guessed_type.startswith('text/'):
            return True
        # Fall back to the explicit extension allow-list (case-insensitive).
        suffix = os.path.splitext(filepath)[1].lower()
        return suffix in self.supported_text_extensions
    except Exception:
        return False
|
167 |
|
168 |
def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
    """Process uploaded files and return their extracted records as one list.

    Each item is resolved to a filesystem path, size-checked against
    ``self.max_file_size`` and then dispatched to ``_process_zip_file``
    (ZIP archives) or ``_process_single_file`` (everything else).

    Args:
        files: Items to process. Each item is either a path string or an
            object that exposes its path through a ``name`` attribute.

    Returns:
        The concatenated records of every processed file, in input order.
        An empty list when ``files`` is empty/None or when an unexpected
        error aborts processing.
    """
    if not files:
        return []

    combined_data = []
    try:
        for file in files:
            # Accept plain paths as well as any upload wrapper that carries
            # its path in ``.name``; checking for the attribute instead of a
            # concrete class keeps wrapper objects from reaching os.path.
            if isinstance(file, (str, os.PathLike)):
                file_name = file
            else:
                file_name = getattr(file, "name", None)

            # Directories, missing paths and unresolvable items are skipped.
            if not file_name or not os.path.isfile(file_name):
                logger.warning(f"Skipping directory: {file_name}")
                continue

            file_size = os.path.getsize(file_name)
            if file_size > self.max_file_size:
                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                continue  # Skip this file

            if zipfile.is_zipfile(file_name):
                combined_data.extend(self._process_zip_file(file_name))
            else:
                combined_data.extend(self._process_single_file(file_name))
    except Exception as e:
        logger.error(f"Error processing files: {str(e)}")
        return []

    # Without this return the success path would yield None to the caller.
    return combined_data
|
|
|
194 |
|
195 |
def _process_zip_file(self, zip_path: str) -> List[Dict]:
    """Extract a ZIP archive and collect the text files it contains.

    The archive is unpacked into a temporary directory that is removed
    before returning. Every extracted file accepted by ``is_text_file`` is
    read as UTF-8 (undecodable bytes are dropped); files whose content is
    empty or whitespace-only are omitted.

    Args:
        zip_path: Path of the ZIP archive to read.

    Returns:
        One dict per non-empty text file with the keys ``source``,
        ``filename``, ``content`` and ``timestamp``. Empty when the
        archive is unreadable or its declared uncompressed size exceeds
        ``self.max_file_size``.
    """
    results = []
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # The archive's on-disk size says nothing about its expanded
            # size, so check the declared uncompressed total before
            # writing anything to disk.
            total_size = sum(info.file_size for info in zip_ref.infolist())
            if total_size > self.max_file_size:
                logger.warning(
                    f"Uncompressed ZIP contents ({total_size} bytes) exceed maximum allowed size"
                )
                return results

            with tempfile.TemporaryDirectory() as temp_dir:
                zip_ref.extractall(temp_dir)
                for root, _, files in os.walk(temp_dir):
                    for filename in files:
                        filepath = os.path.join(root, filename)
                        if not self.is_text_file(filepath):
                            continue
                        try:
                            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                                content = f.read()
                            if content.strip():
                                results.append({
                                    "source": "file",
                                    "filename": filename,
                                    "content": content,
                                    "timestamp": datetime.now().isoformat()
                                })
                        except Exception as e:
                            # One unreadable member must not abort the rest.
                            logger.error(f"Error reading file {filename}: {str(e)}")
    except (zipfile.BadZipFile, OSError) as e:
        # Keep a corrupt or unreadable archive from propagating to the
        # caller; whatever was collected so far is still returned.
        logger.error(f"Error processing ZIP file {zip_path}: {str(e)}")
    return results
|
218 |
+
|
219 |
+
def _process_single_file(self, file_path: str) -> List[Dict]:
    """Read one file as text and describe it in a single record.

    The file is decoded as UTF-8 with undecodable bytes dropped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A one-element list holding a dict with the keys ``source``,
        ``filename``, ``file_size``, ``mime_type``, ``created``,
        ``modified``, ``content`` and ``timestamp``; an empty list when
        anything fails (the error is logged).
    """
    try:
        stat_info = os.stat(file_path)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            text = handle.read()

        base_name = os.path.basename(file_path)
        size_bytes = stat_info.st_size
        guessed_type = mimetypes.guess_type(file_path)[0]
        # NOTE(review): st_ctime is reported under 'created'; on POSIX it
        # is the metadata-change time, not the creation time.
        created_at = datetime.fromtimestamp(stat_info.st_ctime).isoformat()
        modified_at = datetime.fromtimestamp(stat_info.st_mtime).isoformat()

        record = {
            'source': 'file',
            'filename': base_name,
            'file_size': size_bytes,
            'mime_type': guessed_type,
            'created': created_at,
            'modified': modified_at,
            'content': text,
            'timestamp': datetime.now().isoformat()
        }
        return [record]
    except Exception as e:
        logger.error(f"File processing error: {e}")
        return []
|
238 |
|
239 |
class Chatbot:
|
240 |
"""Simple chatbot that uses provided JSON data for responses."""
|
|
|
416 |
def main():
|
417 |
# Configure system settings
|
418 |
mimetypes.init()
|
419 |
+
|
420 |
# Create and launch interface
|
421 |
interface = create_interface()
|
422 |
+
|
423 |
# Launch with proper configuration
|
424 |
interface.launch(
|
425 |
server_name="0.0.0.0",
|