Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ import mimetypes
|
|
7 |
import zipfile
|
8 |
import tempfile
|
9 |
from datetime import datetime
|
10 |
-
from typing import List, Dict, Optional
|
11 |
from pathlib import Path
|
12 |
import requests
|
13 |
import validators
|
@@ -32,7 +32,7 @@ class URLProcessor:
|
|
32 |
self.session = requests.Session()
|
33 |
self.timeout = 10 # seconds
|
34 |
self.session.headers.update({
|
35 |
-
'User-Agent': UserAgent().random,
|
36 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
37 |
'Accept-Language': 'en-US,en;q=0.5',
|
38 |
'Accept-Encoding': 'gzip, deflate, br',
|
@@ -96,7 +96,7 @@ class URLProcessor:
|
|
96 |
if not file_id:
|
97 |
logger.error(f"Invalid Google Drive URL: {url}")
|
98 |
return None
|
99 |
-
|
100 |
direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
|
101 |
response = self.session.get(direct_url, timeout=self.timeout)
|
102 |
response.raise_for_status()
|
@@ -109,7 +109,7 @@ class URLProcessor:
|
|
109 |
except Exception as e:
|
110 |
logger.error(f"Google Drive processing failed: {e}")
|
111 |
return None
|
112 |
-
|
113 |
def _handle_google_calendar(self, url: str) -> Optional[Dict]:
|
114 |
"""Process Google Calendar ICS feeds"""
|
115 |
try:
|
@@ -165,68 +165,64 @@ class FileProcessor:
|
|
165 |
except Exception:
|
166 |
return False
|
167 |
|
168 |
-
def
|
169 |
-
"""Process
|
170 |
-
if not
|
171 |
return []
|
172 |
|
173 |
-
|
174 |
try:
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
file_size = os.path.getsize(file_name)
|
180 |
-
if file_size > self.max_file_size:
|
181 |
-
logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
|
182 |
-
continue # Skip this file
|
183 |
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
combined_data.extend(self._process_single_file(file_name))
|
188 |
else:
|
189 |
-
|
190 |
|
191 |
except Exception as e:
|
192 |
-
logger.error(f"Error processing
|
193 |
return []
|
194 |
|
195 |
-
|
|
|
|
|
196 |
"""Process ZIP file contents"""
|
197 |
results = []
|
198 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
199 |
-
|
200 |
-
|
201 |
-
for
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
logger.error(f"Error reading file {filename}: {str(e)}")
|
217 |
return results
|
218 |
|
219 |
-
def _process_single_file(self,
|
220 |
try:
|
221 |
-
file_stat = os.stat(
|
222 |
-
with open(
|
223 |
content = f.read()
|
224 |
|
225 |
return [{
|
226 |
'source': 'file',
|
227 |
-
'filename': os.path.basename(
|
228 |
'file_size': file_stat.st_size,
|
229 |
-
'mime_type': mimetypes.guess_type(
|
230 |
'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
|
231 |
'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
|
232 |
'content': content,
|
@@ -266,18 +262,13 @@ def create_interface():
|
|
266 |
"""Create a comprehensive Gradio interface with advanced features"""
|
267 |
|
268 |
css = """
|
269 |
-
|
270 |
-
.
|
271 |
-
|
272 |
-
.tab { background-color: #ffffff; border-radius: 8px; padding: 20px; margin-bottom: 20px; }
|
273 |
-
.button { background-color: #007bff; color: white; border: none; border-radius: 5px; padding: 10px 20px; cursor: pointer; }
|
274 |
-
.button:hover { background-color: #0056b3; }
|
275 |
-
.warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 5px; }
|
276 |
-
.error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 5px; }
|
277 |
"""
|
278 |
|
279 |
-
with gr.Blocks(css=css, title="Advanced
|
280 |
-
gr.Markdown("# 🌐 Advanced
|
281 |
|
282 |
with gr.Tab("URL Processing"):
|
283 |
url_input = gr.Textbox(
|
@@ -288,7 +279,7 @@ def create_interface():
|
|
288 |
|
289 |
with gr.Tab("File Input"):
|
290 |
file_input = gr.File(
|
291 |
-
label="Upload text
|
292 |
file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
|
293 |
)
|
294 |
|
@@ -309,7 +300,7 @@ def create_interface():
|
|
309 |
placeholder="Paste your JSON data here...",
|
310 |
lines=5
|
311 |
)
|
312 |
-
load_btn = gr.Button("Load Data"
|
313 |
chat_output = gr.Textbox(label="Chatbot Response", interactive=False)
|
314 |
|
315 |
process_btn = gr.Button("Process Input", variant="primary")
|
@@ -320,7 +311,7 @@ def create_interface():
|
|
320 |
# Initialize chatbot
|
321 |
chatbot = Chatbot()
|
322 |
|
323 |
-
def process_all_inputs(urls,
|
324 |
"""Process all input types with progress tracking"""
|
325 |
try:
|
326 |
processor = URLProcessor()
|
@@ -338,16 +329,15 @@ def create_interface():
|
|
338 |
content = processor.fetch_content(url)
|
339 |
if content:
|
340 |
results.append({
|
341 |
-
'source': '
|
342 |
'url': url,
|
343 |
'content': content,
|
344 |
'timestamp': datetime.now().isoformat()
|
345 |
})
|
346 |
|
347 |
# Process files
|
348 |
-
if
|
349 |
-
|
350 |
-
results.extend(combined_data)
|
351 |
|
352 |
# Process text input
|
353 |
if text:
|
@@ -405,7 +395,7 @@ def create_interface():
|
|
405 |
gr.Markdown("""
|
406 |
### Usage Guidelines
|
407 |
- **URL Processing**: Enter valid HTTP/HTTPS URLs
|
408 |
-
- **File Input**: Upload
|
409 |
- **Text Input**: Direct text processing
|
410 |
- **Chat**: Load your JSON data and ask questions about it
|
411 |
- Advanced cleaning and validation included
|
@@ -430,4 +420,4 @@ def main():
|
|
430 |
)
|
431 |
|
432 |
if __name__ == "__main__":
|
433 |
-
main()
|
|
|
7 |
import zipfile
|
8 |
import tempfile
|
9 |
from datetime import datetime
|
10 |
+
from typing import List, Dict, Optional
|
11 |
from pathlib import Path
|
12 |
import requests
|
13 |
import validators
|
|
|
32 |
self.session = requests.Session()
|
33 |
self.timeout = 10 # seconds
|
34 |
self.session.headers.update({
|
35 |
+
'User -Agent': UserAgent().random,
|
36 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
37 |
'Accept-Language': 'en-US,en;q=0.5',
|
38 |
'Accept-Encoding': 'gzip, deflate, br',
|
|
|
96 |
if not file_id:
|
97 |
logger.error(f"Invalid Google Drive URL: {url}")
|
98 |
return None
|
99 |
+
|
100 |
direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
|
101 |
response = self.session.get(direct_url, timeout=self.timeout)
|
102 |
response.raise_for_status()
|
|
|
109 |
except Exception as e:
|
110 |
logger.error(f"Google Drive processing failed: {e}")
|
111 |
return None
|
112 |
+
|
113 |
def _handle_google_calendar(self, url: str) -> Optional[Dict]:
|
114 |
"""Process Google Calendar ICS feeds"""
|
115 |
try:
|
|
|
165 |
except Exception:
|
166 |
return False
|
167 |
|
168 |
+
def process_file(self, file) -> List[Dict]:
|
169 |
+
"""Process uploaded file with enhanced error handling"""
|
170 |
+
if not file:
|
171 |
return []
|
172 |
|
173 |
+
dataset = []
|
174 |
try:
|
175 |
+
file_size = os.path.getsize(file.name)
|
176 |
+
if file_size > self.max_file_size:
|
177 |
+
logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
|
178 |
+
return [{"error": f"File size ({file_size} bytes) exceeds maximum allowed size of {self.max_file_size} bytes."}]
|
|
|
|
|
|
|
|
|
179 |
|
180 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
181 |
+
if zipfile.is_zipfile(file.name):
|
182 |
+
dataset.extend(self._process_zip_file(file.name, temp_dir))
|
|
|
183 |
else:
|
184 |
+
dataset.extend(self._process_single_file(file))
|
185 |
|
186 |
except Exception as e:
|
187 |
+
logger.error(f"Error processing file: {str(e)}")
|
188 |
return []
|
189 |
|
190 |
+
return dataset
|
191 |
+
|
192 |
+
def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
|
193 |
"""Process ZIP file contents"""
|
194 |
results = []
|
195 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
196 |
+
zip_ref.extractall(temp_dir)
|
197 |
+
for root, _, files in os.walk(temp_dir):
|
198 |
+
for filename in files:
|
199 |
+
filepath = os.path.join(root, filename)
|
200 |
+
if self.is_text_file(filepath):
|
201 |
+
try:
|
202 |
+
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
203 |
+
content = f.read()
|
204 |
+
if content.strip():
|
205 |
+
results.append({
|
206 |
+
"source": "file",
|
207 |
+
"filename": filename,
|
208 |
+
"content": content,
|
209 |
+
"timestamp": datetime.now().isoformat()
|
210 |
+
})
|
211 |
+
except Exception as e:
|
212 |
+
logger.error(f"Error reading file {filename}: {str(e)}")
|
|
|
213 |
return results
|
214 |
|
215 |
+
def _process_single_file(self, file) -> List[Dict]:
|
216 |
try:
|
217 |
+
file_stat = os.stat(file.name)
|
218 |
+
with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
|
219 |
content = f.read()
|
220 |
|
221 |
return [{
|
222 |
'source': 'file',
|
223 |
+
'filename': os.path.basename(file.name),
|
224 |
'file_size': file_stat.st_size,
|
225 |
+
'mime_type': mimetypes.guess_type(file.name)[0],
|
226 |
'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
|
227 |
'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
|
228 |
'content': content,
|
|
|
262 |
"""Create a comprehensive Gradio interface with advanced features"""
|
263 |
|
264 |
css = """
|
265 |
+
.container { max-width: 1200px; margin: auto; }
|
266 |
+
.warning { background-color: #fff3cd; color: #856404; }
|
267 |
+
.error { background-color: #f8d7da; color: #721c24; }
|
|
|
|
|
|
|
|
|
|
|
268 |
"""
|
269 |
|
270 |
+
with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
|
271 |
+
gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
|
272 |
|
273 |
with gr.Tab("URL Processing"):
|
274 |
url_input = gr.Textbox(
|
|
|
279 |
|
280 |
with gr.Tab("File Input"):
|
281 |
file_input = gr.File(
|
282 |
+
label="Upload text file or ZIP archive",
|
283 |
file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
|
284 |
)
|
285 |
|
|
|
300 |
placeholder="Paste your JSON data here...",
|
301 |
lines=5
|
302 |
)
|
303 |
+
load_btn = gr.Button("Load Data")
|
304 |
chat_output = gr.Textbox(label="Chatbot Response", interactive=False)
|
305 |
|
306 |
process_btn = gr.Button("Process Input", variant="primary")
|
|
|
311 |
# Initialize chatbot
|
312 |
chatbot = Chatbot()
|
313 |
|
314 |
+
def process_all_inputs(urls, file, text):
|
315 |
"""Process all input types with progress tracking"""
|
316 |
try:
|
317 |
processor = URLProcessor()
|
|
|
329 |
content = processor.fetch_content(url)
|
330 |
if content:
|
331 |
results.append({
|
332 |
+
'source': 'url',
|
333 |
'url': url,
|
334 |
'content': content,
|
335 |
'timestamp': datetime.now().isoformat()
|
336 |
})
|
337 |
|
338 |
# Process files
|
339 |
+
if file:
|
340 |
+
results.extend(file_processor.process_file(file))
|
|
|
341 |
|
342 |
# Process text input
|
343 |
if text:
|
|
|
395 |
gr.Markdown("""
|
396 |
### Usage Guidelines
|
397 |
- **URL Processing**: Enter valid HTTP/HTTPS URLs
|
398 |
+
- **File Input**: Upload text files or ZIP archives
|
399 |
- **Text Input**: Direct text processing
|
400 |
- **Chat**: Load your JSON data and ask questions about it
|
401 |
- Advanced cleaning and validation included
|
|
|
420 |
)
|
421 |
|
422 |
if __name__ == "__main__":
|
423 |
+
main()
|