Update app.py

app.py CHANGED
@@ -5,7 +5,6 @@ import time
 import logging
 import mimetypes
 import zipfile
-import tempfile
 from datetime import datetime
 from typing import List, Dict, Optional, Union
 from pathlib import Path
@@ -32,7 +31,7 @@ class URLProcessor:
         self.session = requests.Session()
         self.timeout = 10  # seconds
         self.session.headers.update({
-            'User-Agent': UserAgent().random,
+            'User -Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
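Note the changed line spells the header name 'User -Agent' with a stray interior space. Because requests keys headers by exact name, that string is sent as a separate, unrecognized header while the default python-requests User-Agent stays in place, so the rotation below silently stops working. A corrected sketch (the fake_useragent import is an assumption; it sits outside this hunk):

    import requests
    from fake_useragent import UserAgent  # assumed import, not shown in the diff

    session = requests.Session()
    session.headers.update({
        'User-Agent': UserAgent().random,  # no space in the header name
        'Accept-Language': 'en-US,en;q=0.5',
    })
    print(session.headers['User-Agent'])  # a random real-browser UA string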
@@ -96,8 +95,8 @@ class URLProcessor:
         if not file_id:
             logger.error(f"Invalid Google Drive URL: {url}")
             return None
-
-        direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+
+        direct_url = f"https://drive.google.com/uc? export=download&id={file_id.group(1)}"
         response = self.session.get(direct_url, timeout=self.timeout)
         response.raise_for_status()
 
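The new line again carries a stray space, this time inside the URL (`uc? export=download`). requests will percent-encode it, Drive will no longer recognize the `export` parameter, and the request is likely to return the HTML preview page instead of the file. The intended rewrite, sketched with an assumed file-id regex (the real match is produced outside this hunk):

    import re
    from typing import Optional

    def drive_direct_url(url: str) -> Optional[str]:
        # The regex is an assumption; the original pattern lives outside this hunk.
        file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
        if not file_id:
            return None
        return f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"

    print(drive_direct_url("https://drive.google.com/file/d/1AbCdEf/view"))
    # https://drive.google.com/uc?export=download&id=1AbCdEf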
@@ -135,7 +134,6 @@ class URLProcessor:
         for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
             element.decompose()
 
-        # Try to find the main content in a more robust way
         main_content = soup.find('main') or soup.find('article') or soup.body
 
         if main_content:
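The lookup chain prefers semantic containers and falls back to `<body>`. A self-contained sketch of the same extraction (the BeautifulSoup import is assumed; the diff only shows the `soup` object):

    from bs4 import BeautifulSoup

    html = "<html><body><nav>menu</nav><article><p>Hello world</p></article></body></html>"
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
        element.decompose()  # strip page chrome and metadata in place
    main_content = soup.find('main') or soup.find('article') or soup.body
    print(main_content.get_text(strip=True))  # Hello world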
@@ -178,29 +176,23 @@ class FileProcessor:
 
         try:
             for file in files:
-                # Check if the file is a Gradio File object or a string path
                 file_path = file.name if isinstance(file, gr.File) else file
 
-                # Log the file path being processed
                 logger.info(f"Processing file: {file_path}")
 
-                # Skip if it's a directory
                 if os.path.isdir(file_path):
                     logger.warning(f"Skipping directory: {file_path}")
                     continue
 
-                # Skip if file doesn't exist
                 if not os.path.exists(file_path):
                     logger.warning(f"File does not exist: {file_path}")
                     continue
 
-                # Check file size
                 file_size = os.path.getsize(file_path)
                 if file_size > self.max_file_size:
                     logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                    continue
+                    continue
 
-                # Process based on file type
                 if zipfile.is_zipfile(file_path):
                     if self.processed_zip_count >= self.max_zip_files:
                         logger.warning(f"Maximum number of ZIP files ({self.max_zip_files}) reached, skipping {file_path}")
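One detail worth keeping in mind about this guard sequence: `os.path.getsize` raises OSError on a missing path, so the existence check must run before the size check (`os.path.isdir` is safe either way, returning False for nonexistent paths). The same gauntlet as a standalone predicate, with an assumed size limit:

    import os

    def is_processable(file_path: str, max_file_size: int = 10 * 1024 * 1024) -> bool:
        # Mirrors the hunk's checks; the 10 MB default is an assumption.
        if os.path.isdir(file_path):
            return False
        if not os.path.exists(file_path):
            return False
        return os.path.getsize(file_path) <= max_file_size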
@@ -218,9 +210,11 @@ class FileProcessor:
             logger.error(f"Error processing files: {str(e)}")
 
         return combined_data
-
+
+    def _process_zip_file(self, zip_path: str) -> List[Dict]:
         """Process ZIP file contents"""
         results = []
+        temp_dir = tempfile.mkdtemp()
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(temp_dir)
         for root, _, files in os.walk(temp_dir):
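This hunk fixes the orphaned docstring by restoring the missing `def _process_zip_file` line, but note the tension with the first hunk: `import tempfile` is deleted there while `tempfile.mkdtemp()` is added here, so unless another tempfile import survives elsewhere in the file, this method raises NameError as soon as it runs. `mkdtemp()` also leaves the extraction directory behind. A sketch of the same step with the import restored and automatic cleanup (names and the "content" field are assumptions):

    import os
    import tempfile
    import zipfile
    from typing import Dict, List

    def read_zip_texts(zip_path: str) -> List[Dict]:
        results = []
        # TemporaryDirectory deletes the extraction dir on exit, unlike mkdtemp()
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            for root, _, files in os.walk(temp_dir):
                for filename in files:
                    path = os.path.join(root, filename)
                    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                    if content.strip():
                        results.append({"source": "file", "filename": filename,
                                        "content": content})
        return results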
@@ -230,7 +224,7 @@ class FileProcessor:
                 try:
                     with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                         content = f.read()
-
+                    if content.strip():
                     results.append({
                         "source": "file",
                         "filename": filename,
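With the guard in place, whitespace-only reads no longer produce result entries. A toy check of the semantics:

    for content in ("data\n", "   \n", ""):
        if content.strip():
            print("kept:", repr(content))  # only "data\n" survives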
@@ -280,7 +274,6 @@ class Chatbot:
         if not self.data:
             return "No data loaded. Please load your JSON data first."
 
-        # Simple keyword-based response logic
         for key, value in self.data.items():
             if key.lower() in user_input.lower():
                 return f"{key}: {value}"
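The response logic is a case-insensitive substring scan, so the first stored key that appears anywhere in the user's message wins. A toy run (the sample data is invented):

    data = {"Python": "a programming language", "Gradio": "a UI library"}
    user_input = "tell me about gradio"
    for key, value in data.items():
        if key.lower() in user_input.lower():
            print(f"{key}: {value}")  # Gradio: a UI library
            break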
@@ -337,7 +330,6 @@ def create_interface():
         output_text = gr.Textbox(label="Processing Results", interactive=False)
         output_file = gr.File(label="Processed Output")
 
-        # Initialize chatbot
         chatbot = Chatbot()
 
         def process_all_inputs(urls, file, text):
@@ -347,7 +339,6 @@ def create_interface():
             file_processor = FileProcessor()
             results = []
 
-            # Process URLs
            if urls:
                url_list = re.split(r'[,\n]', urls)
                url_list = [url.strip() for url in url_list if url.strip()]
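Splitting on both commas and newlines lets users paste URL lists in either style, and the comprehension drops empty fragments. For example:

    import re

    urls = "https://a.example, https://b.example\nhttps://c.example"
    url_list = [u.strip() for u in re.split(r'[,\n]', urls) if u.strip()]
    print(url_list)
    # ['https://a.example', 'https://b.example', 'https://c.example']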
@@ -364,20 +355,17 @@ def create_interface():
                         'timestamp': datetime.now().isoformat()
                     })
 
-            # Process files
             if file:
-                results.extend(file_processor.
+                results.extend(file_processor.process_files(file))
 
-            # Process text input
             if text:
                 cleaned_text = processor.advanced_text_cleaning(text)
                 results.append({
                     'source': 'direct_input',
                     'content': cleaned_text,
-                    'timestamp': datetime.now().isoformat()
+                    'timestamp': datetime.now(). isoformat()
                 })
 
-            # Generate output
             if results:
                 output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
                 output_dir.mkdir(parents=True, exist_ok=True)
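The headline fix in this last hunk is completing the previously truncated `results.extend(file_processor.` into a real `process_files(file)` call; the old line was a syntax error, so the module could not even import. The `datetime.now(). isoformat()` on the new timestamp line looks wrong but is legal Python (whitespace is allowed after the attribute dot), so that one is only cosmetic. The datestamped output location the hunk ends on, as a standalone sketch (the output filename is an assumption; the diff stops at `mkdir`):

    from datetime import datetime
    from pathlib import Path

    output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
    output_dir.mkdir(parents=True, exist_ok=True)  # e.g. output/2024-05-01
    output_path = output_dir / 'processed.json'    # hypothetical filename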