import json
import os
import re
import time
import logging
import mimetypes
import zipfile
from datetime import datetime
from typing import List, Dict, Optional, Union
from pathlib import Path

import requests
import validators
import gradio as gr
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from cleantext import clean
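
# Third-party dependencies (PyPI package names):
# pip install requests validators gradio beautifulsoup4 fake-useragent clean-text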

# Setup logging with detailed configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('app.log', encoding='utf-8')
    ]
)
logger = logging.getLogger(__name__)


class URLProcessor:
    """Fetch and clean content from URLs, with special handling for Google services."""

    def __init__(self):
        self.session = requests.Session()
        self.timeout = 10  # seconds
        self.session.headers.update({
            'User-Agent': UserAgent().random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        })

    def advanced_text_cleaning(self, text: str) -> str:
        """Robust text cleaning with version compatibility."""
        try:
            cleaned_text = clean(
                text,
                fix_unicode=True,
                to_ascii=True,
                lower=True,
                no_line_breaks=True,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
                no_numbers=False,
                no_digits=False,
                no_currency_symbols=True,
                no_punct=False
            ).strip()
            return cleaned_text
        except Exception as e:
            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
            return text.strip()
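
    # Illustrative behavior: with the settings above, clean-text lowercases the
    # text, transliterates it to ASCII, collapses line breaks, and replaces
    # URLs/emails/phone numbers with clean-text's default placeholder tokens.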

    def validate_url(self, url: str) -> Dict:
        """Validate URL format and accessibility."""
        try:
            if not validators.url(url):
                return {'is_valid': False, 'message': 'Invalid URL format'}
            # HEAD avoids downloading the body; note that some servers reject
            # HEAD requests, which will surface here as a validation failure
            response = self.session.head(url, timeout=self.timeout)
            response.raise_for_status()
            return {'is_valid': True, 'message': 'URL is valid and accessible'}
        except Exception as e:
            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

    def fetch_content(self, url: str) -> Optional[Dict]:
        """Universal content fetcher with special-case handling."""
        try:
            if 'drive.google.com' in url:
                return self._handle_google_drive(url)
            if 'calendar.google.com' in url and 'ical' in url:
                return self._handle_google_calendar(url)
            return self._fetch_html_content(url)
        except Exception as e:
            logger.error(f"Content fetch failed: {e}")
            return None

    def _handle_google_drive(self, url: str) -> Optional[Dict]:
        """Process Google Drive file links."""
        try:
            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
            if not file_id:
                logger.error(f"Invalid Google Drive URL: {url}")
                return None
            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
            response = self.session.get(direct_url, timeout=self.timeout)
            response.raise_for_status()
            return {
                'content': response.text,
                'content_type': response.headers.get('Content-Type', ''),
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Google Drive processing failed: {e}")
            return None

    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
        """Process Google Calendar ICS feeds."""
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            return {
                'content': response.text,
                'content_type': 'text/calendar',
                'timestamp': datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Calendar fetch failed: {e}")
            return None

    def _fetch_html_content(self, url: str) -> Optional[Dict]:
        """Standard HTML content processing."""
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Drop non-content elements before extracting text
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                element.decompose()
            main_content = soup.find('main') or soup.find('article') or soup.body
            if main_content:
                text_content = main_content.get_text(separator='\n', strip=True)
                cleaned_content = self.advanced_text_cleaning(text_content)
                return {
                    'content': cleaned_content,
                    'content_type': response.headers.get('Content-Type', ''),
                    'timestamp': datetime.now().isoformat()
                }
            else:
                logger.warning(f"No main content found for URL: {url}")
                return None
        except Exception as e:
            logger.error(f"HTML processing failed: {e}")
            return None
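
    # Note: 'html.parser' is Python's built-in parser; swapping in 'lxml' (an
    # extra dependency) is a common speed-up if parsing becomes a bottleneck.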


class FileProcessor:
    """Class to handle file processing."""

    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
        self.max_file_size = max_file_size
        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
        self.processed_zip_count = 0
        self.max_zip_files = 5

    def is_text_file(self, file_path: str) -> bool:
        """Check if the file is a text file based on its extension."""
        return any(file_path.lower().endswith(ext) for ext in self.supported_text_extensions)

    def process_files(self, files: Union[List[gr.File], List[str]]) -> List[Dict]:
        """Process multiple uploaded files and return a combined list of extractions."""
        if not files:
            return []
        combined_data = []
        self.processed_zip_count = 0
        try:
            for file in files:
                # Gradio upload objects expose the temp path via .name;
                # plain strings are treated as paths directly
                file_path = file.name if hasattr(file, 'name') else file
                logger.info(f"Processing file: {file_path}")
                if os.path.isdir(file_path):
                    logger.warning(f"Skipping directory: {file_path}")
                    continue
                if not os.path.exists(file_path):
                    logger.warning(f"File does not exist: {file_path}")
                    continue
                file_size = os.path.getsize(file_path)
                if file_size > self.max_file_size:
                    logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                    continue
                if zipfile.is_zipfile(file_path):
                    if self.processed_zip_count >= self.max_zip_files:
                        logger.warning(f"Maximum number of ZIP files ({self.max_zip_files}) reached, skipping {file_path}")
                        continue
                    self.processed_zip_count += 1
                    combined_data.extend(self._process_zip_file(file_path))
                elif self.is_text_file(file_path):
                    combined_data.extend(self._process_single_file(file_path))
                else:
                    logger.warning(f"Unsupported file type: {file_path}")
        except Exception as e:
            logger.error(f"Error processing files: {str(e)}")
        return combined_data

    def _process_single_file(self, file_path: str) -> List[Dict]:
        """Read a single text file and return its content with filesystem metadata."""
        try:
            file_stat = os.stat(file_path)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            return [{
                'source': 'file',
                'filename': os.path.basename(file_path),
                'file_size': file_stat.st_size,
                'mime_type': mimetypes.guess_type(file_path)[0],
                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
                'content': content,
                'timestamp': datetime.now().isoformat()
            }]
        except Exception as e:
            logger.error(f"File processing error: {e}")
            return []

    def _process_zip_file(self, zip_file_path: str) -> List[Dict]:
        """Process a ZIP file and extract data from text files within."""
        extracted_data = []
        try:
            with zipfile.ZipFile(zip_file_path, 'r') as zf:
                for name in zf.namelist():
                    if self.is_text_file(name):
                        try:
                            file_info = zf.getinfo(name)
                            with zf.open(name) as f:
                                content = f.read().decode('utf-8', errors='ignore')
                            # ZipInfo stores a single modification timestamp,
                            # so it is used for both 'created' and 'modified'
                            extracted_data.append({
                                'source': 'zip',
                                'filename': name,
                                'file_size': file_info.file_size,
                                'mime_type': mimetypes.guess_type(name)[0],
                                'created': datetime(*file_info.date_time).isoformat(),
                                'modified': datetime(*file_info.date_time).isoformat(),
                                'content': content,
                                'timestamp': datetime.now().isoformat()
                            })
                        except Exception as e:
                            logger.error(f"Error processing file {name} from ZIP: {e}")
        except zipfile.BadZipFile:
            logger.error(f"Error: {zip_file_path} is not a valid ZIP file.")
        except Exception as e:
            logger.error(f"Error processing ZIP file {zip_file_path}: {e}")
        return extracted_data


class Chatbot:
    """Simple chatbot that uses provided JSON data for responses."""

    def __init__(self):
        self.data = None

    def load_data(self, json_data: str):
        """Load JSON data into the chatbot."""
        try:
            self.data = json.loads(json_data)
            return "Data loaded successfully!"
        except json.JSONDecodeError:
            return "Invalid JSON data. Please check your input."

    def chat(self, user_input: str) -> str:
        """Generate a response based on user input and loaded data."""
        if not self.data:
            return "No data loaded. Please load your JSON data first."
        if not isinstance(self.data, dict):
            return "Loaded data must be a JSON object (key/value pairs) for chat lookups."
        for key, value in self.data.items():
            if key.lower() in user_input.lower():
                return f"{key}: {value}"
        return "I don't have information on that. Please ask about something else."


def create_interface():
    """Create a comprehensive Gradio interface with advanced features."""
    css = """
    .container { max-width: 1200px; margin: auto; }
    .warning { background-color: #fff3cd; color: #856404; }
    .error { background-color: #f8d7da; color: #721c24; }
    """
    with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
        gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
        with gr.Tab("URL Processing"):
            url_input = gr.Textbox(
                label="Enter URLs (comma or newline separated)",
                lines=5,
                placeholder="https://example1.com\nhttps://example2.com"
            )
        with gr.Tab("File Input"):
            file_input = gr.File(
                label="Upload text file or ZIP archive",
                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
            )
        with gr.Tab("Text Input"):
            text_input = gr.Textbox(
                label="Raw Text Input",
                lines=5,
                placeholder="Paste your text here..."
            )
        with gr.Tab("Chat"):
            json_input = gr.Textbox(
                label="Load JSON Data",
                placeholder="Paste your JSON data here...",
                lines=5
            )
            load_btn = gr.Button("Load Data", variant="primary")
            chat_input = gr.Textbox(
                label="Chat with your data",
                placeholder="Type your question here..."
            )
            chat_output = gr.Textbox(label="Chatbot Response", interactive=False)
        process_btn = gr.Button("Process Input", variant="primary")
        output_text = gr.Textbox(label="Processing Results", interactive=False)
        output_file = gr.File(label="Processed Output")
        chatbot = Chatbot()

        def process_all_inputs(urls, file, text):
            """Process all input types and write combined results to a JSON file."""
            try:
                processor = URLProcessor()
                file_processor = FileProcessor()
                results = []
                if urls:
                    url_list = re.split(r'[,\n]', urls)
                    url_list = [url.strip() for url in url_list if url.strip()]
                    for url in url_list:
                        validation = processor.validate_url(url)
                        if validation.get('is_valid'):
                            content = processor.fetch_content(url)
                            if content:
                                results.append({
                                    'source': 'url',
                                    'url': url,
                                    'content': content,
                                    'timestamp': datetime.now().isoformat()
                                })
                if file:
                    # gr.File yields a single upload here; process_files expects a list
                    uploads = file if isinstance(file, list) else [file]
                    results.extend(file_processor.process_files(uploads))
                if text:
                    cleaned_text = processor.advanced_text_cleaning(text)
                    results.append({
                        'source': 'direct_input',
                        'content': cleaned_text,
                        'timestamp': datetime.now().isoformat()
                    })
                if results:
                    output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
                    output_dir.mkdir(parents=True, exist_ok=True)
                    output_path = output_dir / f'processed_{int(time.time())}.json'
                    with open(output_path, 'w', encoding='utf-8') as f:
                        json.dump(results, f, ensure_ascii=False, indent=2)
                    summary = f"Processed {len(results)} items successfully!"
                    return str(output_path), summary
                else:
                    return None, "No valid content to process."
            except Exception as e:
                logger.error(f"Processing error: {e}")
                return None, f"Error: {str(e)}"

        def load_chat_data(json_data):
            """Load JSON data into the chatbot."""
            return chatbot.load_data(json_data)

        def chat_with_data(user_input):
            """Chat with the loaded data."""
            return chatbot.chat(user_input)

        process_btn.click(
            process_all_inputs,
            inputs=[url_input, file_input, text_input],
            outputs=[output_file, output_text]
        )
        load_btn.click(
            load_chat_data,
            inputs=json_input,
            outputs=chat_output
        )
        chat_input.submit(
            chat_with_data,
            inputs=chat_input,
            outputs=chat_output
        )
        gr.Markdown("""
        ### Usage Guidelines
        - **URL Processing**: Enter valid HTTP/HTTPS URLs
        - **File Input**: Upload text files or ZIP archives
        - **Text Input**: Direct text processing
        - **Chat**: Load JSON data and ask questions about it
        - Advanced cleaning and validation included
        """)
    return interface


def main():
    # Initialize the MIME type registry
    mimetypes.init()
    # Create and launch the interface; 0.0.0.0:7860 is the Gradio default
    # expected by containerized hosts such as Hugging Face Spaces
    interface = create_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        inbrowser=False,  # Disable browser opening in container
        debug=False       # Disable debug mode for production
    )


if __name__ == "__main__":
    main()