acecalisto3 committed on
Commit 29808fd · verified · 1 Parent(s): 2e3f355

Update app.py

Files changed (1):
  1. app.py +79 -76
app.py CHANGED
@@ -13,10 +13,11 @@ from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
-import cachetools
+from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
+from cleantext import clean
 
 # Setup logging with detailed configuration
 logging.basicConfig(
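Note on the import change above: the in-memory cachetools caches are swapped for diskcache, and cleantext is pulled in for text normalization. A minimal sketch of the diskcache API this commit relies on (the directory name here is illustrative, not taken from app.py); entries live on disk and can carry a TTL:

    from diskcache import Cache

    cache = Cache('cache/url_cache')                      # opens (or creates) an on-disk cache directory
    cache.set('https://example.com', 'ok', expire=3600)   # entry expires after one hour
    print(cache.get('https://example.com'))               # -> 'ok' until the TTL lapses, else None
    cache.close()

Because the data lives on disk, a restarted process sees the same entries, which the old LRUCache/TTLCache objects could not provide.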
@@ -30,17 +31,13 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 class URLProcessor:
-    """Class to handle URL processing with advanced features"""
-
-    def __init__(self, timeout: int = 15, max_retries: int = 3, concurrent_requests: int = 5, cache_size: int = 100):
-        self.timeout = timeout
-        self.max_retries = max_retries
-        self.concurrent_requests = concurrent_requests
-        self.ua = UserAgent()  # Initialize UserAgent
+    def __init__(self, timeout=15, max_retries=3, concurrent_requests=5, cache_dir='cache'):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(exist_ok=True)
 
-        # Implement multilevel caching
-        self.url_cache = cachetools.LRUCache(maxsize=cache_size)
-        self.content_cache = cachetools.TTLCache(maxsize=cache_size, ttl=3600)  # 1-hour cache
+        # Persistent disk-based caches
+        self.url_cache = Cache(str(self.cache_dir / 'url_cache'))
+        self.content_cache = Cache(str(self.cache_dir / 'content_cache'), size_limit=2**30)
 
         self.session = requests.Session()
         self.session.headers.update({
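The rewritten __init__ moves caching onto disk under cache_dir. It uses Path, which is presumably imported from pathlib elsewhere in app.py (the import is not visible in this hunk). A minimal standalone sketch of the same setup, with an illustrative directory name:

    from pathlib import Path
    from diskcache import Cache

    cache_dir = Path('cache')            # assumption: resolved relative to the app's working directory
    cache_dir.mkdir(exist_ok=True)
    url_cache = Cache(str(cache_dir / 'url_cache'))
    content_cache = Cache(str(cache_dir / 'content_cache'), size_limit=2**30)  # roughly a 1 GiB on-disk cap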
@@ -97,63 +94,64 @@ class URLProcessor:
 
     @sleep_and_retry
     @limits(calls=20, period=60)  # Refined rate limiting
-    def fetch_content(self, url: str) -> Optional[str]:
-        """Fetch content from URL with retry mechanism"""
-        # Check content cache first
-        if url in self.content_cache:
-            return self.content_cache[url]
-
-        for attempt in range(self.max_retries):
-            try:
-                response = self.session.get(url, timeout=self.timeout)
-                response.raise_for_status()
-
-                # Use BeautifulSoup for more robust parsing
-                soup = BeautifulSoup(response.text, 'html.parser')
-
-                # Remove scripts, styles, comments
-                for script in soup(["script", "style"]):
-                    script.decompose()
-
-                # Extract clean text
-                text = soup.get_text(separator=' ')
-                cleaned_text = self.advanced_text_cleaning(text)
-
-                # Cache the result
-                self.content_cache[url] = cleaned_text
-                return cleaned_text
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        cached = self.content_cache.get(url)
+        if cached:
+            return cached
+
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract structured elements
+            title = soup.title.text.strip() if soup.title else ''
+            meta_desc = soup.find('meta', {'name': 'description'})
+            description = meta_desc['content'].strip() if meta_desc else ''
+
+            headings = [{'level': h.name, 'text': h.text.strip()}
+                        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
+
+            links = [a['href'] for a in soup.find_all('a', href=True)
+                     if validators.url(a['href'])]
+
+            # Main content extraction
+            for element in soup(['script', 'style', 'nav', 'footer']):
+                element.decompose()
 
-            except requests.RequestException as e:
-                logger.warning(f"Fetch attempt {attempt + 1} failed for {url}: {e}")
-                time.sleep(2 ** attempt)  # Exponential backoff
-
-        return None
+            main_content = soup.find('main') or soup.find('article') or soup.body
+            text = main_content.get_text(separator=' ') if main_content else ''
+
+            structured_data = {
+                'title': title,
+                'description': description,
+                'headings': headings,
+                'links': links,
+                'content': self.advanced_text_cleaning(text),
+                'status_code': response.status_code,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+
+            self.content_cache.set(url, structured_data, expire=3600)
+            return structured_data
+        except Exception as e:
+            logger.error(f"Error fetching {url}: {e}")
+            return None
 
     def advanced_text_cleaning(self, text: str) -> str:
-        """Sophisticated text cleaning and normalization"""
-        if not text:
-            return ""
-
-        # Remove control characters
-        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
-
-        # Normalize Unicode characters
-        text = text.encode('ascii', 'ignore').decode('ascii')
-
-        # Replace multiple whitespaces
-        text = re.sub(r'\s+', ' ', text)
-
-        # Remove HTML entities
-        text = re.sub(r'&[a-zA-Z]+;', '', text)
-
-        # Normalize quotation marks
-        text = text.replace('“', '"').replace('”', '"')
-        text = text.replace('‘', "'").replace('’', "'")
-
-        # Remove excessive punctuation
-        text = re.sub(r'([.,!?]){2,}', r'\1', text)
-
-        return text.strip()
+        return clean(text,
+                     fix_unicode=True,
+                     to_ascii=True,
+                     lower=True,
+                     no_line_breaks=True,
+                     no_urls=True,
+                     no_emails=True,
+                     no_phone_numbers=True,
+                     no_numbers=False,
+                     no_digits=False,
+                     no_currency_symbols=True,
+                     no_punct=False
+                     ).strip()
 
 class FileProcessor:
     """Class to handle file processing"""
@@ -170,6 +168,8 @@ class FileProcessor:
         except Exception:
             return False
 
+
+
     def process_file(self, file) -> List[Dict]:
         """Process uploaded file with enhanced error handling"""
         if not file:
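Referring back to the fetch_content rewrite in the URLProcessor hunk above: it no longer returns a bare cleaned string but a structured dict, cached on disk for an hour. A hedged usage sketch, assuming the processor is constructed with its defaults and using an illustrative URL:

    processor = URLProcessor()
    page = processor.fetch_content('https://example.com')   # illustrative URL
    if page:
        print(page['title'], page['status_code'])
        print(page['content'][:200])                         # cleaned main-content text
    else:
        print('fetch failed; the reason was logged')         # fetch_content logs and returns None on error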
@@ -218,20 +218,23 @@ class FileProcessor:
         return results
 
     def _process_single_file(self, file) -> List[Dict]:
-        """Process single file"""
-        results = []
         try:
+            file_stat = os.stat(file.name)
             content = file.read().decode('utf-8', errors='ignore')
-            if content.strip():
-                results.append({
-                    "source": "file",
-                    "filename": os.path.basename(file.name),
-                    "content": content,
-                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-                })
+
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
         except Exception as e:
-            logger.error(f"Error processing single file: {str(e)}")
-            return results
+            logger.error(f"File processing error: {e}")
+            return []
 
 def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""