acecalisto3 committed on
Commit 1336a84 · verified · 1 Parent(s): eed2b1e

Update app.py

Files changed (1)
  1. app.py +68 -130
app.py CHANGED
@@ -31,152 +31,90 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)

 class URLProcessor:
-    """Class to handle URL processing with advanced features"""
-
-    def __init__(self, timeout=15, max_retries=3, cache_dir='cache'):
-        self.ua = UserAgent()  # Initialize UserAgent first
-        self.timeout = timeout
-        self.max_retries = max_retries
-
-        # Persistent caching setup
-        self.cache_dir = Path(cache_dir)
-        self.cache_dir.mkdir(exist_ok=True)
-        self.url_cache = Cache(str(self.cache_dir / 'url_cache'))
-        self.content_cache = Cache(str(self.cache_dir / 'content_cache'), size_limit=2**30)
-
-        # Session configuration
-        self.session = requests.Session()
-        self.session.headers.update({
-            'User-Agent': self.ua.random,  # Correct header key
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Connection': 'keep-alive'
-        })
-    @sleep_and_retry
-    @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
-    def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
-        """Validate URL format and accessibility"""
+    def advanced_text_cleaning(self, text: str) -> str:
+        """Robust text cleaning with version compatibility"""
         try:
-            # Check cache first
-            if url in self.url_cache:
-                return self.url_cache[url]
-
-            result = urlparse(url)
-            validation_result = {
-                'is_valid': False,
-                'message': 'Invalid URL',
-                'scheme': result.scheme,
-                'netloc': result.netloc
-            }
-
-            if not all([result.scheme, result.netloc]):
-                validation_result['message'] = 'Missing scheme or network location'
-                return validation_result
-
-            if not validators.url(url):
-                validation_result['message'] = 'URL format validation failed'
-                return validation_result
-
-            # Perform HEAD request for accessibility
-            try:
-                response = self.session.head(url, timeout=self.timeout, allow_redirects=True)
-                validation_result['is_valid'] = response.status_code in [200, 301, 302]
-                validation_result['status_code'] = response.status_code
-                validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
-            except requests.RequestException as e:
-                validation_result['message'] = f"Connection error: {str(e)}"
-
-            # Cache the result
-            self.url_cache[url] = validation_result
-            return validation_result
-
-        except Exception as e:
-            logger.error(f"Unexpected error validating URL {url}: {e}")
-            return {
-                'is_valid': False,
-                'message': f"Unexpected validation error: {str(e)}"
-            }
+            # Modern clean-text parameters
+            return clean(text,
+                         fix_unicode=True,
+                         to_ascii=True,
+                         lower=True,
+                         no_line_breaks=True,
+                         no_urls=True,
+                         no_emails=True,
+                         no_phone_numbers=True,
+                         no_numbers=False,
+                         no_digits=False,
+                         no_currency_symbols=True,
+                         no_punct=False
+                         ).strip()
+        except TypeError as e:
+            # Fallback to basic cleaning
+            logger.warning("Using fallback text cleaning method")
+            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Control chars
+            text = text.encode('ascii', 'ignore').decode('ascii')  # Unicode
+            text = re.sub(r'\s+', ' ', text)  # Whitespace
+            return text.strip()

-    @sleep_and_retry
-    @limits(calls=20, period=60)  # Refined rate limiting
     def fetch_content(self, url: str) -> Optional[Dict]:
-        """Fetch and structure content from URL, handling Google Drive and Calendar links."""
+        """Universal content fetcher with special case handling"""
+        # Google Drive document handling
         if 'drive.google.com' in url:
-            # Convert Google Drive URL to direct download link
+            return self._handle_google_drive(url)
+
+        # Google Calendar ICS handling
+        if 'calendar.google.com' in url and 'ical' in url:
+            return self._handle_google_calendar(url)
+
+        # Standard HTML processing
+        return self._fetch_html_content(url)
+
+    def _handle_google_drive(self, url: str) -> Optional[Dict]:
+        """Process Google Drive file links"""
+        try:
             file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if file_id:
-                url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-            else:
+            if not file_id:
                 logger.error(f"Invalid Google Drive URL: {url}")
                 return None
-
-        cached = self.content_cache.get(url)
-        if cached:
-            return cached
-
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-
-            # Handle ICS files (Google Calendar)
-            if 'text/calendar' in response.headers.get('Content-Type', ''):
-                return {
-                    'content': response.text,
-                    'content_type': 'text/calendar',
-                    'timestamp': datetime.now().isoformat()
-                }
-
-            # Handle HTML content
-            soup = BeautifulSoup(response.text, 'html.parser')
-            title = soup.title.text.strip() if soup.title else ''
-            meta_desc = soup.find('meta', {'name': 'description'})
-            description = meta_desc['content'].strip() if meta_desc else ''
-
-            headings = [{'level': h.name, 'text': h.text.strip()}
-                        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
-
-            links = [a['href'] for a in soup.find_all('a', href=True)
-                     if validators.url(a['href'])]
-
-            # Main content extraction
-            for element in soup(['script', 'style', 'nav', 'footer']):
-                element.decompose()

-            main_content = soup.find('main') or soup.find('article') or soup.body
-            text = main_content.get_text(separator=' ') if main_content else ''
+            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+            response = self.session.get(direct_url, timeout=self.timeout)
+            response.raise_for_status()

-            structured_data = {
-                'title': title,
-                'description': description,
-                'headings': headings,
-                'links': links,
-                'content': self.advanced_text_cleaning(text),
-                'status_code': response.status_code,
+            return {
+                'content': response.text,
                 'content_type': response.headers.get('Content-Type', ''),
                 'timestamp': datetime.now().isoformat()
             }
-
-            self.content_cache.set(url, structured_data, expire=3600)
-            return structured_data
         except Exception as e:
-            logger.error(f"Error fetching {url}: {e}")
+            logger.error(f"Google Drive processing failed: {e}")
             return None

-    def advanced_text_cleaning(self, text: str) -> str:
-        return clean(text,
-                     fix_unicode=True,
-                     to_ascii=True,
-                     lower=True,
-                     no_line_breaks=True,
-                     no_urls=True,
-                     no_emails=True,
-                     no_phone_numbers=True,
-                     no_numbers=False,
-                     no_digits=False,
-                     no_currency_symbols=True,
-                     no_punct=False
-                     ).strip()
-
+    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+        """Process Google Calendar ICS feeds"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            return {
+                'content': response.text,
+                'content_type': 'text/calendar',
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Calendar fetch failed: {e}")
+            return None
+
+    def _fetch_html_content(self, url: str) -> Optional[Dict]:
+        """Standard HTML content processing"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # ... existing HTML processing logic ...
+            return structured_data
+        except Exception as e:
+            logger.error(f"HTML processing failed: {e}")
+            return None
+
 class FileProcessor:
     """Class to handle file processing"""
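A note on the new `_fetch_html_content`: as committed it returns `structured_data`, but that variable is never built; the extraction step is only hinted at by the placeholder comment. Below is a minimal, self-contained sketch of what that step could look like, assembled from the HTML-processing logic this commit removes from `fetch_content`. The standalone function name `fetch_html_content` and the `clean_fn` hook are illustrative assumptions, not part of the commit.

import logging
from datetime import datetime
from typing import Dict, Optional

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

def fetch_html_content(session: requests.Session, url: str, timeout: int = 15,
                       clean_fn=lambda t: ' '.join(t.split())) -> Optional[Dict]:
    """Illustrative stand-in for URLProcessor._fetch_html_content (assumed name),
    rebuilt from the HTML-processing logic removed in this commit."""
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Drop non-content elements before extracting text, as the old code did
        for element in soup(['script', 'style', 'nav', 'footer']):
            element.decompose()

        main_content = soup.find('main') or soup.find('article') or soup.body
        text = main_content.get_text(separator=' ') if main_content else ''

        return {
            'title': soup.title.text.strip() if soup.title else '',
            'content': clean_fn(text),
            'status_code': response.status_code,
            'content_type': response.headers.get('Content-Type', ''),
            'timestamp': datetime.now().isoformat(),
        }
    except Exception as e:
        logger.error(f"HTML processing failed: {e}")
        return None

Wired back into the class, `session`, `timeout`, and `clean_fn` would map to `self.session`, `self.timeout`, and `self.advanced_text_cleaning`.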
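The rewritten `advanced_text_cleaning` wraps the `clean-text` call in `try/except TypeError`, so an installed version that rejects one of the keyword arguments falls back to plain regex normalisation. A self-contained sketch of the same pattern, with an assumed function name and sample string:

import re

from cleantext import clean  # pip install clean-text

def clean_text_compat(text: str) -> str:
    """Prefer clean-text; fall back to basic normalisation on TypeError."""
    try:
        return clean(text, fix_unicode=True, to_ascii=True, lower=True,
                     no_line_breaks=True, no_urls=True, no_emails=True).strip()
    except TypeError:
        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)       # control characters
        text = text.encode('ascii', 'ignore').decode('ascii')  # non-ASCII
        return re.sub(r'\s+', ' ', text).strip()                # collapse whitespace

print(clean_text_compat("Visit  https://example.com  for more info…"))
# Exact output depends on the installed clean-text version and its URL placeholder.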
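`_handle_google_drive` rewrites a `/file/d/<id>/...` share link into Google Drive's `uc?export=download` form before fetching it. The rewrite in isolation (the helper name is an assumption; the commit performs this inline):

import re
from typing import Optional

def drive_download_url(url: str) -> Optional[str]:
    """Return the direct-download form of a Google Drive share link, or None."""
    match = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
    if not match:
        return None
    return f"https://drive.google.com/uc?export=download&id={match.group(1)}"

print(drive_download_url("https://drive.google.com/file/d/abc123XYZ/view"))
# -> https://drive.google.com/uc?export=download&id=abc123XYZ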