Da-123 committed on
Commit
5c85daa
·
1 Parent(s): 810398e
agentic_implementation/email_scraper.py CHANGED
@@ -13,7 +13,7 @@ from dotenv import load_dotenv
13
  from zoneinfo import ZoneInfo
14
  from email.utils import parsedate_to_datetime
15
  from typing import List, Dict
16
-
17
  load_dotenv()
18
 
19
  # Email credentials
@@ -26,33 +26,7 @@ def validate_email_setup():
26
  """Validate email setup and credentials"""
27
  print("=== Email Setup Validation ===")
28
 
29
- # Check .env file existence
30
- # env_file_exists = os.path.exists('.env')
31
- # print(f".env file exists: {'✅ Yes' if env_file_exists else '❌ No'}")
32
-
33
- # if not env_file_exists:
34
- # print("❌ No .env file found! Create one with:")
35
- # print(" [email protected]")
36
- # print(" APP_PASSWORD=your_16_char_app_password")
37
- # print(" OPENAI_API_KEY=your_openai_key")
38
- # return False
39
-
40
- # Check environment variables
41
  issues = []
42
-
43
- # if not EMAIL_ID:
44
- # issues.append("EMAIL_ID not set or empty")
45
- # elif '@' not in EMAIL_ID:
46
- # issues.append("EMAIL_ID doesn't look like an email address")
47
- # elif not EMAIL_ID.endswith('@gmail.com'):
48
- # issues.append("EMAIL_ID should be a Gmail address (@gmail.com)")
49
-
50
- # if not APP_PASSWORD:
51
- # issues.append("APP_PASSWORD not set or empty")
52
- # elif len(APP_PASSWORD) != 16:
53
- # issues.append(f"APP_PASSWORD should be 16 characters, got {len(APP_PASSWORD)}")
54
- # elif ' ' in APP_PASSWORD:
55
- # issues.append("APP_PASSWORD should not contain spaces (remove spaces from app password)")
56
 
57
  if not os.getenv("OPENAI_API_KEY"):
58
  issues.append("OPENAI_API_KEY not set (needed for query processing)")
@@ -76,9 +50,6 @@ def _imap_connect():
76
 
77
  if EMAIL_ID:
78
  print(f"Email ID: {EMAIL_ID[:5]}...@{EMAIL_ID.split('@')[1] if '@' in EMAIL_ID else 'INVALID'}")
79
- # if APP_PASSWORD:
80
- # print(f"App Password length: {len(APP_PASSWORD)} characters")
81
- # print(f"App Password format: {'✅ Looks correct (16 chars)' if len(APP_PASSWORD) == 16 else f'❌ Expected 16 chars, got {len(APP_PASSWORD)}'}")
82
 
83
  if not EMAIL_ID or not APP_PASSWORD:
84
  error_msg = "Missing credentials in environment variables!"
@@ -198,141 +169,6 @@ def _is_date_in_range(email_date: str, start_date: str, end_date: str) -> bool:
198
  except ValueError:
199
  return False
200
 
201
- def scrape_emails_from_sender(sender_email: str, start_date: str, end_date: str) -> List[Dict]:
202
- """
203
- Scrape emails from specific sender within date range
204
- Uses intelligent caching to avoid re-scraping
205
- """
206
- print(f"Scraping emails from {sender_email} between {start_date} and {end_date}")
207
-
208
- # Load existing database
209
- db = _load_email_db()
210
- sender_email = sender_email.lower().strip()
211
-
212
- # Check if we have cached emails for this sender
213
- if sender_email in db:
214
- cached_emails = db[sender_email].get("emails", [])
215
-
216
- # Filter cached emails by date range
217
- filtered_emails = [
218
- email for email in cached_emails
219
- if _is_date_in_range(email["date"], start_date, end_date)
220
- ]
221
-
222
- # Check if we need to scrape more recent emails
223
- last_scraped = db[sender_email].get("last_scraped", "01-Jan-2020")
224
- today = datetime.today().strftime("%d-%b-%Y")
225
-
226
- if last_scraped == today and filtered_emails:
227
- print(f"Using cached emails (last scraped: {last_scraped})")
228
- return filtered_emails
229
-
230
- # Need to scrape emails
231
- try:
232
- mail = _imap_connect()
233
-
234
- # Prepare IMAP search criteria
235
- start_imap = _date_to_imap_format(start_date)
236
- # Add one day to end_date for BEFORE criteria (IMAP BEFORE is exclusive)
237
- end_dt = datetime.strptime(end_date, "%d-%b-%Y") + timedelta(days=1)
238
- end_imap = end_dt.strftime("%d-%b-%Y")
239
-
240
- search_criteria = f'(FROM "{sender_email}") SINCE "{start_imap}" BEFORE "{end_imap}"'
241
- print(f"IMAP search: {search_criteria}")
242
-
243
- # Search for emails
244
- status, data = mail.search(None, search_criteria)
245
- if status != 'OK':
246
- raise Exception(f"IMAP search failed: {status}")
247
-
248
- email_ids = data[0].split()
249
- print(f"Found {len(email_ids)} emails")
250
-
251
- scraped_emails = []
252
-
253
- # Process each email
254
- for i, email_id in enumerate(email_ids):
255
- try:
256
- print(f"Processing email {i+1}/{len(email_ids)}")
257
-
258
- # Fetch email
259
- status, msg_data = mail.fetch(email_id, "(RFC822)")
260
- if status != 'OK':
261
- continue
262
-
263
- # Parse email
264
- msg = message_from_bytes(msg_data[0][1])
265
-
266
- # Extract information
267
- subject = msg.get("Subject", "No Subject")
268
- content = _email_to_clean_text(msg)
269
-
270
- # Parse date
271
- date_header = msg.get("Date", "")
272
- if date_header:
273
- try:
274
- dt_obj = parsedate_to_datetime(date_header)
275
- # Convert to IST
276
- ist_dt = dt_obj.astimezone(ZoneInfo("Asia/Kolkata"))
277
- email_date = ist_dt.strftime("%d-%b-%Y")
278
- email_time = ist_dt.strftime("%H:%M:%S")
279
- except:
280
- email_date = datetime.today().strftime("%d-%b-%Y")
281
- email_time = "00:00:00"
282
- else:
283
- email_date = datetime.today().strftime("%d-%b-%Y")
284
- email_time = "00:00:00"
285
-
286
- # Get message ID for deduplication
287
- message_id = msg.get("Message-ID", f"missing-{email_id.decode()}")
288
-
289
- scraped_emails.append({
290
- "date": email_date,
291
- "time": email_time,
292
- "subject": subject,
293
- "content": content[:2000], # Limit content length
294
- "message_id": message_id
295
- })
296
-
297
- except Exception as e:
298
- print(f"Error processing email {email_id}: {e}")
299
- continue
300
-
301
- mail.logout()
302
-
303
- # Update database
304
- if sender_email not in db:
305
- db[sender_email] = {"emails": [], "last_scraped": ""}
306
-
307
- # Merge with existing emails (avoid duplicates)
308
- existing_emails = db[sender_email].get("emails", [])
309
- existing_ids = {email.get("message_id") for email in existing_emails}
310
-
311
- new_emails = [
312
- email for email in scraped_emails
313
- if email["message_id"] not in existing_ids
314
- ]
315
-
316
- # Update database
317
- db[sender_email]["emails"] = existing_emails + new_emails
318
- db[sender_email]["last_scraped"] = datetime.today().strftime("%d-%b-%Y")
319
-
320
- # Save database
321
- _save_email_db(db)
322
-
323
- # Return filtered results
324
- all_emails = db[sender_email]["emails"]
325
- filtered_emails = [
326
- email for email in all_emails
327
- if _is_date_in_range(email["date"], start_date, end_date)
328
- ]
329
-
330
- print(f"Scraped {len(new_emails)} new emails, returning {len(filtered_emails)} in date range")
331
- return filtered_emails
332
-
333
- except Exception as e:
334
- print(f"Email scraping failed: {e}")
335
- raise
336
 
337
  def scrape_emails_by_text_search(keyword: str, start_date: str, end_date: str) -> List[Dict]:
338
  """
@@ -455,7 +291,7 @@ def scrape_emails_by_text_search(keyword: str, start_date: str, end_date: str) -
455
  if __name__ == "__main__":
456
  # Test scraping
457
  try:
458
- emails = scrape_emails_from_sender(
459
460
  "01-Jun-2025",
461
  "07-Jun-2025"
 
13
  from zoneinfo import ZoneInfo
14
  from email.utils import parsedate_to_datetime
15
  from typing import List, Dict
16
+ from logger import logger
17
  load_dotenv()
18
 
19
  # Email credentials
 
26
  """Validate email setup and credentials"""
27
  print("=== Email Setup Validation ===")
28
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  issues = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  if not os.getenv("OPENAI_API_KEY"):
32
  issues.append("OPENAI_API_KEY not set (needed for query processing)")
 
50
 
51
  if EMAIL_ID:
52
  print(f"Email ID: {EMAIL_ID[:5]}...@{EMAIL_ID.split('@')[1] if '@' in EMAIL_ID else 'INVALID'}")
 
 
 
53
 
54
  if not EMAIL_ID or not APP_PASSWORD:
55
  error_msg = "Missing credentials in environment variables!"
 
169
  except ValueError:
170
  return False
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  def scrape_emails_by_text_search(keyword: str, start_date: str, end_date: str) -> List[Dict]:
174
  """
 
291
  if __name__ == "__main__":
292
  # Test scraping
293
  try:
294
+ emails = scrape_emails_by_text_search(
295
296
  "01-Jun-2025",
297
  "07-Jun-2025"
agentic_implementation/tools.py CHANGED
@@ -6,7 +6,7 @@ from schemas import (
6
  SendReplyParams,
7
  )
8
  from typing import Any, Dict
9
- from email_scraper import scrape_emails_from_sender, scrape_emails_by_text_search, _load_email_db, _save_email_db, _is_date_in_range
10
  from datetime import datetime, timedelta
11
  from typing import List
12
  from openai import OpenAI
 
6
  SendReplyParams,
7
  )
8
  from typing import Any, Dict
9
+ from email_scraper import scrape_emails_by_text_search, _load_email_db, _save_email_db, _is_date_in_range
10
  from datetime import datetime, timedelta
11
  from typing import List
12
  from openai import OpenAI