clean
Browse files
agentic_implementation/email_scraper.py
CHANGED
@@ -13,7 +13,7 @@ from dotenv import load_dotenv
|
|
13 |
from zoneinfo import ZoneInfo
|
14 |
from email.utils import parsedate_to_datetime
|
15 |
from typing import List, Dict
|
16 |
-
|
17 |
load_dotenv()
|
18 |
|
19 |
# Email credentials
|
@@ -26,33 +26,7 @@ def validate_email_setup():
|
|
26 |
"""Validate email setup and credentials"""
|
27 |
print("=== Email Setup Validation ===")
|
28 |
|
29 |
-
# Check .env file existence
|
30 |
-
# env_file_exists = os.path.exists('.env')
|
31 |
-
# print(f".env file exists: {'✅ Yes' if env_file_exists else '❌ No'}")
|
32 |
-
|
33 |
-
# if not env_file_exists:
|
34 |
-
# print("❌ No .env file found! Create one with:")
|
35 |
-
# print(" EMAIL_ID=your_email@gmail.com")
|
36 |
-
# print(" APP_PASSWORD=your_16_char_app_password")
|
37 |
-
# print(" OPENAI_API_KEY=your_openai_key")
|
38 |
-
# return False
|
39 |
-
|
40 |
-
# Check environment variables
|
41 |
issues = []
|
42 |
-
|
43 |
-
# if not EMAIL_ID:
|
44 |
-
# issues.append("EMAIL_ID not set or empty")
|
45 |
-
# elif '@' not in EMAIL_ID:
|
46 |
-
# issues.append("EMAIL_ID doesn't look like an email address")
|
47 |
-
# elif not EMAIL_ID.endswith('@gmail.com'):
|
48 |
-
# issues.append("EMAIL_ID should be a Gmail address (@gmail.com)")
|
49 |
-
|
50 |
-
# if not APP_PASSWORD:
|
51 |
-
# issues.append("APP_PASSWORD not set or empty")
|
52 |
-
# elif len(APP_PASSWORD) != 16:
|
53 |
-
# issues.append(f"APP_PASSWORD should be 16 characters, got {len(APP_PASSWORD)}")
|
54 |
-
# elif ' ' in APP_PASSWORD:
|
55 |
-
# issues.append("APP_PASSWORD should not contain spaces (remove spaces from app password)")
|
56 |
|
57 |
if not os.getenv("OPENAI_API_KEY"):
|
58 |
issues.append("OPENAI_API_KEY not set (needed for query processing)")
|
@@ -76,9 +50,6 @@ def _imap_connect():
|
|
76 |
|
77 |
if EMAIL_ID:
|
78 |
print(f"Email ID: {EMAIL_ID[:5]}...@{EMAIL_ID.split('@')[1] if '@' in EMAIL_ID else 'INVALID'}")
|
79 |
-
# if APP_PASSWORD:
|
80 |
-
# print(f"App Password length: {len(APP_PASSWORD)} characters")
|
81 |
-
# print(f"App Password format: {'✅ Looks correct (16 chars)' if len(APP_PASSWORD) == 16 else f'❌ Expected 16 chars, got {len(APP_PASSWORD)}'}")
|
82 |
|
83 |
if not EMAIL_ID or not APP_PASSWORD:
|
84 |
error_msg = "Missing credentials in environment variables!"
|
@@ -198,141 +169,6 @@ def _is_date_in_range(email_date: str, start_date: str, end_date: str) -> bool:
|
|
198 |
except ValueError:
|
199 |
return False
|
200 |
|
201 |
-
def scrape_emails_from_sender(sender_email: str, start_date: str, end_date: str) -> List[Dict]:
|
202 |
-
"""
|
203 |
-
Scrape emails from specific sender within date range
|
204 |
-
Uses intelligent caching to avoid re-scraping
|
205 |
-
"""
|
206 |
-
print(f"Scraping emails from {sender_email} between {start_date} and {end_date}")
|
207 |
-
|
208 |
-
# Load existing database
|
209 |
-
db = _load_email_db()
|
210 |
-
sender_email = sender_email.lower().strip()
|
211 |
-
|
212 |
-
# Check if we have cached emails for this sender
|
213 |
-
if sender_email in db:
|
214 |
-
cached_emails = db[sender_email].get("emails", [])
|
215 |
-
|
216 |
-
# Filter cached emails by date range
|
217 |
-
filtered_emails = [
|
218 |
-
email for email in cached_emails
|
219 |
-
if _is_date_in_range(email["date"], start_date, end_date)
|
220 |
-
]
|
221 |
-
|
222 |
-
# Check if we need to scrape more recent emails
|
223 |
-
last_scraped = db[sender_email].get("last_scraped", "01-Jan-2020")
|
224 |
-
today = datetime.today().strftime("%d-%b-%Y")
|
225 |
-
|
226 |
-
if last_scraped == today and filtered_emails:
|
227 |
-
print(f"Using cached emails (last scraped: {last_scraped})")
|
228 |
-
return filtered_emails
|
229 |
-
|
230 |
-
# Need to scrape emails
|
231 |
-
try:
|
232 |
-
mail = _imap_connect()
|
233 |
-
|
234 |
-
# Prepare IMAP search criteria
|
235 |
-
start_imap = _date_to_imap_format(start_date)
|
236 |
-
# Add one day to end_date for BEFORE criteria (IMAP BEFORE is exclusive)
|
237 |
-
end_dt = datetime.strptime(end_date, "%d-%b-%Y") + timedelta(days=1)
|
238 |
-
end_imap = end_dt.strftime("%d-%b-%Y")
|
239 |
-
|
240 |
-
search_criteria = f'(FROM "{sender_email}") SINCE "{start_imap}" BEFORE "{end_imap}"'
|
241 |
-
print(f"IMAP search: {search_criteria}")
|
242 |
-
|
243 |
-
# Search for emails
|
244 |
-
status, data = mail.search(None, search_criteria)
|
245 |
-
if status != 'OK':
|
246 |
-
raise Exception(f"IMAP search failed: {status}")
|
247 |
-
|
248 |
-
email_ids = data[0].split()
|
249 |
-
print(f"Found {len(email_ids)} emails")
|
250 |
-
|
251 |
-
scraped_emails = []
|
252 |
-
|
253 |
-
# Process each email
|
254 |
-
for i, email_id in enumerate(email_ids):
|
255 |
-
try:
|
256 |
-
print(f"Processing email {i+1}/{len(email_ids)}")
|
257 |
-
|
258 |
-
# Fetch email
|
259 |
-
status, msg_data = mail.fetch(email_id, "(RFC822)")
|
260 |
-
if status != 'OK':
|
261 |
-
continue
|
262 |
-
|
263 |
-
# Parse email
|
264 |
-
msg = message_from_bytes(msg_data[0][1])
|
265 |
-
|
266 |
-
# Extract information
|
267 |
-
subject = msg.get("Subject", "No Subject")
|
268 |
-
content = _email_to_clean_text(msg)
|
269 |
-
|
270 |
-
# Parse date
|
271 |
-
date_header = msg.get("Date", "")
|
272 |
-
if date_header:
|
273 |
-
try:
|
274 |
-
dt_obj = parsedate_to_datetime(date_header)
|
275 |
-
# Convert to IST
|
276 |
-
ist_dt = dt_obj.astimezone(ZoneInfo("Asia/Kolkata"))
|
277 |
-
email_date = ist_dt.strftime("%d-%b-%Y")
|
278 |
-
email_time = ist_dt.strftime("%H:%M:%S")
|
279 |
-
except:
|
280 |
-
email_date = datetime.today().strftime("%d-%b-%Y")
|
281 |
-
email_time = "00:00:00"
|
282 |
-
else:
|
283 |
-
email_date = datetime.today().strftime("%d-%b-%Y")
|
284 |
-
email_time = "00:00:00"
|
285 |
-
|
286 |
-
# Get message ID for deduplication
|
287 |
-
message_id = msg.get("Message-ID", f"missing-{email_id.decode()}")
|
288 |
-
|
289 |
-
scraped_emails.append({
|
290 |
-
"date": email_date,
|
291 |
-
"time": email_time,
|
292 |
-
"subject": subject,
|
293 |
-
"content": content[:2000], # Limit content length
|
294 |
-
"message_id": message_id
|
295 |
-
})
|
296 |
-
|
297 |
-
except Exception as e:
|
298 |
-
print(f"Error processing email {email_id}: {e}")
|
299 |
-
continue
|
300 |
-
|
301 |
-
mail.logout()
|
302 |
-
|
303 |
-
# Update database
|
304 |
-
if sender_email not in db:
|
305 |
-
db[sender_email] = {"emails": [], "last_scraped": ""}
|
306 |
-
|
307 |
-
# Merge with existing emails (avoid duplicates)
|
308 |
-
existing_emails = db[sender_email].get("emails", [])
|
309 |
-
existing_ids = {email.get("message_id") for email in existing_emails}
|
310 |
-
|
311 |
-
new_emails = [
|
312 |
-
email for email in scraped_emails
|
313 |
-
if email["message_id"] not in existing_ids
|
314 |
-
]
|
315 |
-
|
316 |
-
# Update database
|
317 |
-
db[sender_email]["emails"] = existing_emails + new_emails
|
318 |
-
db[sender_email]["last_scraped"] = datetime.today().strftime("%d-%b-%Y")
|
319 |
-
|
320 |
-
# Save database
|
321 |
-
_save_email_db(db)
|
322 |
-
|
323 |
-
# Return filtered results
|
324 |
-
all_emails = db[sender_email]["emails"]
|
325 |
-
filtered_emails = [
|
326 |
-
email for email in all_emails
|
327 |
-
if _is_date_in_range(email["date"], start_date, end_date)
|
328 |
-
]
|
329 |
-
|
330 |
-
print(f"Scraped {len(new_emails)} new emails, returning {len(filtered_emails)} in date range")
|
331 |
-
return filtered_emails
|
332 |
-
|
333 |
-
except Exception as e:
|
334 |
-
print(f"Email scraping failed: {e}")
|
335 |
-
raise
|
336 |
|
337 |
def scrape_emails_by_text_search(keyword: str, start_date: str, end_date: str) -> List[Dict]:
|
338 |
"""
|
@@ -455,7 +291,7 @@ def scrape_emails_by_text_search(keyword: str, start_date: str, end_date: str) -
|
|
455 |
if __name__ == "__main__":
|
456 |
# Test scraping
|
457 |
try:
|
458 |
-
emails =
|
459 | |
460 |
"01-Jun-2025",
|
461 |
"07-Jun-2025"
|
|
|
13 |
from zoneinfo import ZoneInfo
|
14 |
from email.utils import parsedate_to_datetime
|
15 |
from typing import List, Dict
|
16 |
+
from logger import logger
|
17 |
load_dotenv()
|
18 |
|
19 |
# Email credentials
|
|
|
26 |
"""Validate email setup and credentials"""
|
27 |
print("=== Email Setup Validation ===")
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
issues = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
if not os.getenv("OPENAI_API_KEY"):
|
32 |
issues.append("OPENAI_API_KEY not set (needed for query processing)")
|
|
|
50 |
|
51 |
if EMAIL_ID:
|
52 |
print(f"Email ID: {EMAIL_ID[:5]}...@{EMAIL_ID.split('@')[1] if '@' in EMAIL_ID else 'INVALID'}")
|
|
|
|
|
|
|
53 |
|
54 |
if not EMAIL_ID or not APP_PASSWORD:
|
55 |
error_msg = "Missing credentials in environment variables!"
|
|
|
169 |
except ValueError:
|
170 |
return False
|
171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
def scrape_emails_by_text_search(keyword: str, start_date: str, end_date: str) -> List[Dict]:
|
174 |
"""
|
|
|
291 |
if __name__ == "__main__":
|
292 |
# Test scraping
|
293 |
try:
|
294 |
+
emails = scrape_emails_by_text_search(
|
295 | |
296 |
"01-Jun-2025",
|
297 |
"07-Jun-2025"
|
agentic_implementation/tools.py
CHANGED
@@ -6,7 +6,7 @@ from schemas import (
|
|
6 |
SendReplyParams,
|
7 |
)
|
8 |
from typing import Any, Dict
|
9 |
-
from email_scraper import
|
10 |
from datetime import datetime, timedelta
|
11 |
from typing import List
|
12 |
from openai import OpenAI
|
|
|
6 |
SendReplyParams,
|
7 |
)
|
8 |
from typing import Any, Dict
|
9 |
+
from email_scraper import scrape_emails_by_text_search, _load_email_db, _save_email_db, _is_date_in_range
|
10 |
from datetime import datetime, timedelta
|
11 |
from typing import List
|
12 |
from openai import OpenAI
|