dygoo committed on
Commit 725cd97 · verified · 1 Parent(s): 3702004

Update app.py

Files changed (1)
  1. app.py +550 -204
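The core of this update, visible in the diff below, is a cooperative-cancellation pattern: a shared `threading.Event` is checked inside the worker functions, and article fetches run through a small `ThreadPoolExecutor`. A minimal, self-contained sketch of that pattern follows; the `fetch_one` helper and example URLs are placeholders for illustration, not part of the commit:

```python
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Shared flag any thread can set to request a cooperative stop,
# mirroring the cancel_operation Event added in this commit.
cancel_operation = threading.Event()

def fetch_one(url: str) -> str:
    """Placeholder fetch: check the flag before doing slow work."""
    if cancel_operation.is_set():
        return f"[CANCELLED] {url}"
    time.sleep(0.1)  # stand-in for a network request
    return f"[OK] {url}"

def fetch_all(urls):
    results = []
    # Cap concurrency, as the committed code does with min(3, len(all_results)).
    with ThreadPoolExecutor(max_workers=min(3, len(urls))) as executor:
        futures = {executor.submit(fetch_one, u): u for u in urls}
        for future in as_completed(futures, timeout=60):
            if cancel_operation.is_set():
                for f in futures:
                    f.cancel()  # best effort; already-running tasks finish on their own
                break
            results.append(future.result())
    return results

if __name__ == "__main__":
    print(fetch_all(["https://example.com/a", "https://example.com/b"]))
```

Capping `max_workers` keeps the app from hammering target sites while still overlapping the slow network requests.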
app.py CHANGED
@@ -9,18 +9,31 @@ import os
9
  from datetime import datetime, timedelta
10
  from dateutil import parser
11
  import json
 
 
 
12
 
13
  # Initialize Anthropic client
14
  client = anthropic.Anthropic(
15
- api_key=os.getenv("ANTHROPIC_API_KEY") # Set as secret in HF Space settings
16
  )
17
 
18
- # === Model functions ===
19
 
20
  def extract_publication_date(soup, url):
21
- """Extract publication date from article HTML"""
22
  try:
23
- # Common date selectors
24
  date_selectors = [
25
  'time[datetime]',
26
  '.date', '.publish-date', '.published', '.post-date',
@@ -40,14 +53,13 @@ def extract_publication_date(soup, url):
40
  except:
41
  continue
42
 
43
- # Look for date patterns in text
44
  date_patterns = [
45
- r'(\w+ \d{1,2}, \d{4})', # January 15, 2023
46
- r'(\d{1,2}/\d{1,2}/\d{4})', # 01/15/2023
47
- r'(\d{4}-\d{2}-\d{2})' # 2023-01-15
48
  ]
49
 
50
- text = soup.get_text()[:2000] # First 2000 chars
51
  for pattern in date_patterns:
52
  matches = re.findall(pattern, text)
53
  if matches:
@@ -61,7 +73,11 @@ def extract_publication_date(soup, url):
61
 
62
  return None
63
 
64
- def get_full_article(url):
 
 
 
 
65
  try:
66
  headers = {
67
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
@@ -71,11 +87,10 @@ def get_full_article(url):
71
  'Upgrade-Insecure-Requests': '1'
72
  }
73
 
74
- response = requests.get(url, headers=headers, timeout=20, verify=True)
75
  response.raise_for_status()
76
  soup = BeautifulSoup(response.content, 'html.parser')
77
 
78
- # Extract publication date
79
  pub_date = extract_publication_date(soup, url)
80
 
81
  for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
@@ -103,24 +118,24 @@ def get_full_article(url):
103
 
104
  except requests.exceptions.Timeout:
105
  return "[WARNING] Article fetch timeout - using snippet instead", None
106
- except requests.exceptions.RequestException:
107
- return "[ERROR] Could not fetch article: Network error", None
108
  except Exception as e:
109
  return f"[ERROR] Could not fetch article: {str(e)}", None
110
 
111
- def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
112
- """Search for articles in specific timeframe"""
 
 
 
113
 
114
- # Define search queries based on timeframe
115
  if timeframe == "recent":
116
- # Recent articles (news, updates, current events)
117
  search_queries = [
118
  f'"{name}" founder news 2024 2025',
119
  f'"{name}" CEO founder recent',
120
  f'"{name}" founder update latest'
121
  ]
122
- else: # historical
123
- # Historical articles (founding, establishment, origin stories)
124
  search_queries = [
125
  f'"{name}" founded established history',
126
  f'"{name}" founder origin story',
@@ -130,20 +145,31 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
130
 
131
  all_results = []
132
  max_retries = 2
133
- base_delay = 3
134
 
 
 
135
  for query_idx, search_query in enumerate(search_queries):
136
- if len(all_results) >= max_articles:
137
  break
138
 
 
 
 
 
139
  for attempt in range(max_retries):
 
 
 
140
  try:
141
  print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
142
- time.sleep(base_delay * (attempt + 1))
 
 
143
 
144
  configs = [
145
- {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
146
- {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
147
  ]
148
 
149
  config = configs[min(attempt, len(configs)-1)]
@@ -151,17 +177,15 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
151
  with DDGS(timeout=config['timeout']) as ddgs:
152
  search_params = {
153
  'keywords': search_query,
154
- 'max_results': max_articles - len(all_results) + 2, # Get a few extra to filter
155
  'safesearch': config['safesearch']
156
  }
157
  if config['region']:
158
  search_params['region'] = config['region']
159
 
160
  results = list(ddgs.text(**search_params))
161
- print(f"Found {len(results)} results for query {query_idx + 1}")
162
 
163
  if results:
164
- # Add unique results (avoid duplicates)
165
  existing_urls = {r.get('url', '') for r in all_results}
166
  for result in results:
167
  if len(all_results) >= max_articles:
@@ -175,114 +199,193 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
175
  except Exception as e:
176
  print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
177
  if attempt < max_retries - 1:
178
- time.sleep(base_delay * (attempt + 2))
179
 
180
  return all_results[:max_articles]
181
 
182
  def categorize_article_by_date(pub_date):
183
- """Categorize article as recent or historical based on publication date"""
184
  if not pub_date:
185
  return "unknown"
186
 
187
  one_year_ago = datetime.now() - timedelta(days=365)
188
 
189
- if pub_date >= one_year_ago:
190
- return "recent"
 
 
 
 
 
191
  else:
192
- return "historical"
193
 
194
- def search_articles(name: str, max_articles: int = 4) -> str:
195
- """Enhanced search that ensures both recent and historical articles"""
196
 
197
- # Split articles between recent and historical
198
  recent_count = max_articles // 2
199
  historical_count = max_articles - recent_count
200
 
201
- print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
 
202
 
203
  # Search for recent articles
204
- recent_results = search_articles_by_timeframe(name, "recent", recent_count)
205
- time.sleep(2) # Brief pause between timeframe searches
206
 
207
- # Search for historical articles
208
- historical_results = search_articles_by_timeframe(name, "historical", historical_count)
209
 
210
- # Combine and process all results
211
- all_results = []
212
 
213
- # Process recent articles
214
  for result in recent_results:
215
  result['expected_timeframe'] = 'recent'
216
  all_results.append(result)
217
 
218
- # Process historical articles
219
  for result in historical_results:
220
  result['expected_timeframe'] = 'historical'
221
  all_results.append(result)
222
 
223
  if not all_results:
 
 
224
  return f"[INFO] No articles found for {name}"
225
 
226
- # Fetch and categorize articles
 
 
 
227
  articles = []
228
  recent_found = 0
229
  historical_found = 0
230
 
231
- for i, result in enumerate(all_results, 1):
232
- url = result.get('href', 'No URL')
233
- title = result.get('title', 'No Title')
234
- snippet = result.get('body', 'No snippet available')
235
- expected_timeframe = result.get('expected_timeframe', 'unknown')
236
-
237
- if i > 1:
238
- time.sleep(2)
239
-
240
- full_text, pub_date = get_full_article(url)
241
- actual_timeframe = categorize_article_by_date(pub_date)
242
-
243
- # Count articles by actual timeframe
244
- if actual_timeframe == "recent":
245
- recent_found += 1
246
- elif actual_timeframe == "historical":
247
- historical_found += 1
248
 
249
- if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
250
- print(f"Using snippet fallback for article {i}")
251
- content = f"[SNIPPET ONLY]\n{snippet}"
252
- else:
253
- content = full_text
254
-
255
- # Create timeframe indicator
256
- timeframe_indicator = ""
257
- if pub_date:
258
- date_str = pub_date.strftime("%B %d, %Y")
259
- timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
260
- else:
261
- timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
262
-
263
- article = f"### {i}. {title}\n"
264
- article += f"[Source]({url})\n"
265
- article += f"{timeframe_indicator}\n\n"
266
- article += f"{content}\n"
267
- articles.append(article)
268
-
269
- # Add summary of coverage
270
  summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
271
 
272
- return summary + "\n---\n".join(articles)
 
 
 
 
 
273
 
274
- def extract_entities(search_results: str, company_name: str) -> str:
275
- """Extract entities using Claude 4"""
 
 
 
 
276
  MAX_CHARS = 15000
277
  if len(search_results) > MAX_CHARS:
278
  trunc = search_results[:MAX_CHARS]
279
  last_period = trunc.rfind('. ')
280
  search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
281
 
 
 
 
282
  prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
283
  Only include founders who are explicitly mentioned as founders of {company_name}.
284
  Ignore founders of other companies that may be mentioned in the text.
285
-
286
  Return a JSON object with the following structure:
287
  {{
288
  "founders": [
@@ -292,15 +395,15 @@ Return a JSON object with the following structure:
292
  }}
293
  ]
294
  }}
295
-
296
  Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
297
-
298
  You have to examine every article available in the search results below.
299
-
300
  Text:
301
  {search_results}"""
302
 
303
  try:
 
 
 
304
  message = client.messages.create(
305
  model="claude-sonnet-4-20250514",
306
  max_tokens=1500,
@@ -312,39 +415,64 @@ Text:
312
  }
313
  ]
314
  )
315
- return message.content[0].text
316
 
317
  except Exception as e:
 
 
318
  return f"[ERROR] Extraction failed: {str(e)}"
319
 
320
- # === Gradio interface functions ===
321
 
322
- def search_only(name: str, article_count: int):
 
323
  if not name.strip():
324
- return "No name provided", ""
325
 
326
  try:
327
  start = time.time()
328
- articles_output = search_articles(name.strip(), max_articles=article_count)
329
  elapsed = time.time() - start
 
330
 
331
  results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
332
  results += articles_output
333
 
334
  return results, articles_output
 
335
  except Exception as e:
336
- return f"[ERROR] Search failed: {str(e)}", ""
 
337
 
338
- def extract_only(stored_results: str, company_name: str):
 
339
  if not stored_results.strip():
340
- return "No search results available. Please search first."
341
 
342
  if not company_name.strip():
343
- return "No company name provided. Please search first."
 
 
 
344
 
345
  try:
346
  start = time.time()
347
- entities = extract_entities(stored_results, company_name.strip())
348
  elapsed = time.time() - start
349
 
350
  # Try to format JSON for better readability
@@ -354,50 +482,103 @@ def extract_only(stored_results: str, company_name: str):
354
  return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
355
  except:
356
  return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
 
357
  except Exception as e:
358
- return f"[ERROR] Extraction failed: {str(e)}"
 
359
 
360
- # === Gradio UI ===
 
 
 
361
 
362
- with gr.Blocks(title="Enhanced Founder Finder") as demo:
 
 
363
  gr.Markdown("# 🔎 Enhanced Founder Finder")
364
- gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
365
- gr.Markdown("*🚀 **New**: Automatically searches for both recent news AND historical founding information*")
366
- gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")
367
 
368
  search_state = gr.State("")
369
 
370
  with gr.Row():
371
- name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
372
- article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
373
- with gr.Column():
374
- search_btn = gr.Button("🔍 Enhanced Temporal Search", variant="primary")
375
- extract_btn = gr.Button("📊 Extract Founder Intelligence", variant="secondary")
376
 
377
- output1 = gr.Markdown(label="Search Results with Temporal Analysis")
378
- output2 = gr.Textbox(
379
- label="Founder Intelligence Report",
380
- lines=15,
381
- max_lines=25,
382
- show_copy_button=True
383
- )
384
 
385
- search_btn.click(
386
- fn=search_only,
387
  inputs=[name_input, article_count_slider],
388
- outputs=[output1, search_state]
 
 
 
 
 
 
389
  )
390
 
391
  extract_btn.click(
392
- fn=extract_only,
393
  inputs=[search_state, name_input],
394
- outputs=[output2]
395
  )
396
 
397
  if __name__ == "__main__":
398
- demo.launch()
399
 
400
- ''' import gradio as gr
 
401
  import requests
402
  import time
403
  import re
@@ -405,6 +586,9 @@ from duckduckgo_search import DDGS
405
  from bs4 import BeautifulSoup
406
  import anthropic
407
  import os
 
 
 
408
 
409
  # Initialize Anthropic client
410
  client = anthropic.Anthropic(
@@ -413,6 +597,50 @@ client = anthropic.Anthropic(
413
 
414
  # === Model functions ===
415
416
  def get_full_article(url):
417
  try:
418
  headers = {
@@ -427,6 +655,9 @@ def get_full_article(url):
427
  response.raise_for_status()
428
  soup = BeautifulSoup(response.content, 'html.parser')
429
 
 
 
 
430
  for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
431
  element.decompose()
432
 
@@ -444,89 +675,185 @@ def get_full_article(url):
444
  text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
445
  full_text = '\n\n'.join(text_parts)
446
  if len(full_text) > 300:
447
- return full_text[:10000]
448
 
449
  body_text = soup.get_text(separator='\n\n', strip=True)
450
  body_text = re.sub(r'\n{3,}', '\n\n', body_text)
451
- return body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"
452
 
453
  except requests.exceptions.Timeout:
454
- return "[WARNING] Article fetch timeout - using snippet instead"
455
  except requests.exceptions.RequestException:
456
- return "[ERROR] Could not fetch article: Network error"
457
  except Exception as e:
458
- return f"[ERROR] Could not fetch article: {str(e)}"
459
 
460
- def search_articles(name: str, max_articles: int = 2) -> str:
461
- keywords = ['founder']
462
- search_query = f'"{name}" ({" AND ".join(keywords)}) site:news'
463
 
464
- max_retries = 3
 
465
  base_delay = 3
466
 
467
- for attempt in range(max_retries):
468
- try:
469
- print(f"Search attempt {attempt + 1}: {search_query}")
470
- time.sleep(base_delay * (attempt + 1))
 
 
 
 
471
 
472
- configs = [
473
- {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
474
- {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'},
475
- {'timeout': 30, 'region': None, 'safesearch': 'moderate'}
476
- ]
477
 
478
- config = configs[min(attempt, len(configs)-1)]
479
 
480
- with DDGS(timeout=config['timeout']) as ddgs:
481
- search_params = {
482
- 'keywords': search_query,
483
- 'max_results': max_articles,
484
- 'safesearch': config['safesearch']
485
- }
486
- if config['region']:
487
- search_params['region'] = config['region']
488
 
489
- results = list(ddgs.text(**search_params))
490
- print(f"Found {len(results)} results on attempt {attempt + 1}")
491
 
492
- if not results:
493
- continue
494
 
495
- articles = []
496
- for i, result in enumerate(results, 1):
497
- url = result.get('href', 'No URL')
498
- title = result.get('title', 'No Title')
499
- snippet = result.get('body', 'No snippet available')
500
 
501
- if i > 1:
502
- time.sleep(2)
503
 
504
- full_text = get_full_article(url)
505
- if any(error in full_text for error in ["[ERROR]", "timeout", "Network error"]):
506
- print(f"Using snippet fallback for article {i}")
507
- content = f"[SNIPPET ONLY]\n{snippet}"
508
- else:
509
- content = full_text
510
 
511
- article = f"### {i}. {title}\n"
512
- article += f"[Source]({url})\n\n"
513
- article += f"{content}\n"
514
- articles.append(article)
515
 
516
- return "\n---\n".join(articles)
517
 
518
- except Exception as e:
519
- print(f"Attempt {attempt + 1} failed: {str(e)}")
520
- if attempt < max_retries - 1:
521
- time.sleep(base_delay * (attempt + 2))
522
- else:
523
- return f"[ERROR] Search failed after {max_retries} attempts. Last error: {str(e)}"
524
 
525
- return f"[INFO] No articles found for {name}"
 
 
 
526
 
527
  def extract_entities(search_results: str, company_name: str) -> str:
528
  """Extract entities using Claude 4"""
529
- MAX_CHARS = 8000
530
  if len(search_results) > MAX_CHARS:
531
  trunc = search_results[:MAX_CHARS]
532
  last_period = trunc.rfind('. ')
@@ -535,18 +862,29 @@ def extract_entities(search_results: str, company_name: str) -> str:
535
  prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
536
  Only include founders who are explicitly mentioned as founders of {company_name}.
537
  Ignore founders of other companies that may be mentioned in the text.
538
- Return a JSON object with the following two keys:
539
- - "people": a list of names of people mentioned as founders of {company_name}
540
- - "organizations": a list of organization names mentioned as founders of {company_name}
541
- Respond only with valid JSON. Do not include any explanations, comments, or additional formatting. Double check that you included all founders. Double check that the founders you included are indeed founders of {company_name}.
542
  Text:
543
  {search_results}"""
544
 
545
  try:
546
  message = client.messages.create(
547
  model="claude-sonnet-4-20250514",
548
- max_tokens=1000,
549
- temperature=0.15,
550
  messages=[
551
  {
552
  "role": "user",
@@ -570,7 +908,7 @@ def search_only(name: str, article_count: int):
570
  articles_output = search_articles(name.strip(), max_articles=article_count)
571
  elapsed = time.time() - start
572
 
573
- results = f"✅ Search completed for **{name}** in {elapsed:.1f}s\n\n"
574
  results += articles_output
575
 
576
  return results, articles_output
@@ -588,31 +926,39 @@ def extract_only(stored_results: str, company_name: str):
588
  start = time.time()
589
  entities = extract_entities(stored_results, company_name.strip())
590
  elapsed = time.time() - start
591
- return f"✅ Extraction completed in {elapsed:.1f}s\n\n{entities}"
592
  except Exception as e:
593
  return f"[ERROR] Extraction failed: {str(e)}"
594
 
595
  # === Gradio UI ===
596
 
597
- with gr.Blocks(title="Founder Finder") as demo:
598
- gr.Markdown("# 🔎 Founder Finder")
599
- gr.Markdown("Enter a business or project name to search for its founder.")
600
- gr.Markdown("*Note: Full article extraction may take 30–60 seconds.")
 
601
 
602
  search_state = gr.State("")
603
 
604
  with gr.Row():
605
  name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
606
- article_count_slider = gr.Slider(1, 10, value=2, step=1, label="Number of Articles")
607
  with gr.Column():
608
- search_btn = gr.Button("🔍 Search Articles", variant="primary")
609
- extract_btn = gr.Button("📋 Extract Entities", variant="secondary")
610
 
611
- output1 = gr.Markdown(label="Search Results")
612
  output2 = gr.Textbox(
613
- label="Extracted Entities and Relationships",
614
- lines=10,
615
- max_lines=20,
616
  show_copy_button=True
617
  )
618
 
@@ -630,5 +976,5 @@ with gr.Blocks(title="Founder Finder") as demo:
630
 
631
  if __name__ == "__main__":
632
  demo.launch()
633
- '''
634
-
 
9
  from datetime import datetime, timedelta
10
  from dateutil import parser
11
  import json
12
+ import threading
13
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
14
+ import signal
15
 
16
  # Initialize Anthropic client
17
  client = anthropic.Anthropic(
18
+ api_key=os.getenv("ANTHROPIC_API_KEY")
19
  )
20
 
21
+ # Global variable to track cancellation
22
+ cancel_operation = threading.Event()
23
+
24
+ def reset_cancellation():
25
+ """Reset the cancellation flag"""
26
+ cancel_operation.clear()
27
+
28
+ def check_cancellation():
29
+ """Check if operation should be cancelled"""
30
+ return cancel_operation.is_set()
31
+
32
+ # === Enhanced Model functions with progress tracking ===
33
 
34
  def extract_publication_date(soup, url):
35
+ """Extract publication date from article HTML - same as before"""
36
  try:
 
37
  date_selectors = [
38
  'time[datetime]',
39
  '.date', '.publish-date', '.published', '.post-date',
 
53
  except:
54
  continue
55
 
 
56
  date_patterns = [
57
+ r'(\w+ \d{1,2}, \d{4})',
58
+ r'(\d{1,2}/\d{1,2}/\d{4})',
59
+ r'(\d{4}-\d{2}-\d{2})'
60
  ]
61
 
62
+ text = soup.get_text()[:2000]
63
  for pattern in date_patterns:
64
  matches = re.findall(pattern, text)
65
  if matches:
 
73
 
74
  return None
75
 
76
+ def get_full_article_with_timeout(url, timeout=15):
77
+ """Enhanced article fetching with timeout and better error handling"""
78
+ if check_cancellation():
79
+ return "[CANCELLED] Operation was cancelled", None
80
+
81
  try:
82
  headers = {
83
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
 
87
  'Upgrade-Insecure-Requests': '1'
88
  }
89
 
90
+ response = requests.get(url, headers=headers, timeout=timeout, verify=True)
91
  response.raise_for_status()
92
  soup = BeautifulSoup(response.content, 'html.parser')
93
 
 
94
  pub_date = extract_publication_date(soup, url)
95
 
96
  for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
 
118
 
119
  except requests.exceptions.Timeout:
120
  return "[WARNING] Article fetch timeout - using snippet instead", None
121
+ except requests.exceptions.RequestException as e:
122
+ return f"[ERROR] Network error: {str(e)}", None
123
  except Exception as e:
124
  return f"[ERROR] Could not fetch article: {str(e)}", None
125
 
126
+ def search_articles_by_timeframe_enhanced(name: str, timeframe: str, max_articles: int = 3, progress=None) -> list:
127
+ """Enhanced search with progress tracking and better error handling"""
128
+
129
+ if check_cancellation():
130
+ return []
131
 
 
132
  if timeframe == "recent":
 
133
  search_queries = [
134
  f'"{name}" founder news 2024 2025',
135
  f'"{name}" CEO founder recent',
136
  f'"{name}" founder update latest'
137
  ]
138
+ else:
 
139
  search_queries = [
140
  f'"{name}" founded established history',
141
  f'"{name}" founder origin story',
 
145
 
146
  all_results = []
147
  max_retries = 2
148
+ base_delay = 2 # Reduced delay
149
 
150
+ total_queries = len(search_queries)
151
+
152
  for query_idx, search_query in enumerate(search_queries):
153
+ if len(all_results) >= max_articles or check_cancellation():
154
  break
155
 
156
+ if progress:
157
+ query_progress = (query_idx / total_queries) * 0.3 # 30% of progress for queries
158
+ progress(query_progress, desc=f"Searching {timeframe} articles ({query_idx + 1}/{total_queries})")
159
+
160
  for attempt in range(max_retries):
161
+ if check_cancellation():
162
+ return all_results
163
+
164
  try:
165
  print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
166
+
167
+ if attempt > 0:
168
+ time.sleep(base_delay * attempt)
169
 
170
  configs = [
171
+ {'timeout': 15, 'region': 'us-en', 'safesearch': 'moderate'},
172
+ {'timeout': 20, 'region': 'wt-wt', 'safesearch': 'off'}
173
  ]
174
 
175
  config = configs[min(attempt, len(configs)-1)]
 
177
  with DDGS(timeout=config['timeout']) as ddgs:
178
  search_params = {
179
  'keywords': search_query,
180
+ 'max_results': max_articles - len(all_results) + 2,
181
  'safesearch': config['safesearch']
182
  }
183
  if config['region']:
184
  search_params['region'] = config['region']
185
 
186
  results = list(ddgs.text(**search_params))
 
187
 
188
  if results:
 
189
  existing_urls = {r.get('url', '') for r in all_results}
190
  for result in results:
191
  if len(all_results) >= max_articles:
 
199
  except Exception as e:
200
  print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
201
  if attempt < max_retries - 1:
202
+ time.sleep(base_delay * (attempt + 1))
203
 
204
  return all_results[:max_articles]
205
 
206
  def categorize_article_by_date(pub_date):
207
+ """Same as before"""
208
  if not pub_date:
209
  return "unknown"
210
 
211
  one_year_ago = datetime.now() - timedelta(days=365)
212
+ return "recent" if pub_date >= one_year_ago else "historical"
213
+
214
+ def fetch_article_parallel(result, article_num, total_articles, progress=None):
215
+ """Fetch single article with progress update"""
216
+ if check_cancellation():
217
+ return None
218
+
219
+ url = result.get('href', 'No URL')
220
+ title = result.get('title', 'No Title')
221
+ snippet = result.get('body', 'No snippet available')
222
+ expected_timeframe = result.get('expected_timeframe', 'unknown')
223
+
224
+ if progress:
225
+ fetch_progress = 0.4 + (article_num / total_articles) * 0.5 # 40-90% of total progress
226
+ progress(fetch_progress, desc=f"Fetching article {article_num + 1}/{total_articles}: {title[:50]}...")
227
+
228
+ full_text, pub_date = get_full_article_with_timeout(url, timeout=12)
229
 
230
+ if check_cancellation():
231
+ return None
232
+
233
+ actual_timeframe = categorize_article_by_date(pub_date)
234
+
235
+ if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error", "[CANCELLED]"]):
236
+ content = f"[SNIPPET ONLY]\n{snippet}"
237
  else:
238
+ content = full_text
239
 
240
+ timeframe_indicator = ""
241
+ if pub_date:
242
+ date_str = pub_date.strftime("%B %d, %Y")
243
+ timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
244
+ else:
245
+ timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
246
+
247
+ article = f"### {article_num + 1}. {title}\n"
248
+ article += f"[Source]({url})\n"
249
+ article += f"{timeframe_indicator}\n\n"
250
+ article += f"{content}\n"
251
+
252
+ return {
253
+ 'article': article,
254
+ 'timeframe': actual_timeframe,
255
+ 'url': url,
256
+ 'title': title
257
+ }
258
+
259
+ def search_articles_enhanced(name: str, max_articles: int = 4, progress=None) -> str:
260
+ """Enhanced search with progress tracking and parallel processing"""
261
+
262
+ reset_cancellation() # Reset cancellation flag
263
+
264
+ if progress:
265
+ progress(0, desc="Initializing enhanced search...")
266
 
 
267
  recent_count = max_articles // 2
268
  historical_count = max_articles - recent_count
269
 
270
+ if progress:
271
+ progress(0.05, desc=f"Planning search: {recent_count} recent + {historical_count} historical articles")
272
 
273
  # Search for recent articles
274
+ if progress:
275
+ progress(0.1, desc="Searching for recent articles...")
276
 
277
+ recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress)
 
278
 
279
+ if check_cancellation():
280
+ return "[CANCELLED] Search was cancelled by user"
281
 
282
+ if progress:
283
+ progress(0.3, desc="Searching for historical articles...")
284
+
285
+ # Brief pause between searches
286
+ time.sleep(1)
287
+
288
+ historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress)
289
+
290
+ if check_cancellation():
291
+ return "[CANCELLED] Search was cancelled by user"
292
+
293
+ # Combine results
294
+ all_results = []
295
  for result in recent_results:
296
  result['expected_timeframe'] = 'recent'
297
  all_results.append(result)
298
 
 
299
  for result in historical_results:
300
  result['expected_timeframe'] = 'historical'
301
  all_results.append(result)
302
 
303
  if not all_results:
304
+ if progress:
305
+ progress(1.0, desc="Search completed - no results found")
306
  return f"[INFO] No articles found for {name}"
307
 
308
+ if progress:
309
+ progress(0.4, desc=f"Found {len(all_results)} articles, now fetching content...")
310
+
311
+ # Fetch articles with parallel processing (but limited concurrency)
312
  articles = []
313
  recent_found = 0
314
  historical_found = 0
315
 
316
+ # Use ThreadPoolExecutor for controlled parallel fetching
317
+ max_workers = min(3, len(all_results)) # Limit concurrent requests
318
+
319
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
320
+ # Submit all tasks
321
+ future_to_result = {
322
+ executor.submit(fetch_article_parallel, result, i, len(all_results), progress): (result, i)
323
+ for i, result in enumerate(all_results)
324
+ }
325
 
326
+ # Collect results as they complete
327
+ for future in as_completed(future_to_result, timeout=60): # 60 second timeout
328
+ if check_cancellation():
329
+ # Cancel remaining futures
330
+ for f in future_to_result:
331
+ f.cancel()
332
+ return "[CANCELLED] Search was cancelled by user"
333
+
334
+ try:
335
+ result_data = future.result(timeout=15)
336
+ if result_data:
337
+ articles.append(result_data)
338
+
339
+ # Count by timeframe
340
+ if result_data['timeframe'] == "recent":
341
+ recent_found += 1
342
+ elif result_data['timeframe'] == "historical":
343
+ historical_found += 1
344
+
345
+ except TimeoutError:
346
+ print("Article fetch timed out")
347
+ continue
348
+ except Exception as e:
349
+ print(f"Error fetching article: {e}")
350
+ continue
351
+
352
+ if check_cancellation():
353
+ return "[CANCELLED] Search was cancelled by user"
354
+
355
+ if progress:
356
+ progress(0.95, desc="Formatting results...")
357
+
358
+ # Sort articles to maintain order
359
+ articles.sort(key=lambda x: all_results.index(next(r for r in all_results if r.get('href', '') == x['url'])))
360
+
361
+ # Create final output
362
  summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
363
 
364
+ article_texts = [article_data['article'] for article_data in articles]
365
+
366
+ if progress:
367
+ progress(1.0, desc=f"Search completed! Found {len(articles)} articles")
368
+
369
+ return summary + "\n---\n".join(article_texts)
370
 
371
+ def extract_entities_enhanced(search_results: str, company_name: str, progress=None) -> str:
372
+ """Enhanced entity extraction with progress tracking"""
373
+
374
+ if progress:
375
+ progress(0, desc="Preparing text for analysis...")
376
+
377
  MAX_CHARS = 15000
378
  if len(search_results) > MAX_CHARS:
379
  trunc = search_results[:MAX_CHARS]
380
  last_period = trunc.rfind('. ')
381
  search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
382
 
383
+ if progress:
384
+ progress(0.2, desc="Analyzing articles with AI...")
385
+
386
  prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
387
  Only include founders who are explicitly mentioned as founders of {company_name}.
388
  Ignore founders of other companies that may be mentioned in the text.
 
389
  Return a JSON object with the following structure:
390
  {{
391
  "founders": [
 
395
  }}
396
  ]
397
  }}
 
398
  Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
 
399
  You have to examine every article available in the search results below.
 
400
  Text:
401
  {search_results}"""
402
 
403
  try:
404
+ if progress:
405
+ progress(0.5, desc="Sending request to AI model...")
406
+
407
  message = client.messages.create(
408
  model="claude-sonnet-4-20250514",
409
  max_tokens=1500,
 
415
  }
416
  ]
417
  )
418
+
419
+ if progress:
420
+ progress(0.9, desc="Processing AI response...")
421
+
422
+ result = message.content[0].text
423
+
424
+ if progress:
425
+ progress(1.0, desc="Analysis completed!")
426
+
427
+ return result
428
 
429
  except Exception as e:
430
+ if progress:
431
+ progress(1.0, desc="Analysis failed")
432
  return f"[ERROR] Extraction failed: {str(e)}"
433
 
434
+ # === Enhanced Gradio interface functions ===
435
 
436
+ def search_only_enhanced(name: str, article_count: int, progress=gr.Progress()):
437
+ """Enhanced search with progress tracking"""
438
  if not name.strip():
439
+ return "❌ No name provided", ""
440
 
441
  try:
442
  start = time.time()
443
+ progress(0, desc="Starting enhanced temporal search...")
444
+
445
+ articles_output = search_articles_enhanced(name.strip(), max_articles=article_count, progress=progress)
446
+
447
+ if "[CANCELLED]" in articles_output:
448
+ return articles_output, ""
449
+
450
  elapsed = time.time() - start
451
+ progress(1.0, desc=f"Search completed in {elapsed:.1f}s")
452
 
453
  results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
454
  results += articles_output
455
 
456
  return results, articles_output
457
+
458
  except Exception as e:
459
+ progress(1.0, desc="Search failed")
460
+ return f"❌ **Search failed**: {str(e)}", ""
461
 
462
+ def extract_only_enhanced(stored_results: str, company_name: str, progress=gr.Progress()):
463
+ """Enhanced extraction with progress tracking"""
464
  if not stored_results.strip():
465
+ return "❌ No search results available. Please search first."
466
 
467
  if not company_name.strip():
468
+ return "❌ No company name provided. Please search first."
469
+
470
+ if "[CANCELLED]" in stored_results:
471
+ return "❌ Cannot extract from cancelled search results. Please search again."
472
 
473
  try:
474
  start = time.time()
475
+ entities = extract_entities_enhanced(stored_results, company_name.strip(), progress)
476
  elapsed = time.time() - start
477
 
478
  # Try to format JSON for better readability
 
482
  return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
483
  except:
484
  return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
485
+
486
  except Exception as e:
487
+ progress(1.0, desc="Extraction failed")
488
+ return f"❌ **Extraction failed**: {str(e)}"
489
 
490
+ def cancel_search():
491
+ """Cancel the current search operation"""
492
+ cancel_operation.set()
493
+ return "🛑 **Cancellation requested** - stopping current operation..."
494
 
495
+ # === Enhanced Gradio UI ===
496
+
497
+ with gr.Blocks(title="Enhanced Founder Finder", theme=gr.themes.Soft()) as demo:
498
  gr.Markdown("# 🔎 Enhanced Founder Finder")
499
+ gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy** with **real-time progress tracking**.")
500
+ gr.Markdown("*🚀 **New Features**: Progress bars, cancellation support, parallel processing, better error handling*")
501
+ gr.Markdown("*⏱️ Note: Enhanced search typically takes 30–60 seconds with full progress visibility.*")
502
 
503
  search_state = gr.State("")
504
 
505
  with gr.Row():
506
+ with gr.Column(scale=2):
507
+ name_input = gr.Textbox(
508
+ label="Company Name",
509
+ placeholder="Enter business name (e.g., 'Tesla', 'SpaceX', 'Microsoft')",
510
+ lines=1
511
+ )
512
+ with gr.Column(scale=1):
513
+ article_count_slider = gr.Slider(
514
+ 2, 12,
515
+ value=4,
516
+ step=2,
517
+ label="Total Articles",
518
+ info="Split between recent/historical"
519
+ )
520
 
521
+ with gr.Row():
522
+ search_btn = gr.Button("🔍 Enhanced Temporal Search", variant="primary", size="lg")
523
+ cancel_btn = gr.Button("🛑 Cancel Search", variant="secondary", size="lg")
524
+ extract_btn = gr.Button("📊 Extract Founder Intelligence", variant="secondary", size="lg")
 
 
 
525
 
526
+ with gr.Row():
527
+ status_output = gr.Markdown("Ready to search...")
528
+
529
+ with gr.Row():
530
+ with gr.Column():
531
+ output1 = gr.Markdown(label="Search Results with Temporal Analysis", height=400)
532
+ with gr.Column():
533
+ output2 = gr.Textbox(
534
+ label="Founder Intelligence Report",
535
+ lines=15,
536
+ max_lines=25,
537
+ show_copy_button=True
538
+ )
539
+
540
+ # Event handlers
541
+ search_click = search_btn.click(
542
+ fn=search_only_enhanced,
543
  inputs=[name_input, article_count_slider],
544
+ outputs=[output1, search_state],
545
+ show_progress=True
546
+ )
547
+
548
+ cancel_btn.click(
549
+ fn=cancel_search,
550
+ outputs=[status_output]
551
  )
552
 
553
  extract_btn.click(
554
+ fn=extract_only_enhanced,
555
  inputs=[search_state, name_input],
556
+ outputs=[output2],
557
+ show_progress=True
558
+ )
559
+
560
+ # Add some example companies
561
+ gr.Examples(
562
+ examples=[
563
+ ["Tesla", 4],
564
+ ["SpaceX", 6],
565
+ ["Microsoft", 4],
566
+ ["Apple", 6],
567
+ ["OpenAI", 4]
568
+ ],
569
+ inputs=[name_input, article_count_slider],
570
  )
571
 
572
  if __name__ == "__main__":
573
+ demo.launch(
574
+ share=False,
575
+ show_error=True,
576
+ show_tips=True,
577
+ height=800
578
+ )
579
 
580
+ '''
581
+ import gradio as gr
582
  import requests
583
  import time
584
  import re
 
586
  from bs4 import BeautifulSoup
587
  import anthropic
588
  import os
589
+ from datetime import datetime, timedelta
590
+ from dateutil import parser
591
+ import json
592
 
593
  # Initialize Anthropic client
594
  client = anthropic.Anthropic(
 
597
 
598
  # === Model functions ===
599
 
600
+ def extract_publication_date(soup, url):
601
+ """Extract publication date from article HTML"""
602
+ try:
603
+ # Common date selectors
604
+ date_selectors = [
605
+ 'time[datetime]',
606
+ '.date', '.publish-date', '.published', '.post-date',
607
+ '[class*="date"]', '[class*="time"]',
608
+ 'meta[property="article:published_time"]',
609
+ 'meta[name="publishdate"]',
610
+ 'meta[name="date"]'
611
+ ]
612
+
613
+ for selector in date_selectors:
614
+ element = soup.select_one(selector)
615
+ if element:
616
+ date_text = element.get('datetime') or element.get('content') or element.get_text()
617
+ if date_text:
618
+ try:
619
+ return parser.parse(date_text)
620
+ except:
621
+ continue
622
+
623
+ # Look for date patterns in text
624
+ date_patterns = [
625
+ r'(\w+ \d{1,2}, \d{4})', # January 15, 2023
626
+ r'(\d{1,2}/\d{1,2}/\d{4})', # 01/15/2023
627
+ r'(\d{4}-\d{2}-\d{2})' # 2023-01-15
628
+ ]
629
+
630
+ text = soup.get_text()[:2000] # First 2000 chars
631
+ for pattern in date_patterns:
632
+ matches = re.findall(pattern, text)
633
+ if matches:
634
+ try:
635
+ return parser.parse(matches[0])
636
+ except:
637
+ continue
638
+
639
+ except Exception as e:
640
+ print(f"Date extraction error for {url}: {e}")
641
+
642
+ return None
643
+
644
  def get_full_article(url):
645
  try:
646
  headers = {
 
655
  response.raise_for_status()
656
  soup = BeautifulSoup(response.content, 'html.parser')
657
 
658
+ # Extract publication date
659
+ pub_date = extract_publication_date(soup, url)
660
+
661
  for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
662
  element.decompose()
663
 
 
675
  text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
676
  full_text = '\n\n'.join(text_parts)
677
  if len(full_text) > 300:
678
+ return full_text[:10000], pub_date
679
 
680
  body_text = soup.get_text(separator='\n\n', strip=True)
681
  body_text = re.sub(r'\n{3,}', '\n\n', body_text)
682
+ return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
683
 
684
  except requests.exceptions.Timeout:
685
+ return "[WARNING] Article fetch timeout - using snippet instead", None
686
  except requests.exceptions.RequestException:
687
+ return "[ERROR] Could not fetch article: Network error", None
688
  except Exception as e:
689
+ return f"[ERROR] Could not fetch article: {str(e)}", None
690
 
691
+ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
692
+ """Search for articles in specific timeframe"""
693
+
694
+ # Define search queries based on timeframe
695
+ if timeframe == "recent":
696
+ # Recent articles (news, updates, current events)
697
+ search_queries = [
698
+ f'"{name}" founder news 2024 2025',
699
+ f'"{name}" CEO founder recent',
700
+ f'"{name}" founder update latest'
701
+ ]
702
+ else: # historical
703
+ # Historical articles (founding, establishment, origin stories)
704
+ search_queries = [
705
+ f'"{name}" founded established history',
706
+ f'"{name}" founder origin story',
707
+ f'"{name}" started began founder',
708
+ f'"{name}" founder early days'
709
+ ]
710
 
711
+ all_results = []
712
+ max_retries = 2
713
  base_delay = 3
714
 
715
+ for query_idx, search_query in enumerate(search_queries):
716
+ if len(all_results) >= max_articles:
717
+ break
718
+
719
+ for attempt in range(max_retries):
720
+ try:
721
+ print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
722
+ time.sleep(base_delay * (attempt + 1))
723
 
724
+ configs = [
725
+ {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
726
+ {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
727
+ ]
 
728
 
729
+ config = configs[min(attempt, len(configs)-1)]
730
 
731
+ with DDGS(timeout=config['timeout']) as ddgs:
732
+ search_params = {
733
+ 'keywords': search_query,
734
+ 'max_results': max_articles - len(all_results) + 2, # Get a few extra to filter
735
+ 'safesearch': config['safesearch']
736
+ }
737
+ if config['region']:
738
+ search_params['region'] = config['region']
739
 
740
+ results = list(ddgs.text(**search_params))
741
+ print(f"Found {len(results)} results for query {query_idx + 1}")
742
 
743
+ if results:
744
+ # Add unique results (avoid duplicates)
745
+ existing_urls = {r.get('url', '') for r in all_results}
746
+ for result in results:
747
+ if len(all_results) >= max_articles:
748
+ break
749
+ url = result.get('href', '')
750
+ if url and url not in existing_urls:
751
+ all_results.append(result)
752
+ existing_urls.add(url)
753
+ break
754
+
755
+ except Exception as e:
756
+ print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
757
+ if attempt < max_retries - 1:
758
+ time.sleep(base_delay * (attempt + 2))
759
 
760
+ return all_results[:max_articles]
 
 
 
 
761
 
762
+ def categorize_article_by_date(pub_date):
763
+ """Categorize article as recent or historical based on publication date"""
764
+ if not pub_date:
765
+ return "unknown"
766
+
767
+ one_year_ago = datetime.now() - timedelta(days=365)
768
+
769
+ if pub_date >= one_year_ago:
770
+ return "recent"
771
+ else:
772
+ return "historical"
773
+
774
+ def search_articles(name: str, max_articles: int = 4) -> str:
775
+ """Enhanced search that ensures both recent and historical articles"""
776
+
777
+ # Split articles between recent and historical
778
+ recent_count = max_articles // 2
779
+ historical_count = max_articles - recent_count
780
+
781
+ print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
782
+
783
+ # Search for recent articles
784
+ recent_results = search_articles_by_timeframe(name, "recent", recent_count)
785
+ time.sleep(2) # Brief pause between timeframe searches
786
+
787
+ # Search for historical articles
788
+ historical_results = search_articles_by_timeframe(name, "historical", historical_count)
789
+
790
+ # Combine and process all results
791
+ all_results = []
792
+
793
+ # Process recent articles
794
+ for result in recent_results:
795
+ result['expected_timeframe'] = 'recent'
796
+ all_results.append(result)
797
+
798
+ # Process historical articles
799
+ for result in historical_results:
800
+ result['expected_timeframe'] = 'historical'
801
+ all_results.append(result)
802
+
803
+ if not all_results:
804
+ return f"[INFO] No articles found for {name}"
805
+
806
+ # Fetch and categorize articles
807
+ articles = []
808
+ recent_found = 0
809
+ historical_found = 0
810
+
811
+ for i, result in enumerate(all_results, 1):
812
+ url = result.get('href', 'No URL')
813
+ title = result.get('title', 'No Title')
814
+ snippet = result.get('body', 'No snippet available')
815
+ expected_timeframe = result.get('expected_timeframe', 'unknown')
816
 
817
+ if i > 1:
818
+ time.sleep(2)
 
 
 
 
819
 
820
+ full_text, pub_date = get_full_article(url)
821
+ actual_timeframe = categorize_article_by_date(pub_date)
822
+
823
+ # Count articles by actual timeframe
824
+ if actual_timeframe == "recent":
825
+ recent_found += 1
826
+ elif actual_timeframe == "historical":
827
+ historical_found += 1
828
+
829
+ if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
830
+ print(f"Using snippet fallback for article {i}")
831
+ content = f"[SNIPPET ONLY]\n{snippet}"
832
+ else:
833
+ content = full_text
834
 
835
+ # Create timeframe indicator
836
+ timeframe_indicator = ""
837
+ if pub_date:
838
+ date_str = pub_date.strftime("%B %d, %Y")
839
+ timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
840
+ else:
841
+ timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
842
 
843
+ article = f"### {i}. {title}\n"
844
+ article += f"[Source]({url})\n"
845
+ article += f"{timeframe_indicator}\n\n"
846
+ article += f"{content}\n"
847
+ articles.append(article)
 
848
 
849
+ # Add summary of coverage
850
+ summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
851
+
852
+ return summary + "\n---\n".join(articles)
853
 
854
  def extract_entities(search_results: str, company_name: str) -> str:
855
  """Extract entities using Claude 4"""
856
+ MAX_CHARS = 15000
857
  if len(search_results) > MAX_CHARS:
858
  trunc = search_results[:MAX_CHARS]
859
  last_period = trunc.rfind('. ')
 
862
  prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
863
  Only include founders who are explicitly mentioned as founders of {company_name}.
864
  Ignore founders of other companies that may be mentioned in the text.
865
+
866
+ Return a JSON object with the following structure:
867
+ {{
868
+ "founders": [
869
+ {{
870
+ "name": "Founder Name",
871
+ "evidence": ["brief quote or context where they were mentioned as founder"]
872
+ }}
873
+ ]
874
+ }}
875
+
876
+ Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
877
+
878
+ You have to examine every article available in the search results below.
879
+
880
  Text:
881
  {search_results}"""
882
 
883
  try:
884
  message = client.messages.create(
885
  model="claude-sonnet-4-20250514",
886
+ max_tokens=1500,
887
+ temperature=0.1,
888
  messages=[
889
  {
890
  "role": "user",
 
908
  articles_output = search_articles(name.strip(), max_articles=article_count)
909
  elapsed = time.time() - start
910
 
911
+ results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
912
  results += articles_output
913
 
914
  return results, articles_output
 
926
  start = time.time()
927
  entities = extract_entities(stored_results, company_name.strip())
928
  elapsed = time.time() - start
929
+
930
+ # Try to format JSON for better readability
931
+ try:
932
+ parsed = json.loads(entities)
933
+ formatted = json.dumps(parsed, indent=2)
934
+ return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
935
+ except:
936
+ return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
937
  except Exception as e:
938
  return f"[ERROR] Extraction failed: {str(e)}"
939
 
940
  # === Gradio UI ===
941
 
942
+ with gr.Blocks(title="Enhanced Founder Finder") as demo:
943
+ gr.Markdown("# 🔎 Enhanced Founder Finder")
944
+ gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
945
+ gr.Markdown("*🚀 **New**: Automatically searches for both recent news AND historical founding information*")
946
+ gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")
947
 
948
  search_state = gr.State("")
949
 
950
  with gr.Row():
951
  name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
952
+ article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
953
  with gr.Column():
954
+ search_btn = gr.Button("🔍 Enhanced Temporal Search", variant="primary")
955
+ extract_btn = gr.Button("📊 Extract Founder Intelligence", variant="secondary")
956
 
957
+ output1 = gr.Markdown(label="Search Results with Temporal Analysis")
958
  output2 = gr.Textbox(
959
+ label="Founder Intelligence Report",
960
+ lines=15,
961
+ max_lines=25,
962
  show_copy_button=True
963
  )
964
 
 
976
 
977
  if __name__ == "__main__":
978
  demo.launch()
979
+
980
+ '''