Update app.py
app.py
CHANGED

Removed from the previous app.py (grouped by diff hunk):

@@ -9,18 +9,31 @@ import os
-    api_key=os.getenv("ANTHROPIC_API_KEY")
-    """Extract publication date from article HTML"""
-        # Common date selectors

@@ -40,14 +53,13 @@ def extract_publication_date(soup, url):
-        # Look for date patterns in text
-            r'(\w+ \d{1,2}, \d{4})',
-            r'(\d{1,2}/\d{1,2}/\d{4})',
-            r'(\d{4}-\d{2}-\d{2})'
-        text = soup.get_text()[:2000]

@@ -61,7 +73,11 @@ def extract_publication_date(soup, url):

@@ -71,11 +87,10 @@ def get_full_article(url):
-        response = requests.get(url, headers=headers, timeout=
-        # Extract publication date

@@ -103,24 +118,24 @@ def get_full_article(url):
-    except requests.exceptions.RequestException:
-        return "[ERROR]
-    # Define search queries based on timeframe
-        # Recent articles (news, updates, current events)
-    else:
-        # Historical articles (founding, establishment, origin stories)

@@ -130,20 +145,31 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
-    base_delay =
-        if len(all_results) >= max_articles:
-                    {'timeout':
-                    {'timeout':

@@ -151,17 +177,15 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
-                        'max_results': max_articles - len(all_results) + 2,
-                    print(f"Found {len(results)} results for query {query_idx + 1}")
-                        # Add unique results (avoid duplicates)

@@ -175,114 +199,193 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
-                    time.sleep(base_delay * (attempt +
-    # Split articles between recent and historical
-    historical_results = search_articles_by_timeframe(name, "historical", historical_count)
-    # Process historical articles
-        full_text, pub_date = get_full_article(url)
-        actual_timeframe = categorize_article_by_date(pub_date)
-        # Count articles by actual timeframe
-        if actual_timeframe == "recent":
-            recent_found += 1
-        elif actual_timeframe == "historical":
-            historical_found += 1

@@ -292,15 +395,15 @@ Return a JSON object with the following structure:

@@ -312,39 +415,64 @@ Text:
-# === Gradio interface functions ===
-        return "No name provided", ""
-        return "No search results available. Please search first."
-        return "No company name provided. Please search first."

@@ -354,50 +482,103 @@ def extract_only(stored_results: str, company_name: str):
-    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
-    gr.Markdown("*π **New**:
-    gr.Markdown("*⏱️ Note: Enhanced search
-        max_lines=25,
-        show_copy_button=True
-    )
-        outputs=[output1, search_state]
-        outputs=[output2]
-    demo.launch(
-'''

@@ -405,6 +586,9 @@ from duckduckgo_search import DDGS

@@ -413,6 +597,50 @@ client = anthropic.Anthropic(

@@ -427,6 +655,9 @@ def get_full_article(url):

@@ -444,89 +675,185 @@ def get_full_article(url):
-                return full_text[:10000]
-        return body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"
-        return "[WARNING] Article fetch timeout - using snippet instead"
-        return "[ERROR] Could not fetch article: Network error"
-        return f"[ERROR] Could not fetch article: {str(e)}"
-        for i, result in enumerate(results, 1):
-            url = result.get('href', 'No URL')
-            title = result.get('title', 'No Title')
-            snippet = result.get('body', 'No snippet available')
-                print(f"Using snippet fallback for article {i}")
-                content = f"[SNIPPET ONLY]\n{snippet}"
-            else:
-                content = full_text
-        return f"[ERROR] Search failed after {max_retries} attempts. Last error: {str(e)}"
-    MAX_CHARS =

@@ -535,18 +862,29 @@ def extract_entities(search_results: str, company_name: str) -> str:
-            max_tokens=
-            temperature=0.

@@ -570,7 +908,7 @@ def search_only(name: str, article_count: int):
-        results = f"✅ Search completed for **{name}** in {elapsed:.1f}s\n\n"

@@ -588,31 +926,39 @@ def extract_only(stored_results: str, company_name: str):
-with gr.Blocks(title="Founder Finder") as demo:
-    gr.Markdown("# π Founder Finder")
-    gr.Markdown("Enter a business or project name to search for its founder
-        article_count_slider = gr.Slider(
-        search_btn = gr.Button("π Search
-    output1 = gr.Markdown(label="Search Results")

@@ -630,5 +976,5 @@ with gr.Blocks(title="Founder Finder") as demo:

Current app.py (lines added by this commit are marked "+"; unmarked lines are unchanged context; "..." marks elided unchanged code):

...
from datetime import datetime, timedelta
from dateutil import parser
import json
+import threading
+from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
+import signal

# Initialize Anthropic client
client = anthropic.Anthropic(
+    api_key=os.getenv("ANTHROPIC_API_KEY")
)

+# Global variable to track cancellation
+cancel_operation = threading.Event()
+
+def reset_cancellation():
+    """Reset the cancellation flag"""
+    cancel_operation.clear()
+
+def check_cancellation():
+    """Check if operation should be cancelled"""
+    return cancel_operation.is_set()
+
+# === Enhanced Model functions with progress tracking ===

def extract_publication_date(soup, url):
+    """Extract publication date from article HTML - same as before"""
    try:
        date_selectors = [
            'time[datetime]',
            '.date', '.publish-date', '.published', '.post-date',
...
            except:
                continue

        date_patterns = [
+            r'(\w+ \d{1,2}, \d{4})',
+            r'(\d{1,2}/\d{1,2}/\d{4})',
+            r'(\d{4}-\d{2}-\d{2})'
        ]

+        text = soup.get_text()[:2000]
        for pattern in date_patterns:
            matches = re.findall(pattern, text)
            if matches:
...
    return None

+def get_full_article_with_timeout(url, timeout=15):
+    """Enhanced article fetching with timeout and better error handling"""
+    if check_cancellation():
+        return "[CANCELLED] Operation was cancelled", None
+
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
...
            'Upgrade-Insecure-Requests': '1'
        }

+        response = requests.get(url, headers=headers, timeout=timeout, verify=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        pub_date = extract_publication_date(soup, url)

        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
...
    except requests.exceptions.Timeout:
        return "[WARNING] Article fetch timeout - using snippet instead", None
+    except requests.exceptions.RequestException as e:
+        return f"[ERROR] Network error: {str(e)}", None
    except Exception as e:
        return f"[ERROR] Could not fetch article: {str(e)}", None

+def search_articles_by_timeframe_enhanced(name: str, timeframe: str, max_articles: int = 3, progress=None) -> list:
+    """Enhanced search with progress tracking and better error handling"""
+
+    if check_cancellation():
+        return []

    if timeframe == "recent":
        search_queries = [
            f'"{name}" founder news 2024 2025',
            f'"{name}" CEO founder recent',
            f'"{name}" founder update latest'
        ]
+    else:
        search_queries = [
            f'"{name}" founded established history',
            f'"{name}" founder origin story',
...

    all_results = []
    max_retries = 2
+    base_delay = 2  # Reduced delay

+    total_queries = len(search_queries)
+
    for query_idx, search_query in enumerate(search_queries):
+        if len(all_results) >= max_articles or check_cancellation():
            break

+        if progress:
+            query_progress = (query_idx / total_queries) * 0.3  # 30% of progress for queries
+            progress(query_progress, desc=f"Searching {timeframe} articles ({query_idx + 1}/{total_queries})")
+
        for attempt in range(max_retries):
+            if check_cancellation():
+                return all_results
+
            try:
                print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
+
+                if attempt > 0:
+                    time.sleep(base_delay * attempt)

                configs = [
+                    {'timeout': 15, 'region': 'us-en', 'safesearch': 'moderate'},
+                    {'timeout': 20, 'region': 'wt-wt', 'safesearch': 'off'}
                ]

                config = configs[min(attempt, len(configs)-1)]
...
                with DDGS(timeout=config['timeout']) as ddgs:
                    search_params = {
                        'keywords': search_query,
+                        'max_results': max_articles - len(all_results) + 2,
                        'safesearch': config['safesearch']
                    }
                    if config['region']:
                        search_params['region'] = config['region']

                    results = list(ddgs.text(**search_params))

                    if results:
                        existing_urls = {r.get('url', '') for r in all_results}
                        for result in results:
                            if len(all_results) >= max_articles:
...
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
                if attempt < max_retries - 1:
+                    time.sleep(base_delay * (attempt + 1))

    return all_results[:max_articles]

def categorize_article_by_date(pub_date):
+    """Same as before"""
    if not pub_date:
        return "unknown"

    one_year_ago = datetime.now() - timedelta(days=365)
+    return "recent" if pub_date >= one_year_ago else "historical"
+
+def fetch_article_parallel(result, article_num, total_articles, progress=None):
+    """Fetch single article with progress update"""
+    if check_cancellation():
+        return None
+
+    url = result.get('href', 'No URL')
+    title = result.get('title', 'No Title')
+    snippet = result.get('body', 'No snippet available')
+    expected_timeframe = result.get('expected_timeframe', 'unknown')
+
+    if progress:
+        fetch_progress = 0.4 + (article_num / total_articles) * 0.5  # 40-90% of total progress
+        progress(fetch_progress, desc=f"Fetching article {article_num + 1}/{total_articles}: {title[:50]}...")
+
+    full_text, pub_date = get_full_article_with_timeout(url, timeout=12)

+    if check_cancellation():
+        return None
+
+    actual_timeframe = categorize_article_by_date(pub_date)
+
+    if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error", "[CANCELLED]"]):
+        content = f"[SNIPPET ONLY]\n{snippet}"
    else:
+        content = full_text

+    timeframe_indicator = ""
+    if pub_date:
+        date_str = pub_date.strftime("%B %d, %Y")
+        timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
+    else:
+        timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
+
+    article = f"### {article_num + 1}. {title}\n"
+    article += f"[Source]({url})\n"
+    article += f"{timeframe_indicator}\n\n"
+    article += f"{content}\n"
+
+    return {
+        'article': article,
+        'timeframe': actual_timeframe,
+        'url': url,
+        'title': title
+    }
+
+def search_articles_enhanced(name: str, max_articles: int = 4, progress=None) -> str:
+    """Enhanced search with progress tracking and parallel processing"""
+
+    reset_cancellation()  # Reset cancellation flag
+
+    if progress:
+        progress(0, desc="Initializing enhanced search...")

    recent_count = max_articles // 2
    historical_count = max_articles - recent_count

+    if progress:
+        progress(0.05, desc=f"Planning search: {recent_count} recent + {historical_count} historical articles")

    # Search for recent articles
+    if progress:
+        progress(0.1, desc="Searching for recent articles...")

+    recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress)

+    if check_cancellation():
+        return "[CANCELLED] Search was cancelled by user"

+    if progress:
+        progress(0.3, desc="Searching for historical articles...")
+
+    # Brief pause between searches
+    time.sleep(1)
+
+    historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress)
+
+    if check_cancellation():
+        return "[CANCELLED] Search was cancelled by user"
+
+    # Combine results
+    all_results = []
    for result in recent_results:
        result['expected_timeframe'] = 'recent'
        all_results.append(result)

    for result in historical_results:
        result['expected_timeframe'] = 'historical'
        all_results.append(result)

    if not all_results:
+        if progress:
+            progress(1.0, desc="Search completed - no results found")
        return f"[INFO] No articles found for {name}"

+    if progress:
+        progress(0.4, desc=f"Found {len(all_results)} articles, now fetching content...")
+
+    # Fetch articles with parallel processing (but limited concurrency)
    articles = []
    recent_found = 0
    historical_found = 0

+    # Use ThreadPoolExecutor for controlled parallel fetching
+    max_workers = min(3, len(all_results))  # Limit concurrent requests
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all tasks
+        future_to_result = {
+            executor.submit(fetch_article_parallel, result, i, len(all_results), progress): (result, i)
+            for i, result in enumerate(all_results)
+        }

+        # Collect results as they complete
+        for future in as_completed(future_to_result, timeout=60):  # 60 second timeout
+            if check_cancellation():
+                # Cancel remaining futures
+                for f in future_to_result:
+                    f.cancel()
+                return "[CANCELLED] Search was cancelled by user"
+
+            try:
+                result_data = future.result(timeout=15)
+                if result_data:
+                    articles.append(result_data)
+
+                    # Count by timeframe
+                    if result_data['timeframe'] == "recent":
+                        recent_found += 1
+                    elif result_data['timeframe'] == "historical":
+                        historical_found += 1
+
+            except TimeoutError:
+                print("Article fetch timed out")
+                continue
+            except Exception as e:
+                print(f"Error fetching article: {e}")
+                continue
+
+    if check_cancellation():
+        return "[CANCELLED] Search was cancelled by user"
+
+    if progress:
+        progress(0.95, desc="Formatting results...")
+
+    # Sort articles to maintain order
+    articles.sort(key=lambda x: all_results.index(next(r for r in all_results if r.get('href', '') == x['url'])))
+
+    # Create final output
    summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"

+    article_texts = [article_data['article'] for article_data in articles]
+
+    if progress:
+        progress(1.0, desc=f"Search completed! Found {len(articles)} articles")
+
+    return summary + "\n---\n".join(article_texts)

+def extract_entities_enhanced(search_results: str, company_name: str, progress=None) -> str:
+    """Enhanced entity extraction with progress tracking"""
+
+    if progress:
+        progress(0, desc="Preparing text for analysis...")
+
    MAX_CHARS = 15000
    if len(search_results) > MAX_CHARS:
        trunc = search_results[:MAX_CHARS]
        last_period = trunc.rfind('. ')
        search_results = trunc[:last_period + 1] if last_period > 3000 else trunc

+    if progress:
+        progress(0.2, desc="Analyzing articles with AI...")
+
    prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
Only include founders who are explicitly mentioned as founders of {company_name}.
Ignore founders of other companies that may be mentioned in the text.
Return a JSON object with the following structure:
{{
  "founders": [
...
    }}
  ]
}}
Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
You have to examine every article available in the search results below.
Text:
{search_results}"""

    try:
+        if progress:
+            progress(0.5, desc="Sending request to AI model...")
+
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1500,
...
                }
            ]
        )
+
+        if progress:
+            progress(0.9, desc="Processing AI response...")
+
+        result = message.content[0].text
+
+        if progress:
+            progress(1.0, desc="Analysis completed!")
+
+        return result

    except Exception as e:
+        if progress:
+            progress(1.0, desc="Analysis failed")
        return f"[ERROR] Extraction failed: {str(e)}"

+# === Enhanced Gradio interface functions ===

+def search_only_enhanced(name: str, article_count: int, progress=gr.Progress()):
+    """Enhanced search with progress tracking"""
    if not name.strip():
+        return "❌ No name provided", ""

    try:
        start = time.time()
+        progress(0, desc="Starting enhanced temporal search...")
+
+        articles_output = search_articles_enhanced(name.strip(), max_articles=article_count, progress=progress)
+
+        if "[CANCELLED]" in articles_output:
+            return articles_output, ""
+
        elapsed = time.time() - start
+        progress(1.0, desc=f"Search completed in {elapsed:.1f}s")

        results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
        results += articles_output

        return results, articles_output
+
    except Exception as e:
+        progress(1.0, desc="Search failed")
+        return f"❌ **Search failed**: {str(e)}", ""

+def extract_only_enhanced(stored_results: str, company_name: str, progress=gr.Progress()):
+    """Enhanced extraction with progress tracking"""
    if not stored_results.strip():
+        return "❌ No search results available. Please search first."

    if not company_name.strip():
+        return "❌ No company name provided. Please search first."
+
+    if "[CANCELLED]" in stored_results:
+        return "❌ Cannot extract from cancelled search results. Please search again."

    try:
        start = time.time()
+        entities = extract_entities_enhanced(stored_results, company_name.strip(), progress)
        elapsed = time.time() - start

        # Try to format JSON for better readability
...
            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
        except:
            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
+
    except Exception as e:
+        progress(1.0, desc="Extraction failed")
+        return f"❌ **Extraction failed**: {str(e)}"

+def cancel_search():
+    """Cancel the current search operation"""
+    cancel_operation.set()
+    return "π **Cancellation requested** - stopping current operation..."

+# === Enhanced Gradio UI ===
+
+with gr.Blocks(title="Enhanced Founder Finder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π Enhanced Founder Finder")
+    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy** with **real-time progress tracking**.")
+    gr.Markdown("*π **New Features**: Progress bars, cancellation support, parallel processing, better error handling*")
+    gr.Markdown("*⏱️ Note: Enhanced search typically takes 30–60 seconds with full progress visibility.*")

    search_state = gr.State("")

    with gr.Row():
+        with gr.Column(scale=2):
+            name_input = gr.Textbox(
+                label="Company Name",
+                placeholder="Enter business name (e.g., 'Tesla', 'SpaceX', 'Microsoft')",
+                lines=1
+            )
+        with gr.Column(scale=1):
+            article_count_slider = gr.Slider(
+                2, 12,
+                value=4,
+                step=2,
+                label="Total Articles",
+                info="Split between recent/historical"
+            )

+    with gr.Row():
+        search_btn = gr.Button("π Enhanced Temporal Search", variant="primary", size="lg")
+        cancel_btn = gr.Button("π Cancel Search", variant="secondary", size="lg")
+        extract_btn = gr.Button("π Extract Founder Intelligence", variant="secondary", size="lg")

+    with gr.Row():
+        status_output = gr.Markdown("Ready to search...")
+
+    with gr.Row():
+        with gr.Column():
+            output1 = gr.Markdown(label="Search Results with Temporal Analysis", height=400)
+        with gr.Column():
+            output2 = gr.Textbox(
+                label="Founder Intelligence Report",
+                lines=15,
+                max_lines=25,
+                show_copy_button=True
+            )
+
+    # Event handlers
+    search_click = search_btn.click(
+        fn=search_only_enhanced,
        inputs=[name_input, article_count_slider],
+        outputs=[output1, search_state],
+        show_progress=True
+    )
+
+    cancel_btn.click(
+        fn=cancel_search,
+        outputs=[status_output]
    )

    extract_btn.click(
+        fn=extract_only_enhanced,
        inputs=[search_state, name_input],
+        outputs=[output2],
+        show_progress=True
+    )
+
+    # Add some example companies
+    gr.Examples(
+        examples=[
+            ["Tesla", 4],
+            ["SpaceX", 6],
+            ["Microsoft", 4],
+            ["Apple", 6],
+            ["OpenAI", 4]
+        ],
+        inputs=[name_input, article_count_slider],
    )

if __name__ == "__main__":
+    demo.launch(
+        share=False,
+        show_error=True,
+        show_tips=True,
+        height=800
+    )

+'''
+import gradio as gr
import requests
import time
import re
...
from bs4 import BeautifulSoup
import anthropic
import os
+from datetime import datetime, timedelta
+from dateutil import parser
+import json

# Initialize Anthropic client
client = anthropic.Anthropic(
...
# === Model functions ===

+def extract_publication_date(soup, url):
+    """Extract publication date from article HTML"""
+    try:
+        # Common date selectors
+        date_selectors = [
+            'time[datetime]',
+            '.date', '.publish-date', '.published', '.post-date',
+            '[class*="date"]', '[class*="time"]',
+            'meta[property="article:published_time"]',
+            'meta[name="publishdate"]',
+            'meta[name="date"]'
+        ]
+
+        for selector in date_selectors:
+            element = soup.select_one(selector)
+            if element:
+                date_text = element.get('datetime') or element.get('content') or element.get_text()
+                if date_text:
+                    try:
+                        return parser.parse(date_text)
+                    except:
+                        continue
+
+        # Look for date patterns in text
+        date_patterns = [
+            r'(\w+ \d{1,2}, \d{4})',      # January 15, 2023
+            r'(\d{1,2}/\d{1,2}/\d{4})',   # 01/15/2023
+            r'(\d{4}-\d{2}-\d{2})'        # 2023-01-15
+        ]
+
+        text = soup.get_text()[:2000]  # First 2000 chars
+        for pattern in date_patterns:
+            matches = re.findall(pattern, text)
+            if matches:
+                try:
+                    return parser.parse(matches[0])
+                except:
+                    continue
+
+    except Exception as e:
+        print(f"Date extraction error for {url}: {e}")
+
+    return None
+
def get_full_article(url):
    try:
        headers = {
...
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

+        # Extract publication date
+        pub_date = extract_publication_date(soup, url)
+
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
            element.decompose()

...
            text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
            full_text = '\n\n'.join(text_parts)
            if len(full_text) > 300:
+                return full_text[:10000], pub_date

        body_text = soup.get_text(separator='\n\n', strip=True)
        body_text = re.sub(r'\n{3,}', '\n\n', body_text)
+        return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date

    except requests.exceptions.Timeout:
+        return "[WARNING] Article fetch timeout - using snippet instead", None
    except requests.exceptions.RequestException:
+        return "[ERROR] Could not fetch article: Network error", None
    except Exception as e:
+        return f"[ERROR] Could not fetch article: {str(e)}", None

+def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
+    """Search for articles in specific timeframe"""
+
+    # Define search queries based on timeframe
+    if timeframe == "recent":
+        # Recent articles (news, updates, current events)
+        search_queries = [
+            f'"{name}" founder news 2024 2025',
+            f'"{name}" CEO founder recent',
+            f'"{name}" founder update latest'
+        ]
+    else:  # historical
+        # Historical articles (founding, establishment, origin stories)
+        search_queries = [
+            f'"{name}" founded established history',
+            f'"{name}" founder origin story',
+            f'"{name}" started began founder',
+            f'"{name}" founder early days'
+        ]

+    all_results = []
+    max_retries = 2
    base_delay = 3

+    for query_idx, search_query in enumerate(search_queries):
+        if len(all_results) >= max_articles:
+            break
+
+        for attempt in range(max_retries):
+            try:
+                print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
+                time.sleep(base_delay * (attempt + 1))

+                configs = [
+                    {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
+                    {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
+                ]

+                config = configs[min(attempt, len(configs)-1)]

+                with DDGS(timeout=config['timeout']) as ddgs:
+                    search_params = {
+                        'keywords': search_query,
+                        'max_results': max_articles - len(all_results) + 2,  # Get a few extra to filter
+                        'safesearch': config['safesearch']
+                    }
+                    if config['region']:
+                        search_params['region'] = config['region']

+                    results = list(ddgs.text(**search_params))
+                    print(f"Found {len(results)} results for query {query_idx + 1}")

+                    if results:
+                        # Add unique results (avoid duplicates)
+                        existing_urls = {r.get('url', '') for r in all_results}
+                        for result in results:
+                            if len(all_results) >= max_articles:
+                                break
+                            url = result.get('href', '')
+                            if url and url not in existing_urls:
+                                all_results.append(result)
+                                existing_urls.add(url)
+                        break
+
+            except Exception as e:
+                print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
+                if attempt < max_retries - 1:
+                    time.sleep(base_delay * (attempt + 2))

+    return all_results[:max_articles]

+def categorize_article_by_date(pub_date):
+    """Categorize article as recent or historical based on publication date"""
+    if not pub_date:
+        return "unknown"
+
+    one_year_ago = datetime.now() - timedelta(days=365)
+
+    if pub_date >= one_year_ago:
+        return "recent"
+    else:
+        return "historical"
+
+def search_articles(name: str, max_articles: int = 4) -> str:
+    """Enhanced search that ensures both recent and historical articles"""
+
+    # Split articles between recent and historical
+    recent_count = max_articles // 2
+    historical_count = max_articles - recent_count
+
+    print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
+
+    # Search for recent articles
+    recent_results = search_articles_by_timeframe(name, "recent", recent_count)
+    time.sleep(2)  # Brief pause between timeframe searches
+
+    # Search for historical articles
+    historical_results = search_articles_by_timeframe(name, "historical", historical_count)
+
+    # Combine and process all results
+    all_results = []
+
+    # Process recent articles
+    for result in recent_results:
+        result['expected_timeframe'] = 'recent'
+        all_results.append(result)
+
+    # Process historical articles
+    for result in historical_results:
+        result['expected_timeframe'] = 'historical'
+        all_results.append(result)
+
+    if not all_results:
+        return f"[INFO] No articles found for {name}"
+
+    # Fetch and categorize articles
+    articles = []
+    recent_found = 0
+    historical_found = 0
+
+    for i, result in enumerate(all_results, 1):
+        url = result.get('href', 'No URL')
+        title = result.get('title', 'No Title')
+        snippet = result.get('body', 'No snippet available')
+        expected_timeframe = result.get('expected_timeframe', 'unknown')

+        if i > 1:
+            time.sleep(2)

+        full_text, pub_date = get_full_article(url)
+        actual_timeframe = categorize_article_by_date(pub_date)
+
+        # Count articles by actual timeframe
+        if actual_timeframe == "recent":
+            recent_found += 1
+        elif actual_timeframe == "historical":
+            historical_found += 1
+
+        if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
+            print(f"Using snippet fallback for article {i}")
+            content = f"[SNIPPET ONLY]\n{snippet}"
+        else:
+            content = full_text

+        # Create timeframe indicator
+        timeframe_indicator = ""
+        if pub_date:
+            date_str = pub_date.strftime("%B %d, %Y")
+            timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
+        else:
+            timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"

+        article = f"### {i}. {title}\n"
+        article += f"[Source]({url})\n"
+        article += f"{timeframe_indicator}\n\n"
+        article += f"{content}\n"
+        articles.append(article)

+    # Add summary of coverage
+    summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
+
+    return summary + "\n---\n".join(articles)

def extract_entities(search_results: str, company_name: str) -> str:
    """Extract entities using Claude 4"""
+    MAX_CHARS = 15000
    if len(search_results) > MAX_CHARS:
        trunc = search_results[:MAX_CHARS]
        last_period = trunc.rfind('. ')
...
    prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
Only include founders who are explicitly mentioned as founders of {company_name}.
Ignore founders of other companies that may be mentioned in the text.
+
+Return a JSON object with the following structure:
+{{
+  "founders": [
+    {{
+      "name": "Founder Name",
+      "evidence": ["brief quote or context where they were mentioned as founder"]
+    }}
+  ]
+}}
+
+Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
+
+You have to examine every article available in the search results below.
+
Text:
{search_results}"""

    try:
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
+            max_tokens=1500,
+            temperature=0.1,
            messages=[
                {
                    "role": "user",
...
        articles_output = search_articles(name.strip(), max_articles=article_count)
        elapsed = time.time() - start

+        results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
        results += articles_output

        return results, articles_output
...
        start = time.time()
        entities = extract_entities(stored_results, company_name.strip())
        elapsed = time.time() - start
+
+        # Try to format JSON for better readability
+        try:
+            parsed = json.loads(entities)
+            formatted = json.dumps(parsed, indent=2)
+            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
+        except:
+            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
    except Exception as e:
        return f"[ERROR] Extraction failed: {str(e)}"

# === Gradio UI ===

+with gr.Blocks(title="Enhanced Founder Finder") as demo:
+    gr.Markdown("# π Enhanced Founder Finder")
+    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
+    gr.Markdown("*π **New**: Automatically searches for both recent news AND historical founding information*")
+    gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")

    search_state = gr.State("")

    with gr.Row():
        name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
+        article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
    with gr.Column():
+        search_btn = gr.Button("π Enhanced Temporal Search", variant="primary")
+        extract_btn = gr.Button("π Extract Founder Intelligence", variant="secondary")

+    output1 = gr.Markdown(label="Search Results with Temporal Analysis")
    output2 = gr.Textbox(
+        label="Founder Intelligence Report",
+        lines=15,
+        max_lines=25,
        show_copy_button=True
    )
...
if __name__ == "__main__":
    demo.launch()
+
+'''