dygoo committed on
Commit ddd51b5 · verified · 1 Parent(s): 99f18e1

Update app.py

Files changed (1)
  1. app.py +4 -406
app.py CHANGED
@@ -13,8 +13,7 @@ anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
 
 def search_workflow(name: str, progress=gr.Progress()):
     """
-    A simple, sequential, and robust function to search for articles.
-    It fetches exactly 8 articles: 4 recent, 4 historical.
+    A simple function to search for articles, fetching exactly 8 news articles: 4 recent, 4 historical.
     """
     if not name or not name.strip():
         return "❌ Please enter a company name.", ""
@@ -80,7 +79,7 @@ def search_workflow(name: str, progress=gr.Progress()):
 
 def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
     """
-    A simple and robust function to extract founders from text using the AI model.
+    A simple and robust tool to extract founders from text using the AI model.
     """
     if not raw_text or not raw_text.strip():
         return "❌ Please run a search first to get text to analyze."
@@ -135,8 +134,8 @@ ARTICLES:
 # === 3. Simplified Gradio UI ===
 
 with gr.Blocks(title="Founder Finder", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🔎 Simple Founder Finder")
-    gr.Markdown("A simplified and robust tool to find company founders. **Step 1:** Search for articles. **Step 2:** Extract founders from the results.")
+    gr.Markdown("# 🔎 Founder Finder")
+    gr.Markdown("A tool to find company founders. **Step 1:** Search for articles. **Step 2:** Extract founders from the results.")
 
     # Hidden state to pass text from search to extraction
     search_results_for_ai = gr.State("")
@@ -178,404 +177,3 @@ demo.queue()
 if __name__ == "__main__":
     demo.launch(show_error=True)
 
-'''
-import gradio as gr
-import requests
-import time
-import re
-from duckduckgo_search import DDGS
-from bs4 import BeautifulSoup
-import anthropic
-import os
-from datetime import datetime, timedelta
-from dateutil import parser
-import json
-
-# Initialize Anthropic client
-client = anthropic.Anthropic(
-    api_key=os.getenv("ANTHROPIC_API_KEY") # Set as secret in HF Space settings
-)
-
-# === Model functions ===
-
-def extract_publication_date(soup, url):
-    """Extract publication date from article HTML"""
-    try:
-        # Common date selectors
-        date_selectors = [
-            'time[datetime]',
-            '.date', '.publish-date', '.published', '.post-date',
-            '[class*="date"]', '[class*="time"]',
-            'meta[property="article:published_time"]',
-            'meta[name="publishdate"]',
-            'meta[name="date"]'
-        ]
-
-        for selector in date_selectors:
-            element = soup.select_one(selector)
-            if element:
-                date_text = element.get('datetime') or element.get('content') or element.get_text()
-                if date_text:
-                    try:
-                        return parser.parse(date_text)
-                    except:
-                        continue
-
-        # Look for date patterns in text
-        date_patterns = [
-            r'(\w+ \d{1,2}, \d{4})', # January 15, 2023
-            r'(\d{1,2}/\d{1,2}/\d{4})', # 01/15/2023
-            r'(\d{4}-\d{2}-\d{2})' # 2023-01-15
-        ]
-
-        text = soup.get_text()[:2000] # First 2000 chars
-        for pattern in date_patterns:
-            matches = re.findall(pattern, text)
-            if matches:
-                try:
-                    return parser.parse(matches[0])
-                except:
-                    continue
-
-    except Exception as e:
-        print(f"Date extraction error for {url}: {e}")
-
-    return None
-
-def get_full_article(url):
-    try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1'
-        }
-
-        response = requests.get(url, headers=headers, timeout=20, verify=True)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Extract publication date
-        pub_date = extract_publication_date(soup, url)
-
-        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
-            element.decompose()
-
-        article_selectors = [
-            'article', '.article-content', '.post-content', '.story-body', '.story-content',
-            '.entry-content', '.content-body', '.article-body', 'main article', 'main .content', 'main',
-            '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
-        ]
-
-        for selector in article_selectors:
-            content = soup.select_one(selector)
-            if content:
-                paragraphs = content.find_all(['p', 'div'], string=True)
-                if paragraphs:
-                    text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
-                    full_text = '\n\n'.join(text_parts)
-                    if len(full_text) > 300:
-                        return full_text[:10000], pub_date
-
-        body_text = soup.get_text(separator='\n\n', strip=True)
-        body_text = re.sub(r'\n{3,}', '\n\n', body_text)
-        return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
-
-    except requests.exceptions.Timeout:
-        return "[WARNING] Article fetch timeout - using snippet instead", None
-    except requests.exceptions.RequestException:
-        return "[ERROR] Could not fetch article: Network error", None
-    except Exception as e:
-        return f"[ERROR] Could not fetch article: {str(e)}", None
-
-def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
-    """Search for articles in specific timeframe"""
-
-    # Define search queries based on timeframe
-    if timeframe == "recent":
-        # Recent articles (news, updates, current events)
-        search_queries = [
-            f'"{name}" founder news 2024 2025',
-            f'"{name}" CEO founder recent',
-            f'"{name}" founder update latest'
-        ]
-    else: # historical
-        # Historical articles (founding, establishment, origin stories)
-        search_queries = [
-            f'"{name}" founded established history',
-            f'"{name}" founder origin story',
-            f'"{name}" started began founder',
-            f'"{name}" founder early days'
-        ]
-
-    all_results = []
-    max_retries = 2
-    base_delay = 3
-
-    for query_idx, search_query in enumerate(search_queries):
-        if len(all_results) >= max_articles:
-            break
-
-        for attempt in range(max_retries):
-            try:
-                print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
-                time.sleep(base_delay * (attempt + 1))
-
-                configs = [
-                    {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
-                    {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
-                ]
-
-                config = configs[min(attempt, len(configs)-1)]
-
-                with DDGS(timeout=config['timeout']) as ddgs:
-                    search_params = {
-                        'keywords': search_query,
-                        'max_results': max_articles - len(all_results) + 2, # Get a few extra to filter
-                        'safesearch': config['safesearch']
-                    }
-                    if config['region']:
-                        search_params['region'] = config['region']
-
-                    results = list(ddgs.text(**search_params))
-                    print(f"Found {len(results)} results for query {query_idx + 1}")
-
-                    if results:
-                        # Add unique results (avoid duplicates)
-                        existing_urls = {r.get('url', '') for r in all_results}
-                        for result in results:
-                            if len(all_results) >= max_articles:
-                                break
-                            url = result.get('href', '')
-                            if url and url not in existing_urls:
-                                all_results.append(result)
-                                existing_urls.add(url)
-                        break
-
-            except Exception as e:
-                print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
-                if attempt < max_retries - 1:
-                    time.sleep(base_delay * (attempt + 2))
-
-    return all_results[:max_articles]
-
-def categorize_article_by_date(pub_date):
-    """Categorize article as recent or historical based on publication date"""
-    if not pub_date:
-        return "unknown"
-
-    one_year_ago = datetime.now() - timedelta(days=365)
-
-    if pub_date >= one_year_ago:
-        return "recent"
-    else:
-        return "historical"
-
-def search_articles(name: str, max_articles: int = 4) -> str:
-    """Enhanced search that ensures both recent and historical articles"""
-
-    # Split articles between recent and historical
-    recent_count = max_articles // 2
-    historical_count = max_articles - recent_count
-
-    print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
-
-    # Search for recent articles
-    recent_results = search_articles_by_timeframe(name, "recent", recent_count)
-    time.sleep(2) # Brief pause between timeframe searches
-
-    # Search for historical articles
-    historical_results = search_articles_by_timeframe(name, "historical", historical_count)
-
-    # Combine and process all results
-    all_results = []
-
-    # Process recent articles
-    for result in recent_results:
-        result['expected_timeframe'] = 'recent'
-        all_results.append(result)
-
-    # Process historical articles
-    for result in historical_results:
-        result['expected_timeframe'] = 'historical'
-        all_results.append(result)
-
-    if not all_results:
-        return f"[INFO] No articles found for {name}"
-
-    # Fetch and categorize articles
-    articles = []
-    recent_found = 0
-    historical_found = 0
-
-    for i, result in enumerate(all_results, 1):
-        url = result.get('href', 'No URL')
-        title = result.get('title', 'No Title')
-        snippet = result.get('body', 'No snippet available')
-        expected_timeframe = result.get('expected_timeframe', 'unknown')
-
-        if i > 1:
-            time.sleep(2)
-
-        full_text, pub_date = get_full_article(url)
-        actual_timeframe = categorize_article_by_date(pub_date)
-
-        # Count articles by actual timeframe
-        if actual_timeframe == "recent":
-            recent_found += 1
-        elif actual_timeframe == "historical":
-            historical_found += 1
-
-        if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
-            print(f"Using snippet fallback for article {i}")
-            content = f"[SNIPPET ONLY]\n{snippet}"
-        else:
-            content = full_text
-
-        # Create timeframe indicator
-        timeframe_indicator = ""
-        if pub_date:
-            date_str = pub_date.strftime("%B %d, %Y")
-            timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
-        else:
-            timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
-
-        article = f"### {i}. {title}\n"
-        article += f"[Source]({url})\n"
-        article += f"{timeframe_indicator}\n\n"
-        article += f"{content}\n"
-        articles.append(article)
-
-    # Add summary of coverage
-    summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
-
-    return summary + "\n---\n".join(articles)
-
-def extract_entities(search_results: str, company_name: str) -> str:
-    """Extract entities using Claude 4"""
-    MAX_CHARS = 15000
-    if len(search_results) > MAX_CHARS:
-        trunc = search_results[:MAX_CHARS]
-        last_period = trunc.rfind('. ')
-        search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
-
-    prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
-Only include founders who are explicitly mentioned as founders of {company_name}.
-Ignore founders of other companies that may be mentioned in the text.
-
-Return a JSON object with the following structure:
-{{
-  "founders": [
-    {{
-      "name": "Founder Name",
-      "evidence": ["brief quote or context where they were mentioned as founder"]
-    }}
-  ]
-}}
-
-Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
-
-You have to examine every article available in the search results below.
-
-Text:
-{search_results}"""
-
-    try:
-        message = client.messages.create(
-            model="claude-sonnet-4-20250514",
-            max_tokens=1500,
-            temperature=0.1,
-            messages=[
-                {
-                    "role": "user",
-                    "content": prompt
-                }
-            ]
-        )
-        return message.content[0].text
-
-    except Exception as e:
-        return f"[ERROR] Extraction failed: {str(e)}"
-
-# === Gradio interface functions ===
-
-def search_only(name: str, article_count: int):
-    if not name.strip():
-        return "No name provided", ""
-
-    try:
-        start = time.time()
-        articles_output = search_articles(name.strip(), max_articles=article_count)
-        elapsed = time.time() - start
-
-        results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
-        results += articles_output
-
-        return results, articles_output
-    except Exception as e:
-        return f"[ERROR] Search failed: {str(e)}", ""
-
-def extract_only(stored_results: str, company_name: str):
-    if not stored_results.strip():
-        return "No search results available. Please search first."
-
-    if not company_name.strip():
-        return "No company name provided. Please search first."
-
-    try:
-        start = time.time()
-        entities = extract_entities(stored_results, company_name.strip())
-        elapsed = time.time() - start
-
-        # Try to format JSON for better readability
-        try:
-            parsed = json.loads(entities)
-            formatted = json.dumps(parsed, indent=2)
-            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
-        except:
-            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
-    except Exception as e:
-        return f"[ERROR] Extraction failed: {str(e)}"
-
-# === Gradio UI ===
-
-with gr.Blocks(title="Enhanced Founder Finder") as demo:
-    gr.Markdown("# 🔎 Enhanced Founder Finder")
-    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
-    gr.Markdown("*🚀 **New**: Automatically searches for both recent news AND historical founding information*")
-    gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")
-
-    search_state = gr.State("")
-
-    with gr.Row():
-        name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
-        article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
-        with gr.Column():
-            search_btn = gr.Button("🔍 Enhanced Temporal Search", variant="primary")
-            extract_btn = gr.Button("📊 Extract Founder Intelligence", variant="secondary")
-
-    output1 = gr.Markdown(label="Search Results with Temporal Analysis")
-    output2 = gr.Textbox(
-        label="Founder Intelligence Report",
-        lines=15,
-        max_lines=25,
-        show_copy_button=True
-    )
-
-    search_btn.click(
-        fn=search_only,
-        inputs=[name_input, article_count_slider],
-        outputs=[output1, search_state]
-    )
-
-    extract_btn.click(
-        fn=extract_only,
-        inputs=[search_state, name_input],
-        outputs=[output2]
-    )
-
-if __name__ == "__main__":
-    demo.launch()
-
-'''