dygoo committed on
Commit 3238b9e · verified · 1 Parent(s): 65d6d08

Update app.py

Files changed (1)
  1. app.py +407 -1
app.py CHANGED
@@ -6,6 +6,412 @@ from duckduckgo_search import DDGS
  from bs4 import BeautifulSoup
  import anthropic
  import os
+ from datetime import datetime, timedelta
+ from dateutil import parser
+ import json
+
+ # Initialize Anthropic client
+ client = anthropic.Anthropic(
+     api_key=os.getenv("ANTHROPIC_API_KEY")  # Set as secret in HF Space settings
+ )
+
+ # === Model functions ===
+
+ def extract_publication_date(soup, url):
+     """Extract publication date from article HTML"""
+     try:
+         # Common date selectors
+         date_selectors = [
+             'time[datetime]',
+             '.date', '.publish-date', '.published', '.post-date',
+             '[class*="date"]', '[class*="time"]',
+             'meta[property="article:published_time"]',
+             'meta[name="publishdate"]',
+             'meta[name="date"]'
+         ]
+
+         for selector in date_selectors:
+             element = soup.select_one(selector)
+             if element:
+                 date_text = element.get('datetime') or element.get('content') or element.get_text()
+                 if date_text:
+                     try:
+                         return parser.parse(date_text)
+                     except:
+                         continue
+
+         # Look for date patterns in text
+         date_patterns = [
+             r'(\w+ \d{1,2}, \d{4})',     # January 15, 2023
+             r'(\d{1,2}/\d{1,2}/\d{4})',  # 01/15/2023
+             r'(\d{4}-\d{2}-\d{2})'       # 2023-01-15
+         ]
+
+         text = soup.get_text()[:2000]  # First 2000 chars
+         for pattern in date_patterns:
+             matches = re.findall(pattern, text)
+             if matches:
+                 try:
+                     return parser.parse(matches[0])
+                 except:
+                     continue
+
+     except Exception as e:
+         print(f"Date extraction error for {url}: {e}")
+
+     return None
+
+ def get_full_article(url):
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Connection': 'keep-alive',
+             'Upgrade-Insecure-Requests': '1'
+         }
+
+         response = requests.get(url, headers=headers, timeout=20, verify=True)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Extract publication date
+         pub_date = extract_publication_date(soup, url)
+
+         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
+             element.decompose()
+
+         article_selectors = [
+             'article', '.article-content', '.post-content', '.story-body', '.story-content',
+             '.entry-content', '.content-body', '.article-body', 'main article', 'main .content', 'main',
+             '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
+         ]
+
+         for selector in article_selectors:
+             content = soup.select_one(selector)
+             if content:
+                 paragraphs = content.find_all(['p', 'div'], string=True)
+                 if paragraphs:
+                     text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
+                     full_text = '\n\n'.join(text_parts)
+                     if len(full_text) > 300:
+                         return full_text[:10000], pub_date
+
+         body_text = soup.get_text(separator='\n\n', strip=True)
+         body_text = re.sub(r'\n{3,}', '\n\n', body_text)
+         return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
+
+     except requests.exceptions.Timeout:
+         return "[WARNING] Article fetch timeout - using snippet instead", None
+     except requests.exceptions.RequestException:
+         return "[ERROR] Could not fetch article: Network error", None
+     except Exception as e:
+         return f"[ERROR] Could not fetch article: {str(e)}", None
+
+ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
+     """Search for articles in specific timeframe"""
+
+     # Define search queries based on timeframe
+     if timeframe == "recent":
+         # Recent articles (news, updates, current events)
+         search_queries = [
+             f'"{name}" founder news 2024 2025',
+             f'"{name}" CEO founder recent',
+             f'"{name}" founder update latest'
+         ]
+     else:  # historical
+         # Historical articles (founding, establishment, origin stories)
+         search_queries = [
+             f'"{name}" founded established history',
+             f'"{name}" founder origin story',
+             f'"{name}" started began founder',
+             f'"{name}" founder early days'
+         ]
+
+     all_results = []
+     max_retries = 2
+     base_delay = 3
+
+     for query_idx, search_query in enumerate(search_queries):
+         if len(all_results) >= max_articles:
+             break
+
+         for attempt in range(max_retries):
+             try:
+                 print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
+                 time.sleep(base_delay * (attempt + 1))
+
+                 configs = [
+                     {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
+                     {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
+                 ]
+
+                 config = configs[min(attempt, len(configs)-1)]
+
+                 with DDGS(timeout=config['timeout']) as ddgs:
+                     search_params = {
+                         'keywords': search_query,
+                         'max_results': max_articles - len(all_results) + 2,  # Get a few extra to filter
+                         'safesearch': config['safesearch']
+                     }
+                     if config['region']:
+                         search_params['region'] = config['region']
+
+                     results = list(ddgs.text(**search_params))
+                     print(f"Found {len(results)} results for query {query_idx + 1}")
+
+                     if results:
+                         # Add unique results (avoid duplicates); DDGS results keep the URL under 'href'
+                         existing_urls = {r.get('href', '') for r in all_results}
+                         for result in results:
+                             if len(all_results) >= max_articles:
+                                 break
+                             url = result.get('href', '')
+                             if url and url not in existing_urls:
+                                 all_results.append(result)
+                                 existing_urls.add(url)
+                         break
+
+             except Exception as e:
+                 print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
+                 if attempt < max_retries - 1:
+                     time.sleep(base_delay * (attempt + 2))
+
+     return all_results[:max_articles]
+
+ def categorize_article_by_date(pub_date):
+     """Categorize article as recent or historical based on publication date"""
+     if not pub_date:
+         return "unknown"
+
+     one_year_ago = datetime.now() - timedelta(days=365)
+
+     if pub_date >= one_year_ago:
+         return "recent"
+     else:
+         return "historical"
+
+ def search_articles(name: str, max_articles: int = 4) -> str:
+     """Enhanced search that ensures both recent and historical articles"""
+
+     # Split articles between recent and historical
+     recent_count = max_articles // 2
+     historical_count = max_articles - recent_count
+
+     print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
+
+     # Search for recent articles
+     recent_results = search_articles_by_timeframe(name, "recent", recent_count)
+     time.sleep(2)  # Brief pause between timeframe searches
+
+     # Search for historical articles
+     historical_results = search_articles_by_timeframe(name, "historical", historical_count)
+
+     # Combine and process all results
+     all_results = []
+
+     # Process recent articles
+     for result in recent_results:
+         result['expected_timeframe'] = 'recent'
+         all_results.append(result)
+
+     # Process historical articles
+     for result in historical_results:
+         result['expected_timeframe'] = 'historical'
+         all_results.append(result)
+
+     if not all_results:
+         return f"[INFO] No articles found for {name}"
+
+     # Fetch and categorize articles
+     articles = []
+     recent_found = 0
+     historical_found = 0
+
+     for i, result in enumerate(all_results, 1):
+         url = result.get('href', 'No URL')
+         title = result.get('title', 'No Title')
+         snippet = result.get('body', 'No snippet available')
+         expected_timeframe = result.get('expected_timeframe', 'unknown')
+
+         if i > 1:
+             time.sleep(2)
+
+         full_text, pub_date = get_full_article(url)
+         actual_timeframe = categorize_article_by_date(pub_date)
+
+         # Count articles by actual timeframe
+         if actual_timeframe == "recent":
+             recent_found += 1
+         elif actual_timeframe == "historical":
+             historical_found += 1
+
+         if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
+             print(f"Using snippet fallback for article {i}")
+             content = f"[SNIPPET ONLY]\n{snippet}"
+         else:
+             content = full_text
+
+         # Create timeframe indicator
+         timeframe_indicator = ""
+         if pub_date:
+             date_str = pub_date.strftime("%B %d, %Y")
+             timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
+         else:
+             timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
+
+         article = f"### {i}. {title}\n"
+         article += f"[Source]({url})\n"
+         article += f"{timeframe_indicator}\n\n"
+         article += f"{content}\n"
+         articles.append(article)
+
+     # Add summary of coverage
+     summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
+
+     return summary + "\n---\n".join(articles)
+
+ def extract_entities(search_results: str, company_name: str) -> str:
+     """Extract entities using Claude 4"""
+     MAX_CHARS = 12000  # Increased to handle more content
+     if len(search_results) > MAX_CHARS:
+         trunc = search_results[:MAX_CHARS]
+         last_period = trunc.rfind('. ')
+         search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
+
+     prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
+ Only include founders who are explicitly mentioned as founders of {company_name}.
+ Ignore founders of other companies that may be mentioned in the text.
+
+ Also identify the temporal context for each founder mention (recent news vs historical founding information).
+
+ Return a JSON object with the following structure:
+ {{
+     "founders": [
+         {{
+             "name": "Founder Name",
+             "type": "person" or "organization",
+             "context": "recent" or "historical" or "both",
+             "evidence": ["brief quote or context where they were mentioned as founder"]
+         }}
+     ],
+     "founding_timeline": {{
+         "founding_date": "date if mentioned",
+         "key_events": ["important founding milestones mentioned"]
+     }},
+     "confidence": "high/medium/low based on clarity of founder information"
+ }}
+
+ Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
+
+ Text:
+ {search_results}"""
+
+     try:
+         message = client.messages.create(
+             model="claude-sonnet-4-20250514",
+             max_tokens=1500,
+             temperature=0.1,
+             messages=[
+                 {
+                     "role": "user",
+                     "content": prompt
+                 }
+             ]
+         )
+         return message.content[0].text
+
+     except Exception as e:
+         return f"[ERROR] Extraction failed: {str(e)}"
+
+ # === Gradio interface functions ===
+
+ def search_only(name: str, article_count: int):
+     if not name.strip():
+         return "No name provided", ""
+
+     try:
+         start = time.time()
+         articles_output = search_articles(name.strip(), max_articles=article_count)
+         elapsed = time.time() - start
+
+         results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
+         results += articles_output
+
+         return results, articles_output
+     except Exception as e:
+         return f"[ERROR] Search failed: {str(e)}", ""
+
+ def extract_only(stored_results: str, company_name: str):
+     if not stored_results.strip():
+         return "No search results available. Please search first."
+
+     if not company_name.strip():
+         return "No company name provided. Please search first."
+
+     try:
+         start = time.time()
+         entities = extract_entities(stored_results, company_name.strip())
+         elapsed = time.time() - start
+
+         # Try to format JSON for better readability
+         try:
+             parsed = json.loads(entities)
+             formatted = json.dumps(parsed, indent=2)
+             return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
+         except:
+             return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
+     except Exception as e:
+         return f"[ERROR] Extraction failed: {str(e)}"
+
+ # === Gradio UI ===
+
+ with gr.Blocks(title="Enhanced Founder Finder") as demo:
+     gr.Markdown("# 🔎 Enhanced Founder Finder")
+     gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
+     gr.Markdown("*🚀 **New**: Automatically searches for both recent news AND historical founding information*")
+     gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")
+
+     search_state = gr.State("")
+
+     with gr.Row():
+         name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
+         article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
+         with gr.Column():
+             search_btn = gr.Button("🔍 Enhanced Temporal Search", variant="primary")
+             extract_btn = gr.Button("📊 Extract Founder Intelligence", variant="secondary")
+
+     output1 = gr.Markdown(label="Search Results with Temporal Analysis")
+     output2 = gr.Textbox(
+         label="Founder Intelligence Report",
+         lines=15,
+         max_lines=25,
+         show_copy_button=True
+     )
+
+     search_btn.click(
+         fn=search_only,
+         inputs=[name_input, article_count_slider],
+         outputs=[output1, search_state]
+     )
+
+     extract_btn.click(
+         fn=extract_only,
+         inputs=[search_state, name_input],
+         outputs=[output2]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
+
+ ''' import gradio as gr
+ import requests
+ import time
+ import re
+ from duckduckgo_search import DDGS
+ from bs4 import BeautifulSoup
+ import anthropic
+ import os

  # Initialize Anthropic client
  client = anthropic.Anthropic(
@@ -231,5 +637,5 @@ with gr.Blocks(title="Founder Finder") as demo:

  if __name__ == "__main__":
      demo.launch()
-
+ '''
