Update app.py
app.py CHANGED
@@ -6,6 +6,412 @@ from duckduckgo_search import DDGS
 from bs4 import BeautifulSoup
 import anthropic
 import os
+from datetime import datetime, timedelta
+from dateutil import parser
+import json
+
+# Initialize Anthropic client
+client = anthropic.Anthropic(
+    api_key=os.getenv("ANTHROPIC_API_KEY")  # Set as secret in HF Space settings
+)
+
+# === Model functions ===
+
+def extract_publication_date(soup, url):
+    """Extract publication date from article HTML"""
+    try:
+        # Common date selectors
+        date_selectors = [
+            'time[datetime]',
+            '.date', '.publish-date', '.published', '.post-date',
+            '[class*="date"]', '[class*="time"]',
+            'meta[property="article:published_time"]',
+            'meta[name="publishdate"]',
+            'meta[name="date"]'
+        ]
+
+        for selector in date_selectors:
+            element = soup.select_one(selector)
+            if element:
+                date_text = element.get('datetime') or element.get('content') or element.get_text()
+                if date_text:
+                    try:
+                        return parser.parse(date_text)
+                    except Exception:
+                        continue
+
+        # Look for date patterns in text
+        date_patterns = [
+            r'(\w+ \d{1,2}, \d{4})',     # January 15, 2023
+            r'(\d{1,2}/\d{1,2}/\d{4})',  # 01/15/2023
+            r'(\d{4}-\d{2}-\d{2})'       # 2023-01-15
+        ]
+
+        text = soup.get_text()[:2000]  # First 2000 chars
+        for pattern in date_patterns:
+            matches = re.findall(pattern, text)
+            if matches:
+                try:
+                    return parser.parse(matches[0])
+                except Exception:
+                    continue
+
+    except Exception as e:
+        print(f"Date extraction error for {url}: {e}")
+
+    return None
+
+def get_full_article(url):
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
+        }
+
+        response = requests.get(url, headers=headers, timeout=20, verify=True)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Extract publication date
+        pub_date = extract_publication_date(soup, url)
+
+        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
+            element.decompose()
+
+        article_selectors = [
+            'article', '.article-content', '.post-content', '.story-body', '.story-content',
+            '.entry-content', '.content-body', '.article-body', 'main article', 'main .content', 'main',
+            '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
+        ]
+
+        for selector in article_selectors:
+            content = soup.select_one(selector)
+            if content:
+                paragraphs = content.find_all(['p', 'div'], string=True)
+                if paragraphs:
+                    text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
+                    full_text = '\n\n'.join(text_parts)
+                    if len(full_text) > 300:
+                        return full_text[:10000], pub_date
+
+        body_text = soup.get_text(separator='\n\n', strip=True)
+        body_text = re.sub(r'\n{3,}', '\n\n', body_text)
+        return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
+
+    except requests.exceptions.Timeout:
+        return "[WARNING] Article fetch timeout - using snippet instead", None
+    except requests.exceptions.RequestException:
+        return "[ERROR] Could not fetch article: Network error", None
+    except Exception as e:
+        return f"[ERROR] Could not fetch article: {str(e)}", None
+
+def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
+    """Search for articles in specific timeframe"""
+
+    # Define search queries based on timeframe
+    if timeframe == "recent":
+        # Recent articles (news, updates, current events)
+        search_queries = [
+            f'"{name}" founder news 2024 2025',
+            f'"{name}" CEO founder recent',
+            f'"{name}" founder update latest'
+        ]
+    else:  # historical
+        # Historical articles (founding, establishment, origin stories)
+        search_queries = [
+            f'"{name}" founded established history',
+            f'"{name}" founder origin story',
+            f'"{name}" started began founder',
+            f'"{name}" founder early days'
+        ]
+
+    all_results = []
+    max_retries = 2
+    base_delay = 3
+
+    for query_idx, search_query in enumerate(search_queries):
+        if len(all_results) >= max_articles:
+            break
+
+        for attempt in range(max_retries):
+            try:
+                print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
+                time.sleep(base_delay * (attempt + 1))
+
+                configs = [
+                    {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
+                    {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
+                ]
+
+                config = configs[min(attempt, len(configs) - 1)]
+
+                with DDGS(timeout=config['timeout']) as ddgs:
+                    search_params = {
+                        'keywords': search_query,
+                        'max_results': max_articles - len(all_results) + 2,  # Get a few extra to filter
+                        'safesearch': config['safesearch']
+                    }
+                    if config['region']:
+                        search_params['region'] = config['region']
+
+                    results = list(ddgs.text(**search_params))
+                    print(f"Found {len(results)} results for query {query_idx + 1}")
+
+                    if results:
+                        # Add unique results (avoid duplicates); DDGS results carry the URL under 'href'
+                        existing_urls = {r.get('href', '') for r in all_results}
+                        for result in results:
+                            if len(all_results) >= max_articles:
+                                break
+                            url = result.get('href', '')
+                            if url and url not in existing_urls:
+                                all_results.append(result)
+                                existing_urls.add(url)
+                        break
+
+            except Exception as e:
+                print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
+                if attempt < max_retries - 1:
+                    time.sleep(base_delay * (attempt + 2))
+
+    return all_results[:max_articles]
+
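+# Pacing in search_articles_by_timeframe, worked through: attempt 1 sleeps
+# base_delay * 1 = 3s before its query and attempt 2 sleeps 6s; a failed
+# first attempt additionally waits base_delay * (0 + 2) = 6s before retrying.
+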
+def categorize_article_by_date(pub_date):
+    """Categorize article as recent or historical based on publication date"""
+    if not pub_date:
+        return "unknown"
+
+    # Drop timezone info so the comparison with datetime.now() never mixes
+    # naive and aware datetimes (which would raise TypeError)
+    if pub_date.tzinfo is not None:
+        pub_date = pub_date.replace(tzinfo=None)
+
+    one_year_ago = datetime.now() - timedelta(days=365)
+
+    if pub_date >= one_year_ago:
+        return "recent"
+    else:
+        return "historical"
+
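+# Worked example of this cutoff: run on 2025-06-01, an article dated
+# 2024-08-10 falls inside the 365-day window and is tagged "recent", one
+# dated 2019-03-02 is tagged "historical", and articles with no parsable
+# date remain "unknown".
+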
+def search_articles(name: str, max_articles: int = 4) -> str:
+    """Enhanced search that ensures both recent and historical articles"""
+
+    # Split articles between recent and historical
+    recent_count = max_articles // 2
+    historical_count = max_articles - recent_count
+
+    print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
+
+    # Search for recent articles
+    recent_results = search_articles_by_timeframe(name, "recent", recent_count)
+    time.sleep(2)  # Brief pause between timeframe searches
+
+    # Search for historical articles
+    historical_results = search_articles_by_timeframe(name, "historical", historical_count)
+
+    # Combine and process all results
+    all_results = []
+
+    # Process recent articles
+    for result in recent_results:
+        result['expected_timeframe'] = 'recent'
+        all_results.append(result)
+
+    # Process historical articles
+    for result in historical_results:
+        result['expected_timeframe'] = 'historical'
+        all_results.append(result)
+
+    if not all_results:
+        return f"[INFO] No articles found for {name}"
+
+    # Fetch and categorize articles
+    articles = []
+    recent_found = 0
+    historical_found = 0
+
+    for i, result in enumerate(all_results, 1):
+        url = result.get('href', 'No URL')
+        title = result.get('title', 'No Title')
+        snippet = result.get('body', 'No snippet available')
+        expected_timeframe = result.get('expected_timeframe', 'unknown')
+
+        if i > 1:
+            time.sleep(2)
+
+        full_text, pub_date = get_full_article(url)
+        actual_timeframe = categorize_article_by_date(pub_date)
+
+        # Count articles by actual timeframe
+        if actual_timeframe == "recent":
+            recent_found += 1
+        elif actual_timeframe == "historical":
+            historical_found += 1
+
+        if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
+            print(f"Using snippet fallback for article {i}")
+            content = f"[SNIPPET ONLY]\n{snippet}"
+        else:
+            content = full_text
+
+        # Create timeframe indicator
+        timeframe_indicator = ""
+        if pub_date:
+            date_str = pub_date.strftime("%B %d, %Y")
+            timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
+        else:
+            timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
+
+        article = f"### {i}. {title}\n"
+        article += f"[Source]({url})\n"
+        article += f"{timeframe_indicator}\n\n"
+        article += f"{content}\n"
+        articles.append(article)
+
+    # Add summary of coverage
+    summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
+
+    return summary + "\n---\n".join(articles)
+
+def extract_entities(search_results: str, company_name: str) -> str:
+    """Extract entities using Claude 4"""
+    MAX_CHARS = 12000  # Increased to handle more content
+    if len(search_results) > MAX_CHARS:
+        trunc = search_results[:MAX_CHARS]
+        last_period = trunc.rfind('. ')
+        search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
+
+    prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
+Only include founders who are explicitly mentioned as founders of {company_name}.
+Ignore founders of other companies that may be mentioned in the text.
+
+Also identify the temporal context for each founder mention (recent news vs historical founding information).
+
+Return a JSON object with the following structure:
+{{
+    "founders": [
+        {{
+            "name": "Founder Name",
+            "type": "person" or "organization",
+            "context": "recent" or "historical" or "both",
+            "evidence": ["brief quote or context where they were mentioned as founder"]
+        }}
+    ],
+    "founding_timeline": {{
+        "founding_date": "date if mentioned",
+        "key_events": ["important founding milestones mentioned"]
+    }},
+    "confidence": "high/medium/low based on clarity of founder information"
+}}
+
+Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
+
+Text:
+{search_results}"""
+
+    try:
+        message = client.messages.create(
+            model="claude-sonnet-4-20250514",
+            max_tokens=1500,
+            temperature=0.1,
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ]
+        )
+        return message.content[0].text
+
+    except Exception as e:
+        return f"[ERROR] Extraction failed: {str(e)}"
+
+# === Gradio interface functions ===
+
+def search_only(name: str, article_count: int):
+    if not name.strip():
+        return "No name provided", ""
+
+    try:
+        start = time.time()
+        articles_output = search_articles(name.strip(), max_articles=article_count)
+        elapsed = time.time() - start
+
+        results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
+        results += articles_output
+
+        return results, articles_output
+    except Exception as e:
+        return f"[ERROR] Search failed: {str(e)}", ""
+
+def extract_only(stored_results: str, company_name: str):
+    if not stored_results.strip():
+        return "No search results available. Please search first."
+
+    if not company_name.strip():
+        return "No company name provided. Please search first."
+
+    try:
+        start = time.time()
+        entities = extract_entities(stored_results, company_name.strip())
+        elapsed = time.time() - start
+
+        # Try to format JSON for better readability
+        try:
+            parsed = json.loads(entities)
+            formatted = json.dumps(parsed, indent=2)
+            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
+        except json.JSONDecodeError:
+            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
+    except Exception as e:
+        return f"[ERROR] Extraction failed: {str(e)}"
+
+# === Gradio UI ===
+
+with gr.Blocks(title="Enhanced Founder Finder") as demo:
+    gr.Markdown("# 🔍 Enhanced Founder Finder")
+    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
+    gr.Markdown("*🕐 **New**: Automatically searches for both recent news AND historical founding information*")
+    gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")
+
+    search_state = gr.State("")
+
+    with gr.Row():
+        name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
+        article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
+        with gr.Column():
+            search_btn = gr.Button("🔍 Enhanced Temporal Search", variant="primary")
+            extract_btn = gr.Button("📊 Extract Founder Intelligence", variant="secondary")
+
+    output1 = gr.Markdown(label="Search Results with Temporal Analysis")
+    output2 = gr.Textbox(
+        label="Founder Intelligence Report",
+        lines=15,
+        max_lines=25,
+        show_copy_button=True
+    )
+
+    search_btn.click(
+        fn=search_only,
+        inputs=[name_input, article_count_slider],
+        outputs=[output1, search_state]
+    )
+
+    extract_btn.click(
+        fn=extract_only,
+        inputs=[search_state, name_input],
+        outputs=[output2]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
+
+''' import gradio as gr
+import requests
+import time
+import re
+from duckduckgo_search import DDGS
+from bs4 import BeautifulSoup
+import anthropic
+import os
 
 # Initialize Anthropic client
 client = anthropic.Anthropic(
@@ -231,5 +637,5 @@ with gr.Blocks(title="Founder Finder") as demo:
 
 if __name__ == "__main__":
     demo.launch()
-
+'''
|
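For a quick local smoke test of the new pipeline outside the Space, a minimal sketch (assuming this file is saved as app.py, ANTHROPIC_API_KEY is set in the environment, and the anthropic, duckduckgo-search, beautifulsoup4, and python-dateutil packages are installed; "ExampleCo" is just an illustrative input):

import json
from app import search_articles, extract_entities

# Two-phase temporal search, then Claude-based founder extraction
articles = search_articles("ExampleCo", max_articles=4)
raw = extract_entities(articles, "ExampleCo")

# The prompt asks for bare JSON; fall back to the raw text if parsing
# fails, mirroring extract_only's behavior
try:
    print(json.dumps(json.loads(raw), indent=2))
except json.JSONDecodeError:
    print(raw)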