dygoo committed
Commit 41dc56e · verified · 1 Parent(s): 054936e

Update app.py

Files changed (1)
  1. app.py +158 -143
app.py CHANGED
@@ -21,18 +21,16 @@ client = anthropic.Anthropic(
  cancel_operation = threading.Event()

  def reset_cancellation():
- """Reset the cancellation flag"""
  cancel_operation.clear()

  def check_cancellation():
- """Check if operation should be cancelled"""
  return cancel_operation.is_set()

- # === Model Functions ===

  def extract_publication_date(soup, url):
  try:
- # ... (Function content is correct, keeping it for brevity) ...
  date_selectors = [
  'time[datetime]', '.date', '.publish-date', '.published', '.post-date',
  '[class*="date"]', '[class*="time"]',
@@ -56,20 +54,21 @@ def extract_publication_date(soup, url):
  print(f"Date extraction error for {url}: {e}")
  return None

-
  def get_full_article_with_timeout(url, timeout=15):
- # ... (Function content is correct, keeping it for brevity) ...
  if check_cancellation(): return "[CANCELLED] Operation was cancelled", None
  try:
  headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'
  }
  response = requests.get(url, headers=headers, timeout=timeout, verify=True)
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')
  pub_date = extract_publication_date(soup, url)
- for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']): element.decompose()
  article_selectors = [
  'article', '.article-content', '.post-content', '.story-body', '.story-content', '.entry-content', '.content-body',
  '.article-body', 'main article', 'main .content', 'main', '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
@@ -77,221 +76,237 @@ def get_full_article_with_timeout(url, timeout=15):
  for selector in article_selectors:
  content = soup.select_one(selector)
  if content:
- paragraphs = content.find_all(['p', 'div'], string=True)
- if paragraphs:
- text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
  full_text = '\n\n'.join(text_parts)
  if len(full_text) > 300: return full_text[:10000], pub_date
  body_text = soup.get_text(separator='\n\n', strip=True)
- body_text = re.sub(r'\n{3,}', '\n\n', body_text)
  return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
- except requests.exceptions.Timeout: return "[WARNING] Article fetch timeout - using snippet instead", None
- except requests.exceptions.RequestException as e: return f"[ERROR] Network error: {str(e)}", None
- except Exception as e: return f"[ERROR] Could not fetch article: {str(e)}", None
-

  def search_articles_by_timeframe_enhanced(name: str, timeframe: str, max_articles: int = 3, progress=None) -> list:
  if check_cancellation(): return []
- if timeframe == "recent":
- search_queries = [f'"{name}" founder news 2024 2025', f'"{name}" CEO founder recent', f'"{name}" founder update latest']
- else:
- search_queries = [f'"{name}" founded established history', f'"{name}" founder origin story', f'"{name}" started began founder']
-
- all_results, max_retries, total_queries = [], 2, len(search_queries)

  for query_idx, search_query in enumerate(search_queries):
  if len(all_results) >= max_articles or check_cancellation(): break
  if progress:
- progress((query_idx / total_queries) * 0.3, desc=f"Searching {timeframe} articles ({query_idx + 1}/{total_queries})")

- for attempt in range(max_retries):
- if check_cancellation(): return all_results
- try:
- with DDGS(timeout=15) as ddgs:
- # FIX: Directly get results and check if it's None.
- # This prevents crashes if the library returns None instead of an empty list.
- results = ddgs.text(keywords=search_query, max_results=max_articles - len(all_results) + 2, safesearch='moderate', region='us-en')
-
- if results:
- existing_urls = {r.get('url', '') for r in all_results}
- for result in results:
  if len(all_results) >= max_articles: break
- url = result.get('href') # Check for href directly
- if url and url not in existing_urls:
- all_results.append(result)
- existing_urls.add(url)
- break # Break from retry loop on success
- except Exception as e:
- print(f"DDGS Search Attempt {attempt + 1} failed for '{search_query}': {e}")
- if attempt < max_retries - 1: time.sleep(1)
-
- return all_results[:max_articles]
-

  def categorize_article_by_date(pub_date):
  if not pub_date: return "unknown"
  return "recent" if pub_date >= (datetime.now() - timedelta(days=365)) else "historical"

-
  def fetch_article_parallel(result, article_num, total_articles, progress=None):
- if check_cancellation(): return None
- url, title = result.get('href', 'No URL'), result.get('title', 'No Title')
  if progress:
- progress(0.4 + (article_num / total_articles) * 0.5, desc=f"Fetching article {article_num + 1}/{total_articles}: {title[:50]}...")

- full_text, pub_date = get_full_article_with_timeout(url, timeout=12)
  if check_cancellation(): return None

  actual_timeframe = categorize_article_by_date(pub_date)
- content = f"[SNIPPET ONLY]\n{result.get('body', 'No snippet')}" if any(e in str(full_text) for e in ["[ERROR]", "timeout", "[CANCELLED]"]) else full_text
  timeframe_indicator = f"📅 **Published**: {pub_date.strftime('%B %d, %Y')} ({actual_timeframe.title()})" if pub_date else f"📅 **Timeframe**: {result.get('expected_timeframe', 'unknown').title()} (estimated)"

- article = f"### {article_num + 1}. {title}\n[Source]({url})\n{timeframe_indicator}\n\n{content}\n"
- return {'article': article, 'timeframe': actual_timeframe, 'url': url, 'title': title}

-
- def search_articles_enhanced(name: str, max_articles: int = 4, progress=None) -> str:
  reset_cancellation()
  if progress: progress(0, desc="Initializing...")

  recent_count, historical_count = max_articles // 2, max_articles - (max_articles // 2)
- if progress: progress(0.05, desc=f"Planning search: {recent_count} recent + {historical_count} historical")

- recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress) or []
- if check_cancellation(): return "[CANCELLED] Search was cancelled."

- time.sleep(1)

- historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress) or []
- if check_cancellation(): return "[CANCELLED] Search was cancelled."

- all_results = []
- # FIX: This loop is now safe because recent_results is guaranteed to be a list.
- for r in recent_results:
- r['expected_timeframe'] = 'recent'
- all_results.append(r)
- for r in historical_results:
- r['expected_timeframe'] = 'historical'
- all_results.append(r)

- if not all_results:
- if progress: progress(1.0, desc="Completed - no results found.")
  return f"[INFO] No articles found for '{name}'."

- if progress: progress(0.4, desc=f"Found {len(all_results)} articles, now fetching...")

  articles, recent_found, historical_found = [], 0, 0
- with ThreadPoolExecutor(max_workers=min(3, len(all_results))) as executor:
- future_to_article = {executor.submit(fetch_article_parallel, r, i, len(all_results), progress): r for i, r in enumerate(all_results)}
  for future in as_completed(future_to_article):
- if check_cancellation():
- for f in future_to_article: f.cancel()
- return "[CANCELLED] Search was cancelled."
  try:
  result_data = future.result(timeout=20)
  if result_data:
  articles.append(result_data)
- if result_data['timeframe'] == "recent": recent_found += 1
- elif result_data['timeframe'] == "historical": historical_found += 1
  except Exception as e:
- print(f"Error fetching article result: {e}")
-
- if check_cancellation(): return "[CANCELLED] Search was cancelled."
- if progress: progress(0.95, desc="Formatting results...")

- # FIX: Replaced fragile sorting logic with a robust and efficient dictionary lookup.
- # This prevents any possibility of an IndexError or StopIteration during sorting.
- url_to_index = {res.get('href'): i for i, res in enumerate(all_results) if res.get('href')}
  articles.sort(key=lambda x: url_to_index.get(x.get('url'), 999))

- summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical\n\n"
- article_texts = [a['article'] for a in articles]
- if progress: progress(1.0, desc=f"Search completed! Found {len(articles)} articles.")
  return summary + "\n---\n".join(article_texts)

-
  def extract_entities_enhanced(search_results: str, company_name: str, progress=None) -> str:
- if progress: progress(0, desc="Preparing text for analysis...")
  MAX_CHARS = 15000
  if len(search_results) > MAX_CHARS:
- search_results = search_results[:search_results.rfind('. ', 0, MAX_CHARS) + 1]

- prompt = f"""...""" # Prompt is fine

  try:
- if progress: progress(0.5, desc="Sending request to AI model...")
- message = client.messages.create(model="claude-sonnet-4-20250514", max_tokens=1500, temperature=0.1, messages=[{"role": "user", "content": prompt}])

- # FIX: Robust check for API response content. Prevents IndexError.
- if not message.content or not isinstance(message.content, list):
- return json.dumps({"error": "API returned an invalid or empty response."})

- result = message.content[0].text
- if progress: progress(1.0, desc="Analysis completed!")
- return result

  except Exception as e:
- if progress: progress(1.0, desc="Analysis failed")
- return json.dumps({"error": "Extraction failed due to an exception", "details": str(e)})
-
-
- # === Gradio Interface (No changes needed here) ===
- def search_only_enhanced(name: str, article_count: int, progress=gr.Progress()):
- if not name or not name.strip(): return "❌ No name provided", ""
- try:
- start = time.time()
- articles_output = search_articles_enhanced(name.strip(), int(article_count), progress=progress)
- if "[CANCELLED]" in articles_output: return "🛑 Search was cancelled by user.", ""
- elapsed = time.time() - start
- results = f"✅ **Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n{articles_output}"
- return results, articles_output
- except Exception as e:
- return f"❌ **Search failed unexpectedly**: {e}", ""

- def extract_only_enhanced(stored_results: str, company_name: str, progress=gr.Progress()):
- if not stored_results or not stored_results.strip(): return "❌ No search results. Please search first."
- if not company_name or not company_name.strip(): return "❌ No company name provided for context."
- if "[CANCELLED]" in stored_results: return "❌ Cannot extract from cancelled results. Please search again."

  try:
- start = time.time()
- entities = extract_entities_enhanced(stored_results, company_name.strip(), progress=progress)
- elapsed = time.time() - start
  try:
- parsed = json.loads(entities)
  formatted = json.dumps(parsed, indent=2)
- return f"✅ **Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
  except (json.JSONDecodeError, TypeError):
- return f"⚠️ **Extraction Note**: Model did not return valid JSON. Completed in {elapsed:.1f}s\n\n{entities}"
  except Exception as e:
- return f"❌ **Extraction failed unexpectedly**: {e}"

- def cancel_search():
  cancel_operation.set()
  return "🛑 Cancellation requested..."

-
  with gr.Blocks(title="Enhanced Founder Finder", theme=gr.themes.Soft()) as demo:
  gr.Markdown("# 🔎 Enhanced Founder Finder")
- gr.Markdown("Enter a business name to find its founders using a temporal search strategy.")
- search_state = gr.State("")
  with gr.Row():
  with gr.Column(scale=2): name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'Tesla', 'SpaceX'")
- with gr.Column(scale=1): article_count_slider = gr.Slider(2, 10, value=4, step=2, label="Total Articles")
  with gr.Row():
- search_btn = gr.Button("🔍 Temporal Search", variant="primary")
  cancel_btn = gr.Button("🛑 Cancel", variant="stop")
- extract_btn = gr.Button("📊 Extract Founders", variant="secondary")
- with gr.Row(): status_output = gr.Markdown("Ready to search...")
- with gr.Row():
- output1 = gr.Markdown(label="Search Results & Temporal Analysis")
- output2 = gr.Markdown(label="Founder Intelligence Report")

- search_event = search_btn.click(fn=search_only_enhanced, inputs=[name_input, article_count_slider], outputs=[output1, search_state])
- cancel_btn.click(fn=cancel_search, inputs=None, outputs=status_output, cancels=[search_event])
- extract_btn.click(fn=extract_only_enhanced, inputs=[search_state, name_input], outputs=[output2])
- gr.Examples(examples=[["OpenAI", 4], ["SpaceX", 6], ["Microsoft", 4]], inputs=[name_input, article_count_slider])

  demo.queue()
 
  cancel_operation = threading.Event()

  def reset_cancellation():
  cancel_operation.clear()

  def check_cancellation():
  return cancel_operation.is_set()

+ # === Model Functions (Hardened for Stability) ===

  def extract_publication_date(soup, url):
  try:
+ # This function is generally safe, no changes needed.
  date_selectors = [
  'time[datetime]', '.date', '.publish-date', '.published', '.post-date',
  '[class*="date"]', '[class*="time"]',

  print(f"Date extraction error for {url}: {e}")
  return None

  def get_full_article_with_timeout(url, timeout=15):
+ # This function is generally safe, no changes needed.
  if check_cancellation(): return "[CANCELLED] Operation was cancelled", None
  try:
  headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'
  }
  response = requests.get(url, headers=headers, timeout=timeout, verify=True)
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')
  pub_date = extract_publication_date(soup, url)
+ for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
+ element.decompose()
  article_selectors = [
  'article', '.article-content', '.post-content', '.story-body', '.story-content', '.entry-content', '.content-body',
  '.article-body', 'main article', 'main .content', 'main', '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'

  for selector in article_selectors:
  content = soup.select_one(selector)
  if content:
+ text_parts = [p.get_text(strip=True) for p in content.find_all(['p', 'div'], string=True) if len(p.get_text(strip=True)) > 30]
+ if text_parts:
  full_text = '\n\n'.join(text_parts)
  if len(full_text) > 300: return full_text[:10000], pub_date
  body_text = soup.get_text(separator='\n\n', strip=True)
  return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
+ except requests.exceptions.RequestException as e:
+ return f"[ERROR] Network error for {url}: {e}", None
+ except Exception as e:
+ return f"[ERROR] Could not fetch article {url}: {e}", None

  def search_articles_by_timeframe_enhanced(name: str, timeframe: str, max_articles: int = 3, progress=None) -> list:
  if check_cancellation(): return []
+ queries = {
+ "recent": [f'"{name}" founder news 2024 2025', f'"{name}" CEO founder recent'],
+ "historical": [f'"{name}" founded established history', f'"{name}" founder origin story']
+ }
+ search_queries = queries.get(timeframe, [])

+ all_results, total_queries = [], len(search_queries)
  for query_idx, search_query in enumerate(search_queries):
  if len(all_results) >= max_articles or check_cancellation(): break
  if progress:
+ progress((query_idx / total_queries) * 0.3, desc=f"Searching {timeframe} ({query_idx+1}/{total_queries})")

+ try:
+ with DDGS(timeout=15) as ddgs:
+ # Use safe '.text()' which is designed to handle errors gracefully
+ results = ddgs.text(keywords=search_query, max_results=max_articles - len(all_results) + 2, safesearch='moderate', region='us-en')
+
+ # Defensively check if results is a list-like object
+ if results:
+ existing_urls = {r.get('href', '') for r in all_results}
+ for result in results:
+ # Ensure result is a dictionary and has the 'href' key
+ if isinstance(result, dict) and result.get('href') and result.get('href') not in existing_urls:
+ all_results.append(result)
  if len(all_results) >= max_articles: break
+ except Exception as e:
+ print(f"DDGS Search failed for '{search_query}': {e}")
+
+ return all_results

  def categorize_article_by_date(pub_date):
  if not pub_date: return "unknown"
  return "recent" if pub_date >= (datetime.now() - timedelta(days=365)) else "historical"

  def fetch_article_parallel(result, article_num, total_articles, progress=None):
+ if check_cancellation() or not isinstance(result, dict): return None
+
+ url = result.get('href', 'No URL')
+ title = result.get('title', 'No Title')
  if progress:
+ progress(0.4 + (article_num / total_articles) * 0.5, desc=f"Fetching {article_num+1}/{total_articles}: {title[:50]}...")

+ full_text, pub_date = get_full_article_with_timeout(url)
  if check_cancellation(): return None

  actual_timeframe = categorize_article_by_date(pub_date)
+ content = f"[SNIPPET ONLY]\n{result.get('body', 'No snippet')}" if "[ERROR]" in str(full_text) or "[WARNING]" in str(full_text) else full_text
  timeframe_indicator = f"📅 **Published**: {pub_date.strftime('%B %d, %Y')} ({actual_timeframe.title()})" if pub_date else f"📅 **Timeframe**: {result.get('expected_timeframe', 'unknown').title()} (estimated)"

+ article = f"### {article_num+1}. {title}\n[Source]({url})\n{timeframe_indicator}\n\n{content}\n"
+ return {'article': article, 'timeframe': actual_timeframe, 'url': url}

+ def search_articles_enhanced(name: str, max_articles: int, progress=None) -> str:
  reset_cancellation()
  if progress: progress(0, desc="Initializing...")

  recent_count, historical_count = max_articles // 2, max_articles - (max_articles // 2)

+ recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress)
+ if check_cancellation(): return "[CANCELLED]"

+ time.sleep(1) # Brief pause

+ historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress)
+ if check_cancellation(): return "[CANCELLED]"

+ all_source_results = []
+ for r in recent_results: r['expected_timeframe'] = 'recent'; all_source_results.append(r)
+ for r in historical_results: r['expected_timeframe'] = 'historical'; all_source_results.append(r)

+ if not all_source_results:
  return f"[INFO] No articles found for '{name}'."

+ if progress: progress(0.4, desc=f"Found {len(all_source_results)} articles, fetching content...")

  articles, recent_found, historical_found = [], 0, 0
+ with ThreadPoolExecutor(max_workers=min(3, len(all_source_results))) as executor:
+ future_to_article = {executor.submit(fetch_article_parallel, r, i, len(all_source_results), progress): r for i, r in enumerate(all_source_results)}
  for future in as_completed(future_to_article):
+ if check_cancellation(): return "[CANCELLED]"
  try:
  result_data = future.result(timeout=20)
  if result_data:
  articles.append(result_data)
+ if result_data.get('timeframe') == "recent": recent_found += 1
+ elif result_data.get('timeframe') == "historical": historical_found += 1
  except Exception as e:
+ print(f"Error processing article future: {e}")

+ if not articles:
+ return f"[ERROR] Could not fetch content for any of the {len(all_source_results)} articles found."
+
+ if progress: progress(0.95, desc="Formatting results...")
+
+ url_to_index = {res.get('href'): i for i, res in enumerate(all_source_results)}
  articles.sort(key=lambda x: url_to_index.get(x.get('url'), 999))

+ summary = f"**Search Summary**: Found content for {len(articles)} articles ({recent_found} recent, {historical_found} historical)\n\n"
+ article_texts = [a.get('article', '[Content Missing]') for a in articles]
  return summary + "\n---\n".join(article_texts)

  def extract_entities_enhanced(search_results: str, company_name: str, progress=None) -> str:
+ """BULLETPROOF VERSION: This function is now hardened against the IndexError."""
+ if progress: progress(0, desc="Preparing text...")
  MAX_CHARS = 15000
  if len(search_results) > MAX_CHARS:
+ search_results = search_results[:search_results.rfind('.', 0, MAX_CHARS) + 1]

+ prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
+ Return a JSON object with the following structure: {{"founders": [{{"name": "Founder Name", "evidence": ["brief quote"]}}]}}
+ Respond only with valid JSON.
+ Text:
+ {search_results}"""

  try:
+ if progress: progress(0.5, desc="Analyzing with AI...")
+ message = client.messages.create(
+ model="claude-sonnet-4-20250514",
+ max_tokens=1500, temperature=0.1,
+ messages=[{"role": "user", "content": prompt}]
+ )

+ # FIX: THE DEFINITIVE FIX FOR THE 'list index out of range' ERROR
+ # Check if the response is valid and has content before accessing it.
+ if message and isinstance(message.content, list) and len(message.content) > 0:
+ # Safely access the first text block
+ first_block = message.content[0]
+ if hasattr(first_block, 'text'):
+ result = first_block.text
+ if progress: progress(1.0, desc="Analysis completed!")
+ return result

+ # If the checks above fail, we land here.
+ if progress: progress(1.0, desc="Analysis failed.")
+ print(f"API Error: Received invalid response from Anthropic. Response: {message}")
+ return json.dumps({"error": "API returned an invalid or empty response.", "details": "The model may have refused to answer due to content filters or an internal error."})

  except Exception as e:
+ if progress: progress(1.0, desc="Analysis failed.")
+ print(f"Extraction Exception: {e}")
+ return json.dumps({"error": "An exception occurred during AI extraction.", "details": str(e)})

+ # === Gradio Interface Functions and UI ===
+ # This section has been simplified to use the more robust functions above.

+ def search_and_extract_flow(name: str, article_count: int, progress=gr.Progress()):
+ """A single, robust function for the entire search and extract workflow."""
+ if not name or not name.strip():
+ return "❌ Please provide a company name.", "", "Ready."

  try:
+ # Step 1: Search for articles
+ start_time = time.time()
+ articles_output = search_articles_enhanced(name.strip(), int(article_count), progress=progress)
+
+ if "[CANCELLED]" in articles_output:
+ return "🛑 Search cancelled.", "", "Cancelled."
+ if "[ERROR]" in articles_output or "[INFO]" in articles_output:
+ return articles_output, "", "Search completed with info/error."
+
+ search_results_md = f"✅ **Search** completed in {time.time() - start_time:.1f}s\n\n{articles_output}"
+
+ # Step 2: Extract entities from the results
+ progress(0, desc="Starting extraction...")
+ extraction_start_time = time.time()
+ entities_json = extract_entities_enhanced(articles_output, name.strip(), progress)
+
+ # Step 3: Format the JSON for display
  try:
+ parsed = json.loads(entities_json)
  formatted = json.dumps(parsed, indent=2)
+ extraction_results_md = f"✅ **Extraction** completed in {time.time() - extraction_start_time:.1f}s\n\n```json\n{formatted}\n```"
  except (json.JSONDecodeError, TypeError):
+ extraction_results_md = f"⚠️ **Extraction Warning**: Model did not return valid JSON.\n\n{entities_json}"
+
+ status = f"Completed in {time.time() - start_time:.1f}s"
+ return search_results_md, extraction_results_md, status
+
  except Exception as e:
+ # This is a final catch-all for any truly unexpected errors.
+ error_message = f"❌ **An unexpected error occurred**: {e}"
+ print(error_message) # Log to console for debugging
+ return error_message, "", "Failed."

+ def cancel_flow():
  cancel_operation.set()
  return "🛑 Cancellation requested..."

  with gr.Blocks(title="Enhanced Founder Finder", theme=gr.themes.Soft()) as demo:
  gr.Markdown("# 🔎 Enhanced Founder Finder")
+ gr.Markdown("Enter a business name to find its founders. The process involves searching for articles and then using AI to extract founder names.")
+
  with gr.Row():
  with gr.Column(scale=2): name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'Tesla', 'SpaceX'")
+ with gr.Column(scale=1): article_count_slider = gr.Slider(2, 10, value=4, step=2, label="Articles to Search")
+
  with gr.Row():
+ run_btn = gr.Button("🔍 Find Founders", variant="primary")
  cancel_btn = gr.Button("🛑 Cancel", variant="stop")
+
+ status_output = gr.Markdown("Ready...")

+ with gr.Tab("Founder Intelligence Report"):
+ output_extract = gr.Markdown(label="Extracted Founder Information")
+ with gr.Tab("Raw Search Results"):
+ output_search = gr.Markdown(label="Article Search & Temporal Analysis")
+
+ run_event = run_btn.click(
+ fn=search_and_extract_flow,
+ inputs=[name_input, article_count_slider],
+ outputs=[output_search, output_extract, status_output]
+ )
+ cancel_btn.click(fn=cancel_flow, inputs=None, outputs=status_output, cancels=[run_event])
+
+ gr.Examples(
+ examples=[["OpenAI", 4], ["SpaceX", 6], ["Microsoft", 4], ["Anthropic", 4]],
+ inputs=[name_input, article_count_slider],
+ )

  demo.queue()