Update app.py
app.py
CHANGED
@@ -21,18 +21,16 @@ client = anthropic.Anthropic(
-    """Reset the cancellation flag"""
-    """Check if operation should be cancelled"""
-# === Model Functions ===
-        #

@@ -56,20 +54,21 @@ def extract_publication_date(soup, url):
-    #
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
-        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):

@@ -77,221 +76,237 @@ def get_full_article_with_timeout(url, timeout=15):
-                text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
-        body_text = re.sub(r'\n{3,}', '\n\n', body_text)
-    except requests.exceptions.
-    except Exception as e:
-    all_results, max_retries, total_queries = [], 2, len(search_queries)
-            progress((query_idx / total_queries) * 0.3, desc=f"Searching {timeframe}
-                break  # Break from retry loop on success
-            except Exception as e:
-                print(f"DDGS Search Attempt {attempt + 1} failed for '{search_query}': {e}")
-                if attempt < max_retries - 1: time.sleep(1)
-    return all_results[:max_articles]
-    if check_cancellation(): return None
-        progress(0.4 + (article_num / total_articles) * 0.5, desc=f"Fetching
-    full_text, pub_date = get_full_article_with_timeout(url
-    content = f"[SNIPPET ONLY]\n{result.get('body', 'No snippet')}" if
-    article = f"### {article_num
-    return {'article': article, 'timeframe': actual_timeframe, 'url': url
-def search_articles_enhanced(name: str, max_articles: int = 4, progress=None) -> str:
-    if progress: progress(0.05, desc=f"Planning search: {recent_count} recent + {historical_count} historical")
-    recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress)
-    if check_cancellation(): return "[CANCELLED]
-    time.sleep(1)
-    historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress)
-    if check_cancellation(): return "[CANCELLED]
-        r['expected_timeframe'] = 'recent'
-        all_results.append(r)
-    for r in historical_results:
-        r['expected_timeframe'] = 'historical'
-        all_results.append(r)
-        if progress: progress(1.0, desc="Completed - no results found.")
-    if progress: progress(0.4, desc=f"Found {len(
-    with ThreadPoolExecutor(max_workers=min(3, len(
-        future_to_article = {executor.submit(fetch_article_parallel, r, i, len(
-            if check_cancellation():
-                for f in future_to_article: f.cancel()
-                return "[CANCELLED] Search was cancelled."
-    if check_cancellation(): return "[CANCELLED] Search was cancelled."
-    if progress: progress(0.95, desc="Formatting results...")
-    summary = f"**Search Summary**: Found {len(articles)} articles
-    if progress: progress(1.0, desc=f"Search completed! Found {len(articles)} articles.")
-        search_results = search_results[:search_results.rfind('.
-    prompt = f"""
-        message = client.messages.create(
-        if progress: progress(1.0, desc="Analysis failed")
-# === Gradio Interface (No changes needed here) ===
-def search_only_enhanced(name: str, article_count: int, progress=gr.Progress()):
-    if not name or not name.strip(): return "❌ No name provided", ""
-    try:
-        start = time.time()
-        articles_output = search_articles_enhanced(name.strip(), int(article_count), progress=progress)
-        if "[CANCELLED]" in articles_output: return "🛑 Search was cancelled by user.", ""
-        elapsed = time.time() - start
-        results = f"✅ **Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n{articles_output}"
-        return results, articles_output
-    except Exception as e:
-        return f"❌ **Search failed unexpectedly**: {e}", ""
-    gr.Markdown("Enter a business name to find its founders using
-        with gr.Column(scale=1): article_count_slider = gr.Slider(2, 10, value=4, step=2, label="
-        output1 = gr.Markdown(label="Search Results & Temporal Analysis")
-        output2 = gr.Markdown(label="Founder Intelligence Report")

@@ -21,18 +21,16 @@ client = anthropic.Anthropic(
cancel_operation = threading.Event()

def reset_cancellation():
    cancel_operation.clear()

def check_cancellation():
    return cancel_operation.is_set()

# === Model Functions (Hardened for Stability) ===

def extract_publication_date(soup, url):
    try:
        # This function is generally safe, no changes needed.
        date_selectors = [
            'time[datetime]', '.date', '.publish-date', '.published', '.post-date',
            '[class*="date"]', '[class*="time"]',

@@ -56,20 +54,21 @@ def extract_publication_date(soup, url):
        print(f"Date extraction error for {url}: {e}")
        return None

def get_full_article_with_timeout(url, timeout=15):
    # This function is generally safe, no changes needed.
    if check_cancellation(): return "[CANCELLED] Operation was cancelled", None
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'
        }
        response = requests.get(url, headers=headers, timeout=timeout, verify=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        pub_date = extract_publication_date(soup, url)
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
            element.decompose()
        article_selectors = [
            'article', '.article-content', '.post-content', '.story-body', '.story-content', '.entry-content', '.content-body',
            '.article-body', 'main article', 'main .content', 'main', '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'

@@ -77,221 +76,237 @@ def get_full_article_with_timeout(url, timeout=15):
        for selector in article_selectors:
            content = soup.select_one(selector)
            if content:
                text_parts = [p.get_text(strip=True) for p in content.find_all(['p', 'div'], string=True) if len(p.get_text(strip=True)) > 30]
                if text_parts:
                    full_text = '\n\n'.join(text_parts)
                    if len(full_text) > 300: return full_text[:10000], pub_date
        body_text = soup.get_text(separator='\n\n', strip=True)
        return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
    except requests.exceptions.RequestException as e:
        return f"[ERROR] Network error for {url}: {e}", None
    except Exception as e:
        return f"[ERROR] Could not fetch article {url}: {e}", None

def search_articles_by_timeframe_enhanced(name: str, timeframe: str, max_articles: int = 3, progress=None) -> list:
    if check_cancellation(): return []
    queries = {
        "recent": [f'"{name}" founder news 2024 2025', f'"{name}" CEO founder recent'],
        "historical": [f'"{name}" founded established history', f'"{name}" founder origin story']
    }
    search_queries = queries.get(timeframe, [])

    all_results, total_queries = [], len(search_queries)
    for query_idx, search_query in enumerate(search_queries):
        if len(all_results) >= max_articles or check_cancellation(): break
        if progress:
            progress((query_idx / total_queries) * 0.3, desc=f"Searching {timeframe} ({query_idx+1}/{total_queries})")

        try:
            with DDGS(timeout=15) as ddgs:
                # Use safe '.text()' which is designed to handle errors gracefully
                results = ddgs.text(keywords=search_query, max_results=max_articles - len(all_results) + 2, safesearch='moderate', region='us-en')

                # Defensively check if results is a list-like object
                if results:
                    existing_urls = {r.get('href', '') for r in all_results}
                    for result in results:
                        # Ensure result is a dictionary and has the 'href' key
                        if isinstance(result, dict) and result.get('href') and result.get('href') not in existing_urls:
                            all_results.append(result)
                            if len(all_results) >= max_articles: break
        except Exception as e:
            print(f"DDGS Search failed for '{search_query}': {e}")

    return all_results

def categorize_article_by_date(pub_date):
    if not pub_date: return "unknown"
    return "recent" if pub_date >= (datetime.now() - timedelta(days=365)) else "historical"

def fetch_article_parallel(result, article_num, total_articles, progress=None):
    if check_cancellation() or not isinstance(result, dict): return None

    url = result.get('href', 'No URL')
    title = result.get('title', 'No Title')
    if progress:
        progress(0.4 + (article_num / total_articles) * 0.5, desc=f"Fetching {article_num+1}/{total_articles}: {title[:50]}...")

    full_text, pub_date = get_full_article_with_timeout(url)
    if check_cancellation(): return None

    actual_timeframe = categorize_article_by_date(pub_date)
    content = f"[SNIPPET ONLY]\n{result.get('body', 'No snippet')}" if "[ERROR]" in str(full_text) or "[WARNING]" in str(full_text) else full_text
    timeframe_indicator = f"📅 **Published**: {pub_date.strftime('%B %d, %Y')} ({actual_timeframe.title()})" if pub_date else f"📅 **Timeframe**: {result.get('expected_timeframe', 'unknown').title()} (estimated)"

    article = f"### {article_num+1}. {title}\n[Source]({url})\n{timeframe_indicator}\n\n{content}\n"
    return {'article': article, 'timeframe': actual_timeframe, 'url': url}

def search_articles_enhanced(name: str, max_articles: int, progress=None) -> str:
    reset_cancellation()
    if progress: progress(0, desc="Initializing...")

    recent_count, historical_count = max_articles // 2, max_articles - (max_articles // 2)

    recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress)
    if check_cancellation(): return "[CANCELLED]"

    time.sleep(1)  # Brief pause

    historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress)
    if check_cancellation(): return "[CANCELLED]"

    all_source_results = []
    for r in recent_results: r['expected_timeframe'] = 'recent'; all_source_results.append(r)
    for r in historical_results: r['expected_timeframe'] = 'historical'; all_source_results.append(r)

    if not all_source_results:
        return f"[INFO] No articles found for '{name}'."

    if progress: progress(0.4, desc=f"Found {len(all_source_results)} articles, fetching content...")

    articles, recent_found, historical_found = [], 0, 0
    with ThreadPoolExecutor(max_workers=min(3, len(all_source_results))) as executor:
        future_to_article = {executor.submit(fetch_article_parallel, r, i, len(all_source_results), progress): r for i, r in enumerate(all_source_results)}
        for future in as_completed(future_to_article):
            if check_cancellation(): return "[CANCELLED]"
            try:
                result_data = future.result(timeout=20)
                if result_data:
                    articles.append(result_data)
                    if result_data.get('timeframe') == "recent": recent_found += 1
                    elif result_data.get('timeframe') == "historical": historical_found += 1
            except Exception as e:
                print(f"Error processing article future: {e}")

    if not articles:
        return f"[ERROR] Could not fetch content for any of the {len(all_source_results)} articles found."

    if progress: progress(0.95, desc="Formatting results...")

    url_to_index = {res.get('href'): i for i, res in enumerate(all_source_results)}
    articles.sort(key=lambda x: url_to_index.get(x.get('url'), 999))

    summary = f"**Search Summary**: Found content for {len(articles)} articles ({recent_found} recent, {historical_found} historical)\n\n"
    article_texts = [a.get('article', '[Content Missing]') for a in articles]
    return summary + "\n---\n".join(article_texts)

def extract_entities_enhanced(search_results: str, company_name: str, progress=None) -> str:
    """BULLETPROOF VERSION: This function is now hardened against the IndexError."""
    if progress: progress(0, desc="Preparing text...")
    MAX_CHARS = 15000
    if len(search_results) > MAX_CHARS:
        search_results = search_results[:search_results.rfind('.', 0, MAX_CHARS) + 1]

    prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
Return a JSON object with the following structure: {{"founders": [{{"name": "Founder Name", "evidence": ["brief quote"]}}]}}
Respond only with valid JSON.
Text:
{search_results}"""

    try:
        if progress: progress(0.5, desc="Analyzing with AI...")
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1500, temperature=0.1,
            messages=[{"role": "user", "content": prompt}]
        )

        # FIX: THE DEFINITIVE FIX FOR THE 'list index out of range' ERROR
        # Check if the response is valid and has content before accessing it.
        if message and isinstance(message.content, list) and len(message.content) > 0:
            # Safely access the first text block
            first_block = message.content[0]
            if hasattr(first_block, 'text'):
                result = first_block.text
                if progress: progress(1.0, desc="Analysis completed!")
                return result

        # If the checks above fail, we land here.
        if progress: progress(1.0, desc="Analysis failed.")
        print(f"API Error: Received invalid response from Anthropic. Response: {message}")
        return json.dumps({"error": "API returned an invalid or empty response.", "details": "The model may have refused to answer due to content filters or an internal error."})

    except Exception as e:
        if progress: progress(1.0, desc="Analysis failed.")
        print(f"Extraction Exception: {e}")
        return json.dumps({"error": "An exception occurred during AI extraction.", "details": str(e)})

# === Gradio Interface Functions and UI ===
# This section has been simplified to use the more robust functions above.

def search_and_extract_flow(name: str, article_count: int, progress=gr.Progress()):
    """A single, robust function for the entire search and extract workflow."""
    if not name or not name.strip():
        return "❌ Please provide a company name.", "", "Ready."

    try:
        # Step 1: Search for articles
        start_time = time.time()
        articles_output = search_articles_enhanced(name.strip(), int(article_count), progress=progress)

        if "[CANCELLED]" in articles_output:
            return "🛑 Search cancelled.", "", "Cancelled."
        if "[ERROR]" in articles_output or "[INFO]" in articles_output:
            return articles_output, "", "Search completed with info/error."

        search_results_md = f"✅ **Search** completed in {time.time() - start_time:.1f}s\n\n{articles_output}"

        # Step 2: Extract entities from the results
        progress(0, desc="Starting extraction...")
        extraction_start_time = time.time()
        entities_json = extract_entities_enhanced(articles_output, name.strip(), progress)

        # Step 3: Format the JSON for display
        try:
            parsed = json.loads(entities_json)
            formatted = json.dumps(parsed, indent=2)
            extraction_results_md = f"✅ **Extraction** completed in {time.time() - extraction_start_time:.1f}s\n\n```json\n{formatted}\n```"
        except (json.JSONDecodeError, TypeError):
            extraction_results_md = f"⚠️ **Extraction Warning**: Model did not return valid JSON.\n\n{entities_json}"

        status = f"Completed in {time.time() - start_time:.1f}s"
        return search_results_md, extraction_results_md, status

    except Exception as e:
        # This is a final catch-all for any truly unexpected errors.
        error_message = f"❌ **An unexpected error occurred**: {e}"
        print(error_message)  # Log to console for debugging
        return error_message, "", "Failed."

def cancel_flow():
    cancel_operation.set()
    return "🛑 Cancellation requested..."

with gr.Blocks(title="Enhanced Founder Finder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🔍 Enhanced Founder Finder")
    gr.Markdown("Enter a business name to find its founders. The process involves searching for articles and then using AI to extract founder names.")

    with gr.Row():
        with gr.Column(scale=2): name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'Tesla', 'SpaceX'")
        with gr.Column(scale=1): article_count_slider = gr.Slider(2, 10, value=4, step=2, label="Articles to Search")

    with gr.Row():
        run_btn = gr.Button("🔍 Find Founders", variant="primary")
        cancel_btn = gr.Button("🛑 Cancel", variant="stop")

    status_output = gr.Markdown("Ready...")

    with gr.Tab("Founder Intelligence Report"):
        output_extract = gr.Markdown(label="Extracted Founder Information")
    with gr.Tab("Raw Search Results"):
        output_search = gr.Markdown(label="Article Search & Temporal Analysis")

    run_event = run_btn.click(
        fn=search_and_extract_flow,
        inputs=[name_input, article_count_slider],
        outputs=[output_search, output_extract, status_output]
    )
    cancel_btn.click(fn=cancel_flow, inputs=None, outputs=status_output, cancels=[run_event])

    gr.Examples(
        examples=[["OpenAI", 4], ["SpaceX", 6], ["Microsoft", 4], ["Anthropic", 4]],
        inputs=[name_input, article_count_slider],
    )

demo.queue()