Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -28,11 +28,11 @@ def check_cancellation():
|
|
28 |
"""Check if operation should be cancelled"""
|
29 |
return cancel_operation.is_set()
|
30 |
|
31 |
-
# === Model Functions
|
32 |
|
33 |
def extract_publication_date(soup, url):
|
34 |
-
"""Extract publication date from article HTML - same as before"""
|
35 |
try:
|
|
|
36 |
date_selectors = [
|
37 |
'time[datetime]', '.date', '.publish-date', '.published', '.post-date',
|
38 |
'[class*="date"]', '[class*="time"]',
|
@@ -56,26 +56,23 @@ def extract_publication_date(soup, url):
|
|
56 |
print(f"Date extraction error for {url}: {e}")
|
57 |
return None
|
58 |
|
|
|
59 |
def get_full_article_with_timeout(url, timeout=15):
|
60 |
-
|
61 |
if check_cancellation(): return "[CANCELLED] Operation was cancelled", None
|
62 |
try:
|
63 |
headers = {
|
64 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
|
65 |
-
'Accept': '
|
66 |
-
'Accept-Language': 'en-US,en;q=0.5',
|
67 |
-
'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'
|
68 |
}
|
69 |
response = requests.get(url, headers=headers, timeout=timeout, verify=True)
|
70 |
response.raise_for_status()
|
71 |
soup = BeautifulSoup(response.content, 'html.parser')
|
72 |
pub_date = extract_publication_date(soup, url)
|
73 |
-
for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
|
74 |
-
element.decompose()
|
75 |
article_selectors = [
|
76 |
-
'article', '.article-content', '.post-content', '.story-body', '.story-content', '.entry-content',
|
77 |
-
'.
|
78 |
-
'.main-content', '.page-content', '.text', '.article-text'
|
79 |
]
|
80 |
for selector in article_selectors:
|
81 |
content = soup.select_one(selector)
|
@@ -92,90 +89,104 @@ def get_full_article_with_timeout(url, timeout=15):
|
|
92 |
except requests.exceptions.RequestException as e: return f"[ERROR] Network error: {str(e)}", None
|
93 |
except Exception as e: return f"[ERROR] Could not fetch article: {str(e)}", None
|
94 |
|
|
|
95 |
def search_articles_by_timeframe_enhanced(name: str, timeframe: str, max_articles: int = 3, progress=None) -> list:
|
96 |
-
"""Enhanced search with progress tracking and better error handling"""
|
97 |
if check_cancellation(): return []
|
98 |
if timeframe == "recent":
|
99 |
search_queries = [f'"{name}" founder news 2024 2025', f'"{name}" CEO founder recent', f'"{name}" founder update latest']
|
100 |
else:
|
101 |
search_queries = [f'"{name}" founded established history', f'"{name}" founder origin story', f'"{name}" started began founder']
|
102 |
-
|
|
|
|
|
103 |
for query_idx, search_query in enumerate(search_queries):
|
104 |
if len(all_results) >= max_articles or check_cancellation(): break
|
105 |
if progress:
|
106 |
progress((query_idx / total_queries) * 0.3, desc=f"Searching {timeframe} articles ({query_idx + 1}/{total_queries})")
|
|
|
107 |
for attempt in range(max_retries):
|
108 |
if check_cancellation(): return all_results
|
109 |
try:
|
110 |
-
print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
|
111 |
-
if attempt > 0: time.sleep(base_delay * attempt)
|
112 |
with DDGS(timeout=15) as ddgs:
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
121 |
except Exception as e:
|
122 |
-
print(f"Attempt {attempt + 1} failed for {
|
123 |
-
if attempt < max_retries - 1: time.sleep(
|
|
|
124 |
return all_results[:max_articles]
|
125 |
|
|
|
126 |
def categorize_article_by_date(pub_date):
    """Classify a publication date as "recent", "historical" or "unknown".

    Args:
        pub_date: A ``datetime`` (naive or timezone-aware), or a falsy value
            when no publication date is available.

    Returns:
        "unknown" when ``pub_date`` is falsy, "recent" when it falls within
        the last 365 days, otherwise "historical".
    """
    if not pub_date:
        return "unknown"
    # Build "now" with the same awareness as pub_date: comparing an aware
    # datetime with a naive one raises TypeError.
    if pub_date.utcoffset() is not None:
        now = datetime.now(pub_date.tzinfo)
    else:
        now = datetime.now()
    return "recent" if pub_date >= now - timedelta(days=365) else "historical"
|
130 |
|
|
|
131 |
def fetch_article_parallel(result, article_num, total_articles, progress=None):
|
132 |
-
"""Fetch single article with progress update"""
|
133 |
if check_cancellation(): return None
|
134 |
-
url, title
|
135 |
-
expected_timeframe = result.get('expected_timeframe', 'unknown')
|
136 |
if progress:
|
137 |
progress(0.4 + (article_num / total_articles) * 0.5, desc=f"Fetching article {article_num + 1}/{total_articles}: {title[:50]}...")
|
|
|
138 |
full_text, pub_date = get_full_article_with_timeout(url, timeout=12)
|
139 |
if check_cancellation(): return None
|
|
|
140 |
actual_timeframe = categorize_article_by_date(pub_date)
|
141 |
-
content = f"[SNIPPET ONLY]\n{snippet}" if any(e in str(full_text) for e in ["[ERROR]", "timeout", "[CANCELLED]"]) else full_text
|
142 |
-
timeframe_indicator = f"π
**Published**: {pub_date.strftime('%B %d, %Y')} ({actual_timeframe.title()})" if pub_date else f"π
**Timeframe**: {expected_timeframe.title()} (estimated)"
|
|
|
143 |
article = f"### {article_num + 1}. {title}\n[Source]({url})\n{timeframe_indicator}\n\n{content}\n"
|
144 |
return {'article': article, 'timeframe': actual_timeframe, 'url': url, 'title': title}
|
145 |
|
|
|
146 |
def search_articles_enhanced(name: str, max_articles: int = 4, progress=None) -> str:
|
147 |
-
"""Enhanced search with progress tracking and parallel processing"""
|
148 |
reset_cancellation()
|
149 |
-
if progress: progress(0, desc="Initializing
|
|
|
150 |
recent_count, historical_count = max_articles // 2, max_articles - (max_articles // 2)
|
151 |
if progress: progress(0.05, desc=f"Planning search: {recent_count} recent + {historical_count} historical")
|
152 |
|
153 |
-
recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress)
|
154 |
-
if check_cancellation(): return "[CANCELLED] Search was cancelled
|
155 |
|
156 |
-
if progress: progress(0.3, desc="Searching for historical articles...")
|
157 |
time.sleep(1)
|
158 |
|
159 |
-
historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress)
|
160 |
-
if check_cancellation(): return "[CANCELLED] Search was cancelled
|
161 |
-
|
162 |
all_results = []
|
163 |
-
|
164 |
-
for r in
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
166 |
if not all_results:
|
167 |
-
if progress: progress(1.0, desc="
|
168 |
-
return f"[INFO] No articles found for {name}"
|
169 |
-
|
170 |
-
if progress: progress(0.4, desc=f"Found {len(all_results)} articles, now fetching
|
171 |
|
172 |
articles, recent_found, historical_found = [], 0, 0
|
173 |
with ThreadPoolExecutor(max_workers=min(3, len(all_results))) as executor:
|
174 |
-
|
175 |
-
for future in as_completed(
|
176 |
if check_cancellation():
|
177 |
-
for f in
|
178 |
-
return "[CANCELLED] Search was cancelled
|
179 |
try:
|
180 |
result_data = future.result(timeout=20)
|
181 |
if result_data:
|
@@ -185,138 +196,107 @@ def search_articles_enhanced(name: str, max_articles: int = 4, progress=None) ->
|
|
185 |
except Exception as e:
|
186 |
print(f"Error fetching article result: {e}")
|
187 |
|
188 |
-
if check_cancellation(): return "[CANCELLED] Search was cancelled
|
189 |
if progress: progress(0.95, desc="Formatting results...")
|
190 |
-
|
191 |
# FIX: Replaced fragile sorting logic with a robust and efficient dictionary lookup.
|
|
|
192 |
url_to_index = {res.get('href'): i for i, res in enumerate(all_results) if res.get('href')}
|
193 |
articles.sort(key=lambda x: url_to_index.get(x.get('url'), 999))
|
194 |
|
195 |
summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical\n\n"
|
196 |
-
article_texts = [
|
197 |
-
if progress: progress(1.0, desc=f"Search completed! Found {len(articles)} articles")
|
198 |
return summary + "\n---\n".join(article_texts)
|
199 |
|
|
|
200 |
def extract_entities_enhanced(search_results: str, company_name: str, progress=None) -> str:
|
201 |
-
"""Enhanced entity extraction with progress tracking"""
|
202 |
if progress: progress(0, desc="Preparing text for analysis...")
|
203 |
MAX_CHARS = 15000
|
204 |
if len(search_results) > MAX_CHARS:
|
205 |
search_results = search_results[:search_results.rfind('. ', 0, MAX_CHARS) + 1]
|
206 |
|
207 |
-
|
208 |
-
|
209 |
-
Only include founders who are explicitly mentioned as founders of {company_name}.
|
210 |
-
Ignore founders of other companies that may be mentioned in the text.
|
211 |
-
Return a JSON object with the following structure: {{"founders": [{{"name": "Founder Name", "evidence": ["brief quote or context where they were mentioned as founder"]}}]}}
|
212 |
-
Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
|
213 |
-
You have to examine every article available in the search results below.
|
214 |
-
Text:
|
215 |
-
{search_results}"""
|
216 |
-
|
217 |
try:
|
218 |
if progress: progress(0.5, desc="Sending request to AI model...")
|
219 |
-
message = client.messages.create(
|
220 |
-
model="claude-sonnet-4-20250514",
|
221 |
-
max_tokens=1500, temperature=0.1,
|
222 |
-
messages=[{"role": "user", "content": prompt}]
|
223 |
-
)
|
224 |
|
225 |
-
|
|
|
|
|
226 |
|
227 |
-
# FIX: Check if the API returned any content before trying to access it.
|
228 |
-
# This prevents the "list index out of range" error.
|
229 |
-
if not message.content:
|
230 |
-
error_json = {"error": "API returned no content", "details": "The model may have refused to answer, or an API error occurred."}
|
231 |
-
return json.dumps(error_json)
|
232 |
-
|
233 |
result = message.content[0].text
|
234 |
-
|
235 |
if progress: progress(1.0, desc="Analysis completed!")
|
236 |
return result
|
237 |
|
238 |
except Exception as e:
|
239 |
if progress: progress(1.0, desc="Analysis failed")
|
240 |
-
|
241 |
-
return json.dumps(error_json)
|
242 |
|
243 |
-
# === Gradio Interface Functions (Unchanged) ===
|
244 |
|
|
|
245 |
def search_only_enhanced(name: str, article_count: int, progress=gr.Progress()):
|
246 |
-
if not name.strip(): return "β No name provided", ""
|
247 |
try:
|
248 |
start = time.time()
|
249 |
-
|
250 |
-
articles_output = search_articles_enhanced(name.strip(), max_articles=article_count, progress=progress)
|
251 |
if "[CANCELLED]" in articles_output: return "π Search was cancelled by user.", ""
|
252 |
elapsed = time.time() - start
|
253 |
-
|
254 |
-
results = f"β
**Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n{articles_output}"
|
255 |
return results, articles_output
|
256 |
except Exception as e:
|
257 |
-
|
258 |
-
|
259 |
|
260 |
def extract_only_enhanced(stored_results: str, company_name: str, progress=gr.Progress()):
|
261 |
-
if not stored_results.strip(): return "β No search results
|
262 |
-
if not company_name.strip(): return "β No company name provided
|
263 |
-
if "[CANCELLED]" in stored_results: return "β Cannot extract from cancelled
|
|
|
264 |
try:
|
265 |
start = time.time()
|
266 |
-
entities = extract_entities_enhanced(stored_results, company_name.strip(), progress)
|
267 |
elapsed = time.time() - start
|
268 |
try:
|
269 |
parsed = json.loads(entities)
|
270 |
formatted = json.dumps(parsed, indent=2)
|
271 |
-
return f"β
**
|
272 |
except (json.JSONDecodeError, TypeError):
|
273 |
return f"β οΈ **Extraction Note**: Model did not return valid JSON. Completed in {elapsed:.1f}s\n\n{entities}"
|
274 |
except Exception as e:
|
275 |
-
|
276 |
-
return f"β **Extraction failed**: {str(e)}"
|
277 |
|
278 |
def cancel_search():
|
279 |
cancel_operation.set()
|
280 |
-
return "π
|
281 |
|
282 |
-
# === Gradio UI (Unchanged) ===
|
283 |
|
284 |
with gr.Blocks(title="Enhanced Founder Finder", theme=gr.themes.Soft()) as demo:
|
285 |
gr.Markdown("# π Enhanced Founder Finder")
|
286 |
-
gr.Markdown("Enter a business
|
287 |
-
gr.Markdown("*π **New Features**: Progress bars, cancellation support, parallel processing, better error handling*")
|
288 |
-
gr.Markdown("*β±οΈ Note: Enhanced search typically takes 30β60 seconds with full progress visibility.*")
|
289 |
-
|
290 |
search_state = gr.State("")
|
291 |
-
|
292 |
with gr.Row():
|
293 |
-
with gr.Column(scale=2):
|
294 |
-
|
295 |
-
with gr.Column(scale=1):
|
296 |
-
article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles", info="Split between recent/historical")
|
297 |
-
|
298 |
with gr.Row():
|
299 |
-
search_btn = gr.Button("π
|
300 |
-
cancel_btn = gr.Button("π Cancel
|
301 |
-
extract_btn = gr.Button("π Extract
|
302 |
-
|
303 |
with gr.Row(): status_output = gr.Markdown("Ready to search...")
|
304 |
-
|
305 |
with gr.Row():
|
306 |
-
|
307 |
-
|
308 |
-
with gr.Column():
|
309 |
-
output2 = gr.Textbox(label="Founder Intelligence Report", lines=15, max_lines=25, show_copy_button=True)
|
310 |
|
311 |
-
search_event = search_btn.click(fn=search_only_enhanced, inputs=[name_input, article_count_slider], outputs=[output1, search_state]
|
312 |
-
cancel_btn.click(fn=cancel_search, inputs=None, outputs=
|
313 |
-
extract_btn.click(fn=extract_only_enhanced, inputs=[search_state, name_input], outputs=[output2]
|
314 |
-
gr.Examples(examples=[["
|
315 |
|
316 |
demo.queue()
|
317 |
|
318 |
if __name__ == "__main__":
|
319 |
-
demo.launch(
|
320 |
|
321 |
'''
|
322 |
import gradio as gr
|
|
|
28 |
"""Check if operation should be cancelled"""
|
29 |
return cancel_operation.is_set()
|
30 |
|
31 |
+
# === Model Functions ===
|
32 |
|
33 |
def extract_publication_date(soup, url):
|
|
|
34 |
try:
|
35 |
+
# ... (Function content is correct, keeping it for brevity) ...
|
36 |
date_selectors = [
|
37 |
'time[datetime]', '.date', '.publish-date', '.published', '.post-date',
|
38 |
'[class*="date"]', '[class*="time"]',
|
|
|
56 |
print(f"Date extraction error for {url}: {e}")
|
57 |
return None
|
58 |
|
59 |
+
|
60 |
def get_full_article_with_timeout(url, timeout=15):
|
61 |
+
# ... (Function content is correct, keeping it for brevity) ...
|
62 |
if check_cancellation(): return "[CANCELLED] Operation was cancelled", None
|
63 |
try:
|
64 |
headers = {
|
65 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
66 |
+
'Accept-Language': 'en-US,en;q=0.5', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'
|
|
|
|
|
67 |
}
|
68 |
response = requests.get(url, headers=headers, timeout=timeout, verify=True)
|
69 |
response.raise_for_status()
|
70 |
soup = BeautifulSoup(response.content, 'html.parser')
|
71 |
pub_date = extract_publication_date(soup, url)
|
72 |
+
for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']): element.decompose()
|
|
|
73 |
article_selectors = [
|
74 |
+
'article', '.article-content', '.post-content', '.story-body', '.story-content', '.entry-content', '.content-body',
|
75 |
+
'.article-body', 'main article', 'main .content', 'main', '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
|
|
|
76 |
]
|
77 |
for selector in article_selectors:
|
78 |
content = soup.select_one(selector)
|
|
|
89 |
except requests.exceptions.RequestException as e: return f"[ERROR] Network error: {str(e)}", None
|
90 |
except Exception as e: return f"[ERROR] Could not fetch article: {str(e)}", None
|
91 |
|
92 |
+
|
93 |
def search_articles_by_timeframe_enhanced(name: str, timeframe: str, max_articles: int = 3, progress=None) -> list:
    """Collect up to ``max_articles`` DuckDuckGo text results for one timeframe.

    Three query variants are tried in order until enough unique results have
    been gathered. Each query gets up to two attempts, with a one-second
    pause after a failed attempt.

    Args:
        name: Company name; it is quoted verbatim inside every query.
        timeframe: ``"recent"`` selects the news-oriented queries; any other
            value selects the history-oriented queries.
        max_articles: Maximum number of results to return.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            before each query, with ``fraction`` in the range 0-0.3.

    Returns:
        A list of result dicts as returned by ``DDGS.text``, de-duplicated on
        their ``'href'`` value. The list is empty or partial when the
        operation is cancelled.
    """
    if check_cancellation():
        return []

    if timeframe == "recent":
        search_queries = [
            f'"{name}" founder news 2024 2025',
            f'"{name}" CEO founder recent',
            f'"{name}" founder update latest',
        ]
    else:
        search_queries = [
            f'"{name}" founded established history',
            f'"{name}" founder origin story',
            f'"{name}" started began founder',
        ]

    all_results, max_retries, total_queries = [], 2, len(search_queries)
    # Results expose their link under 'href'. Tracking that key in one set
    # shared by all queries is what makes cross-query de-duplication work;
    # a set built from a 'url' key would never match anything.
    seen_urls = set()

    for query_idx, search_query in enumerate(search_queries):
        if len(all_results) >= max_articles or check_cancellation():
            break
        if progress:
            progress((query_idx / total_queries) * 0.3, desc=f"Searching {timeframe} articles ({query_idx + 1}/{total_queries})")

        for attempt in range(max_retries):
            if check_cancellation():
                return all_results
            try:
                with DDGS(timeout=15) as ddgs:
                    # Ask for a couple of extra hits to compensate for duplicates.
                    results = ddgs.text(keywords=search_query, max_results=max_articles - len(all_results) + 2, safesearch='moderate', region='us-en')
                    # `or []` guards against the library returning None.
                    for result in results or []:
                        if len(all_results) >= max_articles:
                            break
                        url = result.get('href')
                        if url and url not in seen_urls:
                            all_results.append(result)
                            seen_urls.add(url)
                # The request completed without raising: no retry needed.
                break
            except Exception as e:
                print(f"DDGS Search Attempt {attempt + 1} failed for '{search_query}': {e}")
                if attempt < max_retries - 1:
                    time.sleep(1)

    return all_results[:max_articles]
|
129 |
|
130 |
+
|
131 |
def categorize_article_by_date(pub_date):
    """Classify a publication date as "recent", "historical" or "unknown".

    Args:
        pub_date: A ``datetime`` (naive or timezone-aware), or a falsy value
            when no publication date is available.

    Returns:
        "unknown" when ``pub_date`` is falsy, "recent" when it falls within
        the last 365 days, otherwise "historical".
    """
    if not pub_date:
        return "unknown"
    # Build "now" with the same awareness as pub_date: comparing an aware
    # datetime with a naive one raises TypeError.
    if pub_date.utcoffset() is not None:
        now = datetime.now(pub_date.tzinfo)
    else:
        now = datetime.now()
    return "recent" if pub_date >= now - timedelta(days=365) else "historical"
|
134 |
|
135 |
+
|
136 |
def fetch_article_parallel(result, article_num, total_articles, progress=None):
    """Fetch one search result and format it as a Markdown article section.

    Args:
        result: Search-result dict; the keys read here are ``'href'``,
            ``'title'``, ``'body'`` and ``'expected_timeframe'``.
        article_num: Zero-based position of this article (shown as 1-based).
        total_articles: Total number of articles being fetched.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            with ``fraction`` in the range 0.4-0.9.

    Returns:
        ``None`` when the operation was cancelled, otherwise a dict with the
        keys ``'article'`` (Markdown text), ``'timeframe'``, ``'url'`` and
        ``'title'``.
    """
    if check_cancellation():
        return None

    url, title = result.get('href', 'No URL'), result.get('title', 'No Title')
    if progress:
        progress(0.4 + (article_num / total_articles) * 0.5, desc=f"Fetching article {article_num + 1}/{total_articles}: {title[:50]}...")

    full_text, pub_date = get_full_article_with_timeout(url, timeout=12)
    # Re-check: the fetch can take a while and the user may have cancelled.
    if check_cancellation():
        return None

    actual_timeframe = categorize_article_by_date(pub_date)

    # Fall back to the search snippet when the fetch reported a failure.
    # NOTE(review): this is a substring test, so an article whose body merely
    # contains the word "timeout" is also replaced by its snippet - confirm
    # that this is intended.
    fetch_failed = any(marker in str(full_text) for marker in ["[ERROR]", "timeout", "[CANCELLED]"])
    if fetch_failed:
        content = f"[SNIPPET ONLY]\n{result.get('body', 'No snippet')}"
    else:
        content = full_text

    if pub_date:
        timeframe_indicator = f"📅 **Published**: {pub_date.strftime('%B %d, %Y')} ({actual_timeframe.title()})"
    else:
        # No extracted date: show the timeframe the search query targeted.
        timeframe_indicator = f"📅 **Timeframe**: {result.get('expected_timeframe', 'unknown').title()} (estimated)"

    article = f"### {article_num + 1}. {title}\n[Source]({url})\n{timeframe_indicator}\n\n{content}\n"
    return {'article': article, 'timeframe': actual_timeframe, 'url': url, 'title': title}
|
151 |
|
152 |
+
|
153 |
def search_articles_enhanced(name: str, max_articles: int = 4, progress=None) -> str:
|
|
|
154 |
reset_cancellation()
|
155 |
+
if progress: progress(0, desc="Initializing...")
|
156 |
+
|
157 |
recent_count, historical_count = max_articles // 2, max_articles - (max_articles // 2)
|
158 |
if progress: progress(0.05, desc=f"Planning search: {recent_count} recent + {historical_count} historical")
|
159 |
|
160 |
+
recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress) or []
|
161 |
+
if check_cancellation(): return "[CANCELLED] Search was cancelled."
|
162 |
|
|
|
163 |
time.sleep(1)
|
164 |
|
165 |
+
historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress) or []
|
166 |
+
if check_cancellation(): return "[CANCELLED] Search was cancelled."
|
167 |
+
|
168 |
all_results = []
|
169 |
+
# FIX: This loop is now safe because recent_results is guaranteed to be a list.
|
170 |
+
for r in recent_results:
|
171 |
+
r['expected_timeframe'] = 'recent'
|
172 |
+
all_results.append(r)
|
173 |
+
for r in historical_results:
|
174 |
+
r['expected_timeframe'] = 'historical'
|
175 |
+
all_results.append(r)
|
176 |
+
|
177 |
if not all_results:
|
178 |
+
if progress: progress(1.0, desc="Completed - no results found.")
|
179 |
+
return f"[INFO] No articles found for '{name}'."
|
180 |
+
|
181 |
+
if progress: progress(0.4, desc=f"Found {len(all_results)} articles, now fetching...")
|
182 |
|
183 |
articles, recent_found, historical_found = [], 0, 0
|
184 |
with ThreadPoolExecutor(max_workers=min(3, len(all_results))) as executor:
|
185 |
+
future_to_article = {executor.submit(fetch_article_parallel, r, i, len(all_results), progress): r for i, r in enumerate(all_results)}
|
186 |
+
for future in as_completed(future_to_article):
|
187 |
if check_cancellation():
|
188 |
+
for f in future_to_article: f.cancel()
|
189 |
+
return "[CANCELLED] Search was cancelled."
|
190 |
try:
|
191 |
result_data = future.result(timeout=20)
|
192 |
if result_data:
|
|
|
196 |
except Exception as e:
|
197 |
print(f"Error fetching article result: {e}")
|
198 |
|
199 |
+
if check_cancellation(): return "[CANCELLED] Search was cancelled."
|
200 |
if progress: progress(0.95, desc="Formatting results...")
|
201 |
+
|
202 |
# FIX: Replaced fragile sorting logic with a robust and efficient dictionary lookup.
|
203 |
+
# This prevents any possibility of an IndexError or StopIteration during sorting.
|
204 |
url_to_index = {res.get('href'): i for i, res in enumerate(all_results) if res.get('href')}
|
205 |
articles.sort(key=lambda x: url_to_index.get(x.get('url'), 999))
|
206 |
|
207 |
summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical\n\n"
|
208 |
+
article_texts = [a['article'] for a in articles]
|
209 |
+
if progress: progress(1.0, desc=f"Search completed! Found {len(articles)} articles.")
|
210 |
return summary + "\n---\n".join(article_texts)
|
211 |
|
212 |
+
|
213 |
def extract_entities_enhanced(search_results: str, company_name: str, progress=None) -> str:
    """Ask the language model to extract the founders of ``company_name``.

    Args:
        search_results: Article text to analyse. Text longer than 15000
            characters is cut at the last sentence boundary before that limit.
        company_name: Company whose founders should be extracted.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``.

    Returns:
        The raw text of the model's first content block (expected to be
        JSON), or a JSON string with an ``"error"`` key when the response is
        empty or the request raises.
    """
    if progress:
        progress(0, desc="Preparing text for analysis...")

    MAX_CHARS = 15000
    if len(search_results) > MAX_CHARS:
        cut = search_results.rfind('. ', 0, MAX_CHARS)
        # rfind returns -1 when there is no sentence boundary; slicing with
        # -1 + 1 == 0 would discard the whole text, so hard-cut instead.
        search_results = search_results[:cut + 1] if cut != -1 else search_results[:MAX_CHARS]

    # The prompt must carry both the company name and the article text;
    # a placeholder prompt would give the model nothing to analyse.
    prompt = f"""Extract the founders of the company "{company_name}" from the text below.
Only include founders who are explicitly mentioned as founders of {company_name}.
Ignore founders of other companies that may be mentioned in the text.
Return a JSON object with the following structure: {{"founders": [{{"name": "Founder Name", "evidence": ["brief quote or context where they were mentioned as founder"]}}]}}
Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
You have to examine every article available in the search results below.
Text:
{search_results}"""

    try:
        if progress:
            progress(0.5, desc="Sending request to AI model...")
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1500,
            temperature=0.1,
            messages=[{"role": "user", "content": prompt}],
        )

        # Guard against an empty response before indexing into it.
        if not message.content or not isinstance(message.content, list):
            return json.dumps({"error": "API returned an invalid or empty response."})

        result = message.content[0].text
        if progress:
            progress(1.0, desc="Analysis completed!")
        return result

    except Exception as e:
        if progress:
            progress(1.0, desc="Analysis failed")
        return json.dumps({"error": "Extraction failed due to an exception", "details": str(e)})
|
|
|
236 |
|
|
|
237 |
|
238 |
+
# === Gradio Interface (No changes needed here) ===
|
239 |
def search_only_enhanced(name: str, article_count: int, progress=gr.Progress()):
    """Gradio handler: run the article search and report status plus results.

    Args:
        name: Company name typed by the user.
        article_count: Total number of articles requested; coerced to ``int``.
        progress: Gradio progress tracker passed through to the search.

    Returns:
        A ``(display_markdown, stored_results)`` tuple. ``stored_results`` is
        the empty string when no name was given, the search was cancelled, or
        an exception was raised.
    """
    # NOTE(review): the leading "β" / "π" glyphs in some status strings look
    # like mis-decoded emoji; they are kept as found - confirm the intended
    # characters against the deployed app.
    if not name or not name.strip():
        return "β No name provided", ""
    try:
        start = time.time()
        articles_output = search_articles_enhanced(name.strip(), int(article_count), progress=progress)
        if "[CANCELLED]" in articles_output:
            return "π Search was cancelled by user.", ""
        elapsed = time.time() - start
        results = f"✅ **Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n{articles_output}"
        return results, articles_output
    except Exception as e:
        # Top-level UI boundary: surface the failure instead of crashing.
        return f"β **Search failed unexpectedly**: {e}", ""
|
250 |
+
|
251 |
|
252 |
def extract_only_enhanced(stored_results: str, company_name: str, progress=gr.Progress()):
    """Gradio handler: extract founders from previously stored search results.

    Args:
        stored_results: Article text saved by the search step.
        company_name: Company the extraction should focus on.
        progress: Gradio progress tracker passed through to the extractor.

    Returns:
        A Markdown string: the pretty-printed JSON on success, or a message
        describing why nothing could be extracted.
    """
    # NOTE(review): the leading "β" glyph in some status strings looks like a
    # mis-decoded emoji; it is kept as found - confirm the intended character.
    if not stored_results or not stored_results.strip():
        return "β No search results. Please search first."
    if not company_name or not company_name.strip():
        return "β No company name provided for context."
    if "[CANCELLED]" in stored_results:
        return "β Cannot extract from cancelled results. Please search again."

    try:
        start = time.time()
        entities = extract_entities_enhanced(stored_results, company_name.strip(), progress=progress)
        elapsed = time.time() - start
        try:
            parsed = json.loads(entities)
        except (json.JSONDecodeError, TypeError):
            return f"⚠️ **Extraction Note**: Model did not return valid JSON. Completed in {elapsed:.1f}s\n\n{entities}"

        # The extractor reports its own failures as {"error": ...} JSON, which
        # parses cleanly; without this check a failure would be shown as a
        # successful extraction.
        if isinstance(parsed, dict) and "error" in parsed and "founders" not in parsed:
            details = parsed.get("details")
            reason = f"{parsed['error']}: {details}" if details else f"{parsed['error']}"
            return f"⚠️ **Extraction failed**: {reason} (after {elapsed:.1f}s)"

        formatted = json.dumps(parsed, indent=2)
        return f"✅ **Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
    except Exception as e:
        # Top-level UI boundary: surface the failure instead of crashing.
        return f"β **Extraction failed unexpectedly**: {e}"
|
|
|
269 |
|
270 |
def cancel_search():
    """Set the shared cancellation flag and return a status message for the UI."""
    # NOTE(review): the leading glyph of the message looks like a mis-decoded
    # emoji; it is reproduced as found.
    status_message = "π Cancellation requested..."
    cancel_operation.set()
    return status_message
|
273 |
|
|
|
274 |
|
275 |
# Gradio UI: one page with a search step and a separate extraction step.
# NOTE(review): the "π" glyphs in the labels look like mis-decoded emoji;
# they are reproduced as found.
with gr.Blocks(title="Enhanced Founder Finder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π Enhanced Founder Finder")
    gr.Markdown("Enter a business name to find its founders using a temporal search strategy.")

    # Holds the raw search output so the extract step can reuse it.
    search_state = gr.State("")

    # Inputs.
    with gr.Row():
        with gr.Column(scale=2):
            name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'Tesla', 'SpaceX'")
        with gr.Column(scale=1):
            article_count_slider = gr.Slider(2, 10, value=4, step=2, label="Total Articles")

    # Actions.
    with gr.Row():
        search_btn = gr.Button("π Temporal Search", variant="primary")
        cancel_btn = gr.Button("π Cancel", variant="stop")
        extract_btn = gr.Button("π Extract Founders", variant="secondary")

    with gr.Row():
        status_output = gr.Markdown("Ready to search...")

    # Outputs.
    with gr.Row():
        output1 = gr.Markdown(label="Search Results & Temporal Analysis")
        output2 = gr.Markdown(label="Founder Intelligence Report")

    # Wiring: the search event is kept so the cancel button can cancel it.
    search_event = search_btn.click(
        fn=search_only_enhanced,
        inputs=[name_input, article_count_slider],
        outputs=[output1, search_state],
    )
    cancel_btn.click(fn=cancel_search, inputs=None, outputs=status_output, cancels=[search_event])
    extract_btn.click(fn=extract_only_enhanced, inputs=[search_state, name_input], outputs=[output2])
    gr.Examples(
        examples=[["OpenAI", 4], ["SpaceX", 6], ["Microsoft", 4]],
        inputs=[name_input, article_count_slider],
    )

demo.queue()

if __name__ == "__main__":
    demo.launch(show_error=True)
|
300 |
|
301 |
'''
|
302 |
import gradio as gr
|