Update app.py
app.py (CHANGED)
The previous app.py (left side of the diff; removed lines prefixed with -):

@@ -1,188 +1,95 @@
-import gradio as gr
 import requests
-import time
 import re
 from duckduckgo_search import DDGS
-from bs4 import BeautifulSoup
 import anthropic
 import os
-from datetime import datetime, timedelta
-from dateutil import parser
 import json
-import threading
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-# Initialize Anthropic client
-client = anthropic.Anthropic(
-    api_key=os.getenv("ANTHROPIC_API_KEY")
-)
-
-# Global variable for cancellation
-cancel_operation = threading.Event()
 
-
-
 
-
-    return cancel_operation.is_set()
 
-
-
-def extract_publication_date(soup):
     """
-
-
     """
-
-
-    date_selectors = [
-        'time[datetime]', 'meta[property="article:published_time"]',
-        'meta[name="publishdate"]', 'meta[name="date"]'
-    ]
-    for selector in date_selectors:
-        element = soup.select_one(selector)
-        if element:
-            date_text = element.get('datetime') or element.get('content')
-            if date_text:
-                try: return parser.parse(date_text)
-                except (ValueError, TypeError): continue
-
-    # Fallback to text patterns
-    text_content = soup.get_text()[:2000]
-    date_patterns = [
-        r'(\w+ \d{1,2}, \d{4})',    # January 1, 2023
-        r'(\d{4}-\d{2}-\d{2})',     # 2023-01-01
-        r'(\d{1,2}/\d{1,2}/\d{4})'  # 01/01/2023
-    ]
-    for pattern in date_patterns:
-        matches = re.findall(pattern, text_content)
-        # THE CRITICAL FIX: Ensure 'matches' is not empty before accessing index 0.
-        if matches:
-            try:
-                return parser.parse(matches[0])
-            except (ValueError, TypeError):
-                continue
-    except Exception as e:
-        print(f"Error in date extraction: {e}")
-    return None
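The removed parser's key guard is the one flagged by the comment above: `re.findall` can return an empty list, so `matches[0]` must only be touched after checking. A minimal standalone sketch of that guard (the sample text is illustrative):

```python
import re
from dateutil import parser  # same dependency the removed code imported

text = "Published January 1, 2023 by staff."
matches = re.findall(r'(\w+ \d{1,2}, \d{4})', text)

# Guard before indexing: an empty findall() result would otherwise
# raise IndexError on matches[0].
pub_date = parser.parse(matches[0]) if matches else None
print(pub_date)  # 2023-01-01 00:00:00
```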
 
-
-
-
-
-
-
-
-
-        soup = BeautifulSoup(response.content, 'html.parser')
-        pub_date = extract_publication_date(soup)
 
-
-
-
-
-
-
-
-
-
-
-
-
 
-    except requests.exceptions.RequestException as e:
-        return f"[ERROR] Network error for {url}: {e}", None
     except Exception as e:
-        return f"
-
-
-
-    queries = {
-        "recent": [f'"{name}" founder news 2024 2025', f'"{name}" CEO founder recent'],
-        "historical": [f'"{name}" founded established history', f'"{name}" founder origin story']
-    }.get(timeframe, [])
-
-    all_results = []
-    for idx, query in enumerate(queries):
-        if len(all_results) >= max_articles: break
-        if progress: progress((idx / len(queries)) * 0.3, desc=f"Searching {timeframe} ({idx+1}/{len(queries)})")
-        try:
-            with DDGS(timeout=10) as ddgs:
-                results = ddgs.text(keywords=query, max_results=max_articles, safesearch='moderate')
-                if results:
-                    existing_urls = {r.get('href') for r in all_results}
-                    for res in results:
-                        if res.get('href') and res.get('href') not in existing_urls:
-                            all_results.append(res)
-        except Exception as e:
-            print(f"Search query '{query}' failed: {e}")
-    return all_results
-
-# === Core Workflow Functions ===
-
-def search_workflow(name: str, article_count: int, progress=gr.Progress()):
-    reset_cancellation()
-    progress(0, desc="Initializing search...")
 
-
-    historical_count = article_count - recent_count
 
-
-    if check_cancellation(): return "[CANCELLED]", ""
 
-
-    if check_cancellation(): return "[CANCELLED]", ""
 
-    all_sources = (recent_results or []) + (historical_results or [])
-    if not all_sources:
-        return "[INFO] No articles found.", ""
 
-
-
-    fetched_articles = []
-    with ThreadPoolExecutor(max_workers=3) as executor:
-        future_to_url = {executor.submit(get_full_article_with_timeout, src.get('href')): src for src in all_sources}
-        for i, future in enumerate(as_completed(future_to_url)):
-            if check_cancellation(): return "[CANCELLED]", ""
-            progress(0.4 + (i / len(all_sources)) * 0.55, desc=f"Fetching {i+1}/{len(all_sources)}")
-            try:
-                content, pub_date = future.result()
-                source = future_to_url[future]
-                fetched_articles.append({
-                    "title": source.get('title', 'No Title'),
-                    "url": source.get('href', 'No URL'),
-                    "content": content,
-                    "date": pub_date.strftime("%B %d, %Y") if pub_date else "Unknown Date"
-                })
-            except Exception as e:
-                print(f"Failed to fetch a result: {e}")
-
-    if not fetched_articles:
-        return "[ERROR] Could not fetch content for any articles.", ""
-
-    progress(0.95, desc="Formatting results...")
-
-    # Assemble the final markdown and raw text for the next step
-    markdown_output = ""
-    raw_text_for_ai = ""
-    for i, article in enumerate(fetched_articles):
-        markdown_output += f"### {i+1}. {article['title']}\n"
-        markdown_output += f"**Source**: [{article['url']}]({article['url']})\n"
-        markdown_output += f"**Date**: {article['date']}\n\n"
-        markdown_output += f"{article['content'][:800]}...\n\n---\n\n"  # Show a snippet
-
-        raw_text_for_ai += f"Article {i+1}:\nTitle: {article['title']}\nContent: {article['content']}\n\n"
-
-    return markdown_output, raw_text_for_ai
 
 def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
     if not raw_text or not raw_text.strip():
-        return "❌
 
-
-
-
 Do not add any text outside the JSON object.
 
 ARTICLES:
@@ -191,82 +98,82 @@ ARTICLES:
 ---
 """
     try:
-        progress(0.5, desc="
-        message =
             model="claude-sonnet-4-20250514",  # As requested
-            max_tokens=
             temperature=0.0,
             messages=[{"role": "user", "content": prompt}]
         )
-
-        #
-        if message and isinstance(message.content, list) and message.content:
             text_block = message.content[0]
             if hasattr(text_block, 'text'):
                 json_text = text_block.text
-
-
-
-
-
-
-
-
-
-
-
 
     except Exception as e:
         return f"❌ **An unexpected error occurred during extraction**: {e}"
 
-def cancel_flow():
-    cancel_operation.set()
-    return "🛑 Cancellation requested..."
 
-# === Gradio UI
 
 with gr.Blocks(title="Founder Finder", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🔎 Founder Finder")
-    gr.Markdown("A robust tool to find company founders
 
-    #
     search_results_for_ai = gr.State("")
 
     with gr.Row():
-
-
-
-    with gr.Row():
-        search_btn = gr.Button("1. 🔍 Search for Articles", variant="primary")
-        extract_btn = gr.Button("2. 📊 Extract Founders from Search", variant="secondary")
-        cancel_btn = gr.Button("🛑 Cancel", variant="stop")
 
-
 
     with gr.Tab("Founder Intelligence Report"):
-        output_extract = gr.Markdown(
-    with gr.Tab("
-        output_search = gr.Markdown(
 
-    #
-
         fn=search_workflow,
-        inputs=[name_input
-        outputs=[output_search, search_results_for_ai]
     )
 
-
         fn=extraction_workflow,
         inputs=[search_results_for_ai, name_input],
-        outputs=[output_extract]
     )
 
-    # Cancellation can stop either long process
-    cancel_btn.click(fn=cancel_flow, inputs=None, outputs=status_output, cancels=[search_event, extract_event])
-
     gr.Examples(
-        examples=[
-        inputs=[name_input
     )
 
 demo.queue()
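The cancellation machinery removed by this commit is only partly visible; the helper definitions around `cancel_operation` were truncated by the diff view. As a sketch of the `threading.Event` pattern it implemented (the bodies of `reset_cancellation` and `check_cancellation` are reconstructions from their call sites, not the original code):

```python
import threading

# Module-level flag shared by the workflows (shown verbatim in the removed code).
cancel_operation = threading.Event()

def reset_cancellation():
    # Reconstructed: clear the flag when a new run starts.
    cancel_operation.clear()

def check_cancellation():
    # The removed diff shows this return statement; the workflows poll it in their loops.
    return cancel_operation.is_set()

def cancel_flow():
    # Shown verbatim in the removed code: set the flag and report back to the UI.
    cancel_operation.set()
    return "🛑 Cancellation requested..."
```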
The updated app.py (right side of the diff; added lines prefixed with +):

+import gradio as gr
 import requests
 import re
 from duckduckgo_search import DDGS
 import anthropic
 import os
 import json
 
+# Initialize clients
+anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
 
+# === 1. Simplified Search Workflow ===
 
+def search_workflow(name: str, progress=gr.Progress()):
     """
+    A simple, sequential, and robust function to search for articles.
+    It fetches exactly 8 articles: 4 recent, 4 historical.
     """
+    if not name or not name.strip():
+        return "❌ Please enter a company name.", ""
 
+    progress(0, desc="Starting search...")
+
+    # Define search queries
+    recent_keywords = f'"{name}" founder news'
+    historical_keywords = f'"{name}" founder history origin'
+
+    all_articles_markdown = []
+    raw_text_for_ai = ""
 
+    try:
+        with DDGS(timeout=20) as ddgs:
+            # --- Fetch 4 Recent Articles (past year) ---
+            progress(0.1, desc="Searching for recent articles...")
+            # The 'timelimit="y"' parameter is a reliable way to get recent results.
+            recent_results = ddgs.text(keywords=recent_keywords, max_results=4, timelimit='y') or []
+
+            for i, res in enumerate(recent_results):
+                title = res.get('title', 'No Title')
+                url = res.get('href', '#')
+                body = res.get('body', 'No snippet available.')
+
+                # Format for display
+                markdown = f"### (Recent) {title}\n**Source**: [{url}]({url})\n\n{body}\n"
+                all_articles_markdown.append(markdown)
+
+                # Format for AI
+                raw_text_for_ai += f"Article (Recent):\nTitle: {title}\nContent: {body}\n\n"
+
+            # --- Fetch 4 Historical Articles ---
+            progress(0.5, desc="Searching for historical articles...")
+            historical_results = ddgs.text(keywords=historical_keywords, max_results=4) or []
+
+            for i, res in enumerate(historical_results):
+                title = res.get('title', 'No Title')
+                url = res.get('href', '#')
+                body = res.get('body', 'No snippet available.')
+
+                # Format for display
+                markdown = f"### (Historical) {title}\n**Source**: [{url}]({url})\n\n{body}\n"
+                all_articles_markdown.append(markdown)
+
+                # Format for AI
+                raw_text_for_ai += f"Article (Historical):\nTitle: {title}\nContent: {body}\n\n"
 
     except Exception as e:
+        return f"❌ An error occurred during search: {e}", ""
+
+    if not all_articles_markdown:
+        return "[INFO] No articles found for that company.", ""
 
+    progress(1.0, desc="Search complete!")
 
+    final_markdown = f"## Found {len(all_articles_markdown)} Articles\n\n" + "\n---\n".join(all_articles_markdown)
 
+    return final_markdown, raw_text_for_ai
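The two searches differ in a single argument: `timelimit='y'` restricts DuckDuckGo to results from the past year, while the historical query omits it. A minimal standalone sketch, assuming the `duckduckgo_search` package's documented `DDGS.text()` parameters (the query strings are examples):

```python
from duckduckgo_search import DDGS

with DDGS(timeout=20) as ddgs:
    # timelimit accepts 'd', 'w', 'm', or 'y' (past day/week/month/year).
    recent = ddgs.text(keywords='"OpenAI" founder news', max_results=4, timelimit='y') or []
    historical = ddgs.text(keywords='"OpenAI" founder history origin', max_results=4) or []

for res in recent + historical:
    # Each result is a dict with 'title', 'href', and 'body' keys.
    print(res.get('title'), '->', res.get('href'))
```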
+# === 2. Simplified Extraction Workflow ===
 
 def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
+    """
+    A simple and robust function to extract founders from text using the AI model.
+    """
     if not raw_text or not raw_text.strip():
+        return "❌ Please run a search first to get text to analyze."
+
+    progress(0, desc="Preparing prompt for AI...")
 
+    prompt = f"""From the provided article snippets about "{company_name}", extract the names of individuals explicitly identified as a founder.
+Return a single, valid JSON object with the structure: {{"founders": [{{"name": "Founder's Name", "evidence": "A brief quote or context."}}]}}
+If no founders are mentioned, return an empty list: {{"founders": []}}.
 Do not add any text outside the JSON object.
 
 ARTICLES:
[lines 96-97 unchanged, not shown]
 ---
 """
     try:
+        progress(0.5, desc="Sending request to AI model...")
+        message = anthropic_client.messages.create(
             model="claude-sonnet-4-20250514",  # As requested
+            max_tokens=1024,
             temperature=0.0,
             messages=[{"role": "user", "content": prompt}]
         )
+
+        # This robust check prevents the 'list index out of range' error.
+        if message and message.content and isinstance(message.content, list) and len(message.content) > 0:
             text_block = message.content[0]
             if hasattr(text_block, 'text'):
                 json_text = text_block.text
+
+                # Clean the response to find the JSON object
+                match = re.search(r'\{.*\}', json_text, re.DOTALL)
+                if match:
+                    clean_json = match.group(0)
+                    try:
+                        parsed_json = json.loads(clean_json)
+                        formatted_json = json.dumps(parsed_json, indent=2)
+                        progress(1.0, desc="Extraction complete!")
+                        return f"```json\n{formatted_json}\n```"
+                    except json.JSONDecodeError:
+                        return f"⚠️ **AI Warning**: The model returned malformed JSON.\n\n{clean_json}"
+                else:
+                    return f"⚠️ **AI Warning**: The model did not return a JSON object.\n\n{json_text}"
+
+        return "❌ **API Error**: The AI model returned an empty or invalid response."
 
     except Exception as e:
         return f"❌ **An unexpected error occurred during extraction**: {e}"
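The prompt pins the model to a strict JSON contract, and the `re.search(r'\{.*\}', ...)` step strips any stray prose around the object before parsing. Under that contract, a successful extraction parses to the shape below (values are illustrative, not real output):

```python
import json

# Illustrative only: the structure the prompt requests, with made-up values.
example = {
    "founders": [
        {"name": "Jane Doe", "evidence": "Jane Doe, who founded the company in 2015, ..."}
    ]
}

# extraction_workflow pretty-prints exactly this way before rendering it
# as a fenced block in the report tab.
print(json.dumps(example, indent=2))
```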
+# === 3. Simplified Gradio UI ===
 
 with gr.Blocks(title="Founder Finder", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🔎 Simple Founder Finder")
+    gr.Markdown("A simplified and robust tool to find company founders. **Step 1:** Search for articles. **Step 2:** Extract founders from the results.")
 
+    # Hidden state to pass text from search to extraction
     search_results_for_ai = gr.State("")
 
     with gr.Row():
+        name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'OpenAI', 'SpaceX'", scale=3)
+        search_btn = gr.Button("1. 🔍 Search for Articles", variant="primary", scale=1)
 
+    with gr.Row():
+        extract_btn = gr.Button("2. 📊 Extract Founders from Search Results", variant="secondary")
 
     with gr.Tab("Founder Intelligence Report"):
+        output_extract = gr.Markdown()
+    with gr.Tab("Search Results"):
+        output_search = gr.Markdown()
 
+    # --- Event Wiring ---
+
+    # Search button populates the search results tab and the hidden state
+    search_btn.click(
         fn=search_workflow,
+        inputs=[name_input],
+        outputs=[output_search, search_results_for_ai],
+        show_progress="full"
     )
 
+    # Extract button uses the hidden state to populate the extraction tab
+    extract_btn.click(
         fn=extraction_workflow,
         inputs=[search_results_for_ai, name_input],
+        outputs=[output_extract],
+        show_progress="full"
     )
 
     gr.Examples(
+        examples=["OpenAI", "Anthropic", "Mistral AI", "Hugging Face"],
+        inputs=[name_input],
     )
 
 demo.queue()
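The two-button flow hinges on `gr.State`: the search click writes the raw article text into it, and the extract click reads it back alongside the company name. Stripped to essentials, the hand-off pattern looks like this (a minimal sketch, not the app itself):

```python
import gradio as gr

def produce(query):
    # Returns a visible result plus a payload for the hidden state.
    return f"Results for **{query}**", f"raw text about {query}"

def consume(payload, query):
    return f"Extracted from stored text for {query}: {payload!r}"

with gr.Blocks() as sketch:
    hidden = gr.State("")  # invisible channel between the two steps
    box = gr.Textbox(label="Query")
    step1_out = gr.Markdown()
    step2_out = gr.Markdown()
    gr.Button("Step 1").click(produce, inputs=[box], outputs=[step1_out, hidden])
    gr.Button("Step 2").click(consume, inputs=[hidden, box], outputs=[step2_out])

sketch.queue()
```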