dygoo committed
Commit cdb081c · verified · 1 Parent(s): 5aafe64

Update app.py

Files changed (1):
  1. app.py +114 -207

app.py CHANGED
@@ -1,188 +1,95 @@
- import gradio as gr
  import requests
- import time
  import re
  from duckduckgo_search import DDGS
- from bs4 import BeautifulSoup
  import anthropic
  import os
- from datetime import datetime, timedelta
- from dateutil import parser
  import json
- import threading
- from concurrent.futures import ThreadPoolExecutor, as_completed
-
- # Initialize Anthropic client
- client = anthropic.Anthropic(
-     api_key=os.getenv("ANTHROPIC_API_KEY")
- )
-
- # Global variable for cancellation
- cancel_operation = threading.Event()

- def reset_cancellation():
-     cancel_operation.clear()

- def check_cancellation():
-     return cancel_operation.is_set()

- # === Helper Functions (Hardened for Stability) ===
-
- def extract_publication_date(soup):
      """
-     BULLETPROOF VERSION: Safely extracts a publication date.
-     This was the most likely source of the IndexError.
      """
-     try:
-         # Prioritize structured data
-         date_selectors = [
-             'time[datetime]', 'meta[property="article:published_time"]',
-             'meta[name="publishdate"]', 'meta[name="date"]'
-         ]
-         for selector in date_selectors:
-             element = soup.select_one(selector)
-             if element:
-                 date_text = element.get('datetime') or element.get('content')
-                 if date_text:
-                     try: return parser.parse(date_text)
-                     except (ValueError, TypeError): continue
-
-         # Fallback to text patterns
-         text_content = soup.get_text()[:2000]
-         date_patterns = [
-             r'(\w+ \d{1,2}, \d{4})',   # January 1, 2023
-             r'(\d{4}-\d{2}-\d{2})',    # 2023-01-01
-             r'(\d{1,2}/\d{1,2}/\d{4})' # 01/01/2023
-         ]
-         for pattern in date_patterns:
-             matches = re.findall(pattern, text_content)
-             # THE CRITICAL FIX: Ensure 'matches' is not empty before accessing index 0.
-             if matches:
-                 try:
-                     return parser.parse(matches[0])
-                 except (ValueError, TypeError):
-                     continue
-     except Exception as e:
-         print(f"Error in date extraction: {e}")
-     return None
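For reference, a minimal standalone sketch of the guard the "CRITICAL FIX" comment above describes (made-up sample text): `re.findall` returns an empty list when nothing matches, so the index access has to be protected.

```python
import re

text = "Published on January 5, 2023 by staff."
matches = re.findall(r'(\w+ \d{1,2}, \d{4})', text)
# re.findall returns [] when the pattern finds nothing, so guard the
# index access; an unguarded matches[0] is exactly the IndexError
# the removed helper's comment refers to.
if matches:
    print(matches[0])  # -> "January 5, 2023"
```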

- def get_full_article_with_timeout(url, timeout=15):
-     """Safely fetches and parses an article."""
-     if check_cancellation(): return "[CANCELLED]", None
-     try:
-         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
-         response = requests.get(url, headers=headers, timeout=timeout, verify=True)
-         response.raise_for_status()
-
-         soup = BeautifulSoup(response.content, 'html.parser')
-         pub_date = extract_publication_date(soup)

-         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form']):
-             element.decompose()
-
-         article_selectors = ['article', '.article-content', '.post-content', 'main article', '[role="main"]']
-         for selector in article_selectors:
-             content_area = soup.select_one(selector)
-             if content_area:
-                 text_parts = [p.get_text(strip=True) for p in content_area.find_all('p') if len(p.get_text(strip=True)) > 50]
-                 if text_parts:
-                     return '\n\n'.join(text_parts)[:10000], pub_date
-
-         # Fallback if no specific article tag is found
-         return soup.get_text(separator='\n\n', strip=True)[:10000], pub_date

-     except requests.exceptions.RequestException as e:
-         return f"[ERROR] Network error for {url}: {e}", None
      except Exception as e:
-         return f"[ERROR] Could not process article {url}: {e}", None
-
- def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int, progress=None) -> list:
-     if check_cancellation(): return []
-     queries = {
-         "recent": [f'"{name}" founder news 2024 2025', f'"{name}" CEO founder recent'],
-         "historical": [f'"{name}" founded established history', f'"{name}" founder origin story']
-     }.get(timeframe, [])
-
-     all_results = []
-     for idx, query in enumerate(queries):
-         if len(all_results) >= max_articles: break
-         if progress: progress((idx / len(queries)) * 0.3, desc=f"Searching {timeframe} ({idx+1}/{len(queries)})")
-         try:
-             with DDGS(timeout=10) as ddgs:
-                 results = ddgs.text(keywords=query, max_results=max_articles, safesearch='moderate')
-                 if results:
-                     existing_urls = {r.get('href') for r in all_results}
-                     for res in results:
-                         if res.get('href') and res.get('href') not in existing_urls:
-                             all_results.append(res)
-         except Exception as e:
-             print(f"Search query '{query}' failed: {e}")
-     return all_results
-
- # === Core Workflow Functions ===
-
- def search_workflow(name: str, article_count: int, progress=gr.Progress()):
-     reset_cancellation()
-     progress(0, desc="Initializing search...")

-     recent_count = article_count // 2
-     historical_count = article_count - recent_count

-     recent_results = search_articles_by_timeframe(name, "recent", recent_count, progress)
-     if check_cancellation(): return "[CANCELLED]", ""

-     historical_results = search_articles_by_timeframe(name, "historical", historical_count, progress)
-     if check_cancellation(): return "[CANCELLED]", ""

-     all_sources = (recent_results or []) + (historical_results or [])
-     if not all_sources:
-         return "[INFO] No articles found.", ""

-     progress(0.4, desc=f"Found {len(all_sources)} articles, fetching content...")
-
-     fetched_articles = []
-     with ThreadPoolExecutor(max_workers=3) as executor:
-         future_to_url = {executor.submit(get_full_article_with_timeout, src.get('href')): src for src in all_sources}
-         for i, future in enumerate(as_completed(future_to_url)):
-             if check_cancellation(): return "[CANCELLED]", ""
-             progress(0.4 + (i / len(all_sources)) * 0.55, desc=f"Fetching {i+1}/{len(all_sources)}")
-             try:
-                 content, pub_date = future.result()
-                 source = future_to_url[future]
-                 fetched_articles.append({
-                     "title": source.get('title', 'No Title'),
-                     "url": source.get('href', 'No URL'),
-                     "content": content,
-                     "date": pub_date.strftime("%B %d, %Y") if pub_date else "Unknown Date"
-                 })
-             except Exception as e:
-                 print(f"Failed to fetch a result: {e}")
-
-     if not fetched_articles:
-         return "[ERROR] Could not fetch content for any articles.", ""
-
-     progress(0.95, desc="Formatting results...")
-
-     # Assemble the final markdown and raw text for the next step
-     markdown_output = ""
-     raw_text_for_ai = ""
-     for i, article in enumerate(fetched_articles):
-         markdown_output += f"### {i+1}. {article['title']}\n"
-         markdown_output += f"**Source**: [{article['url']}]({article['url']})\n"
-         markdown_output += f"**Date**: {article['date']}\n\n"
-         markdown_output += f"{article['content'][:800]}...\n\n---\n\n"  # Show a snippet
-
-         raw_text_for_ai += f"Article {i+1}:\nTitle: {article['title']}\nContent: {article['content']}\n\n"
-
-     return markdown_output, raw_text_for_ai

  def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
      if not raw_text or not raw_text.strip():
-         return "❌ Nothing to extract. Please run a search first."

-     progress(0, desc="Preparing for AI extraction...")
-     prompt = f"""From the provided articles about "{company_name}", extract the names of individuals explicitly identified as a founder.
- Return a single, valid JSON object with the structure: {{"founders": [{{"name": "Founder's Name", "evidence": "A brief quote proving they founded {company_name}."}}]}}
  Do not add any text outside the JSON object.

  ARTICLES:
@@ -191,82 +98,82 @@ ARTICLES:
  ---
  """
      try:
-         progress(0.5, desc="Analyzing with AI model...")
-         message = client.messages.create(
              model="claude-sonnet-4-20250514",  # As requested
-             max_tokens=1500,
              temperature=0.0,
              messages=[{"role": "user", "content": prompt}]
          )
-
-         # Robust check for the API response
-         if message and isinstance(message.content, list) and message.content:
              text_block = message.content[0]
              if hasattr(text_block, 'text'):
                  json_text = text_block.text
-                 try:
-                     # Validate and format the JSON
-                     parsed_json = json.loads(json_text)
-                     formatted_json = json.dumps(parsed_json, indent=2)
-                     progress(1.0, desc="Extraction complete!")
-                     return f"```json\n{formatted_json}\n```"
-                 except json.JSONDecodeError:
-                     return f"⚠️ **Model Warning**: The AI returned text that is not valid JSON.\n\n{json_text}"
-
-         # This block runs if the API response is empty or malformed
-         return "❌ **API Error**: The AI model returned an empty or invalid response. This might be due to content filters."

      except Exception as e:
          return f"❌ **An unexpected error occurred during extraction**: {e}"

- def cancel_flow():
-     cancel_operation.set()
-     return "🛑 Cancellation requested..."

- # === Gradio UI (Clean and Stable) ===

  with gr.Blocks(title="Founder Finder", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 🔎 Founder Finder")
-     gr.Markdown("A robust tool to find company founders using web search and AI extraction.")

-     # State to hold the raw text from search for the extraction step
      search_results_for_ai = gr.State("")

      with gr.Row():
-         with gr.Column(scale=2): name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'OpenAI', 'SpaceX'")
-         with gr.Column(scale=1): article_count_slider = gr.Slider(2, 8, value=4, step=2, label="Articles to Search")
-
-     with gr.Row():
-         search_btn = gr.Button("1. 🔍 Search for Articles", variant="primary")
-         extract_btn = gr.Button("2. 📊 Extract Founders from Search", variant="secondary")
-         cancel_btn = gr.Button("🛑 Cancel", variant="stop")

-     status_output = gr.Markdown("Ready...")

      with gr.Tab("Founder Intelligence Report"):
-         output_extract = gr.Markdown(label="Extracted Founder Information")
-     with gr.Tab("Raw Search Results"):
-         output_search = gr.Markdown(label="Article Snippets & Sources")

-     # Wire the UI events cleanly
-     search_event = search_btn.click(
          fn=search_workflow,
-         inputs=[name_input, article_count_slider],
-         outputs=[output_search, search_results_for_ai]
      )

-     extract_event = extract_btn.click(
          fn=extraction_workflow,
          inputs=[search_results_for_ai, name_input],
-         outputs=[output_extract]
      )

-     # Cancellation can stop either long process
-     cancel_btn.click(fn=cancel_flow, inputs=None, outputs=status_output, cancels=[search_event, extract_event])
-
      gr.Examples(
-         examples=[["OpenAI", 4], ["Anthropic", 4], ["Mistral AI", 4]],
-         inputs=[name_input, article_count_slider],
      )

  demo.queue()

+ import gradio as gr
  import requests
  import re
  from duckduckgo_search import DDGS
  import anthropic
  import os
  import json

+ # Initialize clients
+ anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

+ # === 1. Simplified Search Workflow ===

+ def search_workflow(name: str, progress=gr.Progress()):
      """
+     A simple, sequential, and robust function to search for articles.
+     It fetches exactly 8 articles: 4 recent, 4 historical.
      """
+     if not name or not name.strip():
+         return "❌ Please enter a company name.", ""

+     progress(0, desc="Starting search...")
+
+     # Define search queries
+     recent_keywords = f'"{name}" founder news'
+     historical_keywords = f'"{name}" founder history origin'
+
+     all_articles_markdown = []
+     raw_text_for_ai = ""

+     try:
+         with DDGS(timeout=20) as ddgs:
+             # --- Fetch 4 Recent Articles (past year) ---
+             progress(0.1, desc="Searching for recent articles...")
+             # The 'timelimit="y"' parameter is a reliable way to get recent results.
+             recent_results = ddgs.text(keywords=recent_keywords, max_results=4, timelimit='y') or []
+
+             for i, res in enumerate(recent_results):
+                 title = res.get('title', 'No Title')
+                 url = res.get('href', '#')
+                 body = res.get('body', 'No snippet available.')
+
+                 # Format for display
+                 markdown = f"### (Recent) {title}\n**Source**: [{url}]({url})\n\n{body}\n"
+                 all_articles_markdown.append(markdown)
+
+                 # Format for AI
+                 raw_text_for_ai += f"Article (Recent):\nTitle: {title}\nContent: {body}\n\n"
+
+             # --- Fetch 4 Historical Articles ---
+             progress(0.5, desc="Searching for historical articles...")
+             historical_results = ddgs.text(keywords=historical_keywords, max_results=4) or []
+
+             for i, res in enumerate(historical_results):
+                 title = res.get('title', 'No Title')
+                 url = res.get('href', '#')
+                 body = res.get('body', 'No snippet available.')
+
+                 # Format for display
+                 markdown = f"### (Historical) {title}\n**Source**: [{url}]({url})\n\n{body}\n"
+                 all_articles_markdown.append(markdown)
+
+                 # Format for AI
+                 raw_text_for_ai += f"Article (Historical):\nTitle: {title}\nContent: {body}\n\n"

      except Exception as e:
+         return f"❌ An error occurred during search: {e}", ""
+
+     if not all_articles_markdown:
+         return "[INFO] No articles found for that company.", ""

+     progress(1.0, desc="Search complete!")

+     final_markdown = f"## Found {len(all_articles_markdown)} Articles\n\n" + "\n---\n".join(all_articles_markdown)

+     return final_markdown, raw_text_for_ai
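For reference, a minimal standalone sketch of the `timelimit` behavior the new `search_workflow` leans on (placeholder query; assumes the `duckduckgo_search` package is installed):

```python
from duckduckgo_search import DDGS

# timelimit='y' restricts hits to roughly the past year; 'd', 'w', and 'm'
# (day, week, month) are the other accepted values; omit it for no limit.
with DDGS(timeout=20) as ddgs:
    results = ddgs.text(keywords='"OpenAI" founder news', max_results=4, timelimit='y') or []

for res in results:
    # Each hit is a dict with 'title', 'href', and 'body' keys.
    print(res.get('title'), '->', res.get('href'))
```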
 
77
 
 
 
 
78
 
79
+ # === 2. Simplified Extraction Workflow ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
82
+ """
83
+ A simple and robust function to extract founders from text using the AI model.
84
+ """
85
  if not raw_text or not raw_text.strip():
86
+ return "❌ Please run a search first to get text to analyze."
87
+
88
+ progress(0, desc="Preparing prompt for AI...")
89
 
90
+ prompt = f"""From the provided article snippets about "{company_name}", extract the names of individuals explicitly identified as a founder.
91
+ Return a single, valid JSON object with the structure: {{"founders": [{{"name": "Founder's Name", "evidence": "A brief quote or context."}}]}}
92
+ If no founders are mentioned, return an empty list: {{"founders": []}}.
93
  Do not add any text outside the JSON object.
94
 
95
  ARTICLES:
 
98
  ---
99
  """
100
  try:
101
+ progress(0.5, desc="Sending request to AI model...")
102
+ message = anthropic_client.messages.create(
103
  model="claude-sonnet-4-20250514", # As requested
104
+ max_tokens=1024,
105
  temperature=0.0,
106
  messages=[{"role": "user", "content": prompt}]
107
  )
108
+
109
+ # This robust check prevents the 'list index out of range' error.
110
+ if message and message.content and isinstance(message.content, list) and len(message.content) > 0:
111
  text_block = message.content[0]
112
  if hasattr(text_block, 'text'):
113
  json_text = text_block.text
114
+
115
+ # Clean the response to find the JSON object
116
+ match = re.search(r'\{.*\}', json_text, re.DOTALL)
117
+ if match:
118
+ clean_json = match.group(0)
119
+ try:
120
+ parsed_json = json.loads(clean_json)
121
+ formatted_json = json.dumps(parsed_json, indent=2)
122
+ progress(1.0, desc="Extraction complete!")
123
+ return f"```json\n{formatted_json}\n```"
124
+ except json.JSONDecodeError:
125
+ return f"⚠️ **AI Warning**: The model returned malformed JSON.\n\n{clean_json}"
126
+ else:
127
+ return f"⚠️ **AI Warning**: The model did not return a JSON object.\n\n{json_text}"
128
+
129
+ return "❌ **API Error**: The AI model returned an empty or invalid response."
130
 
131
  except Exception as e:
132
  return f"❌ **An unexpected error occurred during extraction**: {e}"

+ # === 3. Simplified Gradio UI ===

  with gr.Blocks(title="Founder Finder", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🔎 Simple Founder Finder")
+     gr.Markdown("A simplified and robust tool to find company founders. **Step 1:** Search for articles. **Step 2:** Extract founders from the results.")

+     # Hidden state to pass text from search to extraction
      search_results_for_ai = gr.State("")

      with gr.Row():
+         name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'OpenAI', 'SpaceX'", scale=3)
+         search_btn = gr.Button("1. 🔍 Search for Articles", variant="primary", scale=1)

+     with gr.Row():
+         extract_btn = gr.Button("2. 📊 Extract Founders from Search Results", variant="secondary")

      with gr.Tab("Founder Intelligence Report"):
+         output_extract = gr.Markdown()
+     with gr.Tab("Search Results"):
+         output_search = gr.Markdown()

+     # --- Event Wiring ---
+
+     # Search button populates the search results tab and the hidden state
+     search_btn.click(
          fn=search_workflow,
+         inputs=[name_input],
+         outputs=[output_search, search_results_for_ai],
+         show_progress="full"
      )

+     # Extract button uses the hidden state to populate the extraction tab
+     extract_btn.click(
          fn=extraction_workflow,
          inputs=[search_results_for_ai, name_input],
+         outputs=[output_extract],
+         show_progress="full"
      )

      gr.Examples(
+         examples=["OpenAI", "Anthropic", "Mistral AI", "Hugging Face"],
+         inputs=[name_input],
      )

  demo.queue()