dygoo committed
Commit cdb081c · verified · 1 Parent(s): 5aafe64

Update app.py

Files changed (1):
  1. app.py +114 -207

app.py CHANGED
@@ -1,188 +1,95 @@
- import gradio as gr
  import requests
- import time
  import re
  from duckduckgo_search import DDGS
- from bs4 import BeautifulSoup
  import anthropic
  import os
- from datetime import datetime, timedelta
- from dateutil import parser
  import json
- import threading
- from concurrent.futures import ThreadPoolExecutor, as_completed
-
- # Initialize Anthropic client
- client = anthropic.Anthropic(
-     api_key=os.getenv("ANTHROPIC_API_KEY")
- )
-
- # Global variable for cancellation
- cancel_operation = threading.Event()

- def reset_cancellation():
-     cancel_operation.clear()

- def check_cancellation():
-     return cancel_operation.is_set()

- # === Helper Functions (Hardened for Stability) ===
-
- def extract_publication_date(soup):
      """
-     BULLETPROOF VERSION: Safely extracts a publication date.
-     This was the most likely source of the IndexError.
      """
-     try:
-         # Prioritize structured data
-         date_selectors = [
-             'time[datetime]', 'meta[property="article:published_time"]',
-             'meta[name="publishdate"]', 'meta[name="date"]'
-         ]
-         for selector in date_selectors:
-             element = soup.select_one(selector)
-             if element:
-                 date_text = element.get('datetime') or element.get('content')
-                 if date_text:
-                     try: return parser.parse(date_text)
-                     except (ValueError, TypeError): continue
-
-         # Fallback to text patterns
-         text_content = soup.get_text()[:2000]
-         date_patterns = [
-             r'(\w+ \d{1,2}, \d{4})',   # January 1, 2023
-             r'(\d{4}-\d{2}-\d{2})',    # 2023-01-01
-             r'(\d{1,2}/\d{1,2}/\d{4})' # 01/01/2023
-         ]
-         for pattern in date_patterns:
-             matches = re.findall(pattern, text_content)
-             # THE CRITICAL FIX: Ensure 'matches' is not empty before accessing index 0.
-             if matches:
-                 try:
-                     return parser.parse(matches[0])
-                 except (ValueError, TypeError):
-                     continue
-     except Exception as e:
-         print(f"Error in date extraction: {e}")
-     return None
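For reference, a minimal standalone sketch of the guard the "CRITICAL FIX" comment above describes (made-up sample text): `re.findall` returns an empty list when nothing matches, so the index access has to be protected.

```python
import re

text = "Published on January 5, 2023 by staff."
matches = re.findall(r'(\w+ \d{1,2}, \d{4})', text)
# re.findall returns [] when the pattern finds nothing, so guard the
# index access; an unguarded matches[0] is exactly the IndexError
# the removed helper's comment refers to.
if matches:
    print(matches[0])  # -> "January 5, 2023"
```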

- def get_full_article_with_timeout(url, timeout=15):
-     """Safely fetches and parses an article."""
-     if check_cancellation(): return "[CANCELLED]", None
-     try:
-         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
-         response = requests.get(url, headers=headers, timeout=timeout, verify=True)
-         response.raise_for_status()
-
-         soup = BeautifulSoup(response.content, 'html.parser')
-         pub_date = extract_publication_date(soup)

-         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form']):
-             element.decompose()
-
-         article_selectors = ['article', '.article-content', '.post-content', 'main article', '[role="main"]']
-         for selector in article_selectors:
-             content_area = soup.select_one(selector)
-             if content_area:
-                 text_parts = [p.get_text(strip=True) for p in content_area.find_all('p') if len(p.get_text(strip=True)) > 50]
-                 if text_parts:
-                     return '\n\n'.join(text_parts)[:10000], pub_date
-
-         # Fallback if no specific article tag is found
-         return soup.get_text(separator='\n\n', strip=True)[:10000], pub_date

-     except requests.exceptions.RequestException as e:
-         return f"[ERROR] Network error for {url}: {e}", None
      except Exception as e:
-         return f"[ERROR] Could not process article {url}: {e}", None
-
- def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int, progress=None) -> list:
-     if check_cancellation(): return []
-     queries = {
-         "recent": [f'"{name}" founder news 2024 2025', f'"{name}" CEO founder recent'],
-         "historical": [f'"{name}" founded established history', f'"{name}" founder origin story']
-     }.get(timeframe, [])
-
-     all_results = []
-     for idx, query in enumerate(queries):
-         if len(all_results) >= max_articles: break
-         if progress: progress((idx / len(queries)) * 0.3, desc=f"Searching {timeframe} ({idx+1}/{len(queries)})")
-         try:
-             with DDGS(timeout=10) as ddgs:
-                 results = ddgs.text(keywords=query, max_results=max_articles, safesearch='moderate')
-                 if results:
-                     existing_urls = {r.get('href') for r in all_results}
-                     for res in results:
-                         if res.get('href') and res.get('href') not in existing_urls:
-                             all_results.append(res)
-         except Exception as e:
-             print(f"Search query '{query}' failed: {e}")
-     return all_results
-
- # === Core Workflow Functions ===
-
- def search_workflow(name: str, article_count: int, progress=gr.Progress()):
-     reset_cancellation()
-     progress(0, desc="Initializing search...")

-     recent_count = article_count // 2
-     historical_count = article_count - recent_count

-     recent_results = search_articles_by_timeframe(name, "recent", recent_count, progress)
-     if check_cancellation(): return "[CANCELLED]", ""

-     historical_results = search_articles_by_timeframe(name, "historical", historical_count, progress)
-     if check_cancellation(): return "[CANCELLED]", ""

-     all_sources = (recent_results or []) + (historical_results or [])
-     if not all_sources:
-         return "[INFO] No articles found.", ""

-     progress(0.4, desc=f"Found {len(all_sources)} articles, fetching content...")
-
-     fetched_articles = []
-     with ThreadPoolExecutor(max_workers=3) as executor:
-         future_to_url = {executor.submit(get_full_article_with_timeout, src.get('href')): src for src in all_sources}
-         for i, future in enumerate(as_completed(future_to_url)):
-             if check_cancellation(): return "[CANCELLED]", ""
-             progress(0.4 + (i / len(all_sources)) * 0.55, desc=f"Fetching {i+1}/{len(all_sources)}")
-             try:
-                 content, pub_date = future.result()
-                 source = future_to_url[future]
-                 fetched_articles.append({
-                     "title": source.get('title', 'No Title'),
-                     "url": source.get('href', 'No URL'),
-                     "content": content,
-                     "date": pub_date.strftime("%B %d, %Y") if pub_date else "Unknown Date"
-                 })
-             except Exception as e:
-                 print(f"Failed to fetch a result: {e}")
-
-     if not fetched_articles:
-         return "[ERROR] Could not fetch content for any articles.", ""
-
-     progress(0.95, desc="Formatting results...")
-
-     # Assemble the final markdown and raw text for the next step
-     markdown_output = ""
-     raw_text_for_ai = ""
-     for i, article in enumerate(fetched_articles):
-         markdown_output += f"### {i+1}. {article['title']}\n"
-         markdown_output += f"**Source**: [{article['url']}]({article['url']})\n"
-         markdown_output += f"**Date**: {article['date']}\n\n"
-         markdown_output += f"{article['content'][:800]}...\n\n---\n\n"  # Show a snippet
-
-         raw_text_for_ai += f"Article {i+1}:\nTitle: {article['title']}\nContent: {article['content']}\n\n"
-
-     return markdown_output, raw_text_for_ai

  def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
      if not raw_text or not raw_text.strip():
-         return "❌ Nothing to extract. Please run a search first."

-     progress(0, desc="Preparing for AI extraction...")
-     prompt = f"""From the provided articles about "{company_name}", extract the names of individuals explicitly identified as a founder.
- Return a single, valid JSON object with the structure: {{"founders": [{{"name": "Founder's Name", "evidence": "A brief quote proving they founded {company_name}."}}]}}
  Do not add any text outside the JSON object.

  ARTICLES:
@@ -191,82 +98,82 @@ ARTICLES:
  ---
  """
      try:
-         progress(0.5, desc="Analyzing with AI model...")
-         message = client.messages.create(
              model="claude-sonnet-4-20250514",  # As requested
-             max_tokens=1500,
              temperature=0.0,
              messages=[{"role": "user", "content": prompt}]
          )
-
-         # Robust check for the API response
-         if message and isinstance(message.content, list) and message.content:
              text_block = message.content[0]
              if hasattr(text_block, 'text'):
                  json_text = text_block.text
-                 try:
-                     # Validate and format the JSON
-                     parsed_json = json.loads(json_text)
-                     formatted_json = json.dumps(parsed_json, indent=2)
-                     progress(1.0, desc="Extraction complete!")
-                     return f"```json\n{formatted_json}\n```"
-                 except json.JSONDecodeError:
-                     return f"⚠️ **Model Warning**: The AI returned text that is not valid JSON.\n\n{json_text}"
-
-         # This block runs if the API response is empty or malformed
-         return "❌ **API Error**: The AI model returned an empty or invalid response. This might be due to content filters."

      except Exception as e:
          return f"❌ **An unexpected error occurred during extraction**: {e}"

- def cancel_flow():
-     cancel_operation.set()
-     return "🛑 Cancellation requested..."

- # === Gradio UI (Clean and Stable) ===

  with gr.Blocks(title="Founder Finder", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 🔎 Founder Finder")
-     gr.Markdown("A robust tool to find company founders using web search and AI extraction.")

-     # State to hold the raw text from search for the extraction step
      search_results_for_ai = gr.State("")

      with gr.Row():
-         with gr.Column(scale=2): name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'OpenAI', 'SpaceX'")
-         with gr.Column(scale=1): article_count_slider = gr.Slider(2, 8, value=4, step=2, label="Articles to Search")
-
-     with gr.Row():
-         search_btn = gr.Button("1. 🔍 Search for Articles", variant="primary")
-         extract_btn = gr.Button("2. 📊 Extract Founders from Search", variant="secondary")
-         cancel_btn = gr.Button("🛑 Cancel", variant="stop")

-     status_output = gr.Markdown("Ready...")

      with gr.Tab("Founder Intelligence Report"):
-         output_extract = gr.Markdown(label="Extracted Founder Information")
-     with gr.Tab("Raw Search Results"):
-         output_search = gr.Markdown(label="Article Snippets & Sources")

-     # Wire the UI events cleanly
-     search_event = search_btn.click(
          fn=search_workflow,
-         inputs=[name_input, article_count_slider],
-         outputs=[output_search, search_results_for_ai]
      )

-     extract_event = extract_btn.click(
          fn=extraction_workflow,
          inputs=[search_results_for_ai, name_input],
-         outputs=[output_extract]
      )

-     # Cancellation can stop either long process
-     cancel_btn.click(fn=cancel_flow, inputs=None, outputs=status_output, cancels=[search_event, extract_event])
-
      gr.Examples(
-         examples=[["OpenAI", 4], ["Anthropic", 4], ["Mistral AI", 4]],
-         inputs=[name_input, article_count_slider],
      )

  demo.queue()

+ import gradio as gr
  import requests
  import re
  from duckduckgo_search import DDGS
  import anthropic
  import os
  import json

+ # Initialize clients
+ anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

+ # === 1. Simplified Search Workflow ===

+ def search_workflow(name: str, progress=gr.Progress()):
      """
+     A simple, sequential, and robust function to search for articles.
+     It fetches exactly 8 articles: 4 recent, 4 historical.
      """
+     if not name or not name.strip():
+         return "❌ Please enter a company name.", ""

+     progress(0, desc="Starting search...")
+
+     # Define search queries
+     recent_keywords = f'"{name}" founder news'
+     historical_keywords = f'"{name}" founder history origin'
+
+     all_articles_markdown = []
+     raw_text_for_ai = ""

+     try:
+         with DDGS(timeout=20) as ddgs:
+             # --- Fetch 4 Recent Articles (past year) ---
+             progress(0.1, desc="Searching for recent articles...")
+             # The 'timelimit="y"' parameter is a reliable way to get recent results.
+             recent_results = ddgs.text(keywords=recent_keywords, max_results=4, timelimit='y') or []
+
+             for i, res in enumerate(recent_results):
+                 title = res.get('title', 'No Title')
+                 url = res.get('href', '#')
+                 body = res.get('body', 'No snippet available.')
+
+                 # Format for display
+                 markdown = f"### (Recent) {title}\n**Source**: [{url}]({url})\n\n{body}\n"
+                 all_articles_markdown.append(markdown)
+
+                 # Format for AI
+                 raw_text_for_ai += f"Article (Recent):\nTitle: {title}\nContent: {body}\n\n"
+
+             # --- Fetch 4 Historical Articles ---
+             progress(0.5, desc="Searching for historical articles...")
+             historical_results = ddgs.text(keywords=historical_keywords, max_results=4) or []
+
+             for i, res in enumerate(historical_results):
+                 title = res.get('title', 'No Title')
+                 url = res.get('href', '#')
+                 body = res.get('body', 'No snippet available.')
+
+                 # Format for display
+                 markdown = f"### (Historical) {title}\n**Source**: [{url}]({url})\n\n{body}\n"
+                 all_articles_markdown.append(markdown)
+
+                 # Format for AI
+                 raw_text_for_ai += f"Article (Historical):\nTitle: {title}\nContent: {body}\n\n"

      except Exception as e:
+         return f"❌ An error occurred during search: {e}", ""
+
+     if not all_articles_markdown:
+         return "[INFO] No articles found for that company.", ""

+     progress(1.0, desc="Search complete!")

+     final_markdown = f"## Found {len(all_articles_markdown)} Articles\n\n" + "\n---\n".join(all_articles_markdown)

+     return final_markdown, raw_text_for_ai
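For reference, a minimal standalone sketch of the `timelimit` behavior the new `search_workflow` leans on (placeholder query; assumes the `duckduckgo_search` package is installed):

```python
from duckduckgo_search import DDGS

# timelimit='y' restricts hits to roughly the past year; 'd', 'w', and 'm'
# (day, week, month) are the other accepted values; omit it for no limit.
with DDGS(timeout=20) as ddgs:
    results = ddgs.text(keywords='"OpenAI" founder news', max_results=4, timelimit='y') or []

for res in results:
    # Each hit is a dict with 'title', 'href', and 'body' keys.
    print(res.get('title'), '->', res.get('href'))
```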
 
77
 
 
 
 
78
 
79
+ # === 2. Simplified Extraction Workflow ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
82
+ """
83
+ A simple and robust function to extract founders from text using the AI model.
84
+ """
85
  if not raw_text or not raw_text.strip():
86
+ return "❌ Please run a search first to get text to analyze."
87
+
88
+ progress(0, desc="Preparing prompt for AI...")
89
 
90
+ prompt = f"""From the provided article snippets about "{company_name}", extract the names of individuals explicitly identified as a founder.
91
+ Return a single, valid JSON object with the structure: {{"founders": [{{"name": "Founder's Name", "evidence": "A brief quote or context."}}]}}
92
+ If no founders are mentioned, return an empty list: {{"founders": []}}.
93
  Do not add any text outside the JSON object.
94
 
95
  ARTICLES:
 
98
  ---
99
  """
100
  try:
101
+ progress(0.5, desc="Sending request to AI model...")
102
+ message = anthropic_client.messages.create(
103
  model="claude-sonnet-4-20250514", # As requested
104
+ max_tokens=1024,
105
  temperature=0.0,
106
  messages=[{"role": "user", "content": prompt}]
107
  )
108
+
109
+ # This robust check prevents the 'list index out of range' error.
110
+ if message and message.content and isinstance(message.content, list) and len(message.content) > 0:
111
  text_block = message.content[0]
112
  if hasattr(text_block, 'text'):
113
  json_text = text_block.text
114
+
115
+ # Clean the response to find the JSON object
116
+ match = re.search(r'\{.*\}', json_text, re.DOTALL)
117
+ if match:
118
+ clean_json = match.group(0)
119
+ try:
120
+ parsed_json = json.loads(clean_json)
121
+ formatted_json = json.dumps(parsed_json, indent=2)
122
+ progress(1.0, desc="Extraction complete!")
123
+ return f"```json\n{formatted_json}\n```"
124
+ except json.JSONDecodeError:
125
+ return f"⚠️ **AI Warning**: The model returned malformed JSON.\n\n{clean_json}"
126
+ else:
127
+ return f"⚠️ **AI Warning**: The model did not return a JSON object.\n\n{json_text}"
128
+
129
+ return "❌ **API Error**: The AI model returned an empty or invalid response."
130
 
131
  except Exception as e:
132
  return f"❌ **An unexpected error occurred during extraction**: {e}"

+ # === 3. Simplified Gradio UI ===

  with gr.Blocks(title="Founder Finder", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🔎 Simple Founder Finder")
+     gr.Markdown("A simplified and robust tool to find company founders. **Step 1:** Search for articles. **Step 2:** Extract founders from the results.")

+     # Hidden state to pass text from search to extraction
      search_results_for_ai = gr.State("")

      with gr.Row():
+         name_input = gr.Textbox(label="Company Name", placeholder="e.g., 'OpenAI', 'SpaceX'", scale=3)
+         search_btn = gr.Button("1. 🔍 Search for Articles", variant="primary", scale=1)

+     with gr.Row():
+         extract_btn = gr.Button("2. 📊 Extract Founders from Search Results", variant="secondary")

      with gr.Tab("Founder Intelligence Report"):
+         output_extract = gr.Markdown()
+     with gr.Tab("Search Results"):
+         output_search = gr.Markdown()

+     # --- Event Wiring ---
+
+     # Search button populates the search results tab and the hidden state
+     search_btn.click(
          fn=search_workflow,
+         inputs=[name_input],
+         outputs=[output_search, search_results_for_ai],
+         show_progress="full"
      )

+     # Extract button uses the hidden state to populate the extraction tab
+     extract_btn.click(
          fn=extraction_workflow,
          inputs=[search_results_for_ai, name_input],
+         outputs=[output_extract],
+         show_progress="full"
      )

      gr.Examples(
+         examples=["OpenAI", "Anthropic", "Mistral AI", "Hugging Face"],
+         inputs=[name_input],
      )

  demo.queue()