dygoo committed on
Commit ddd51b5 · verified · 1 Parent(s): 99f18e1

Update app.py

Files changed (1)
  1. app.py +4 -406
app.py CHANGED
@@ -13,8 +13,7 @@ anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
 
 def search_workflow(name: str, progress=gr.Progress()):
     """
-    A simple, sequential, and robust function to search for articles.
-    It fetches exactly 8 articles: 4 recent, 4 historical.
+    A simple function to search for articles, fetching exactly 8 news articles: 4 recent, 4 historical.
     """
     if not name or not name.strip():
         return "❌ Please enter a company name.", ""
@@ -80,7 +79,7 @@ def search_workflow(name: str, progress=gr.Progress()):
 
 def extraction_workflow(raw_text: str, company_name: str, progress=gr.Progress()):
     """
-    A simple and robust function to extract founders from text using the AI model.
+    A simple and robust tool to extract founders from text using the AI model.
     """
     if not raw_text or not raw_text.strip():
         return "❌ Please run a search first to get text to analyze."
@@ -135,8 +134,8 @@ ARTICLES:
 # === 3. Simplified Gradio UI ===
 
 with gr.Blocks(title="Founder Finder", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🔎 Simple Founder Finder")
-    gr.Markdown("A simplified and robust tool to find company founders. **Step 1:** Search for articles. **Step 2:** Extract founders from the results.")
+    gr.Markdown("# 🔎 Founder Finder")
+    gr.Markdown("A tool to find company founders. **Step 1:** Search for articles. **Step 2:** Extract founders from the results.")
 
     # Hidden state to pass text from search to extraction
     search_results_for_ai = gr.State("")
@@ -178,404 +177,3 @@ demo.queue()
 if __name__ == "__main__":
     demo.launch(show_error=True)
 
-'''
-import gradio as gr
-import requests
-import time
-import re
-from duckduckgo_search import DDGS
-from bs4 import BeautifulSoup
-import anthropic
-import os
-from datetime import datetime, timedelta
-from dateutil import parser
-import json
-
-# Initialize Anthropic client
-client = anthropic.Anthropic(
-    api_key=os.getenv("ANTHROPIC_API_KEY") # Set as secret in HF Space settings
-)
-
-# === Model functions ===
-
-def extract_publication_date(soup, url):
-    """Extract publication date from article HTML"""
-    try:
-        # Common date selectors
-        date_selectors = [
-            'time[datetime]',
-            '.date', '.publish-date', '.published', '.post-date',
-            '[class*="date"]', '[class*="time"]',
-            'meta[property="article:published_time"]',
-            'meta[name="publishdate"]',
-            'meta[name="date"]'
-        ]
-
-        for selector in date_selectors:
-            element = soup.select_one(selector)
-            if element:
-                date_text = element.get('datetime') or element.get('content') or element.get_text()
-                if date_text:
-                    try:
-                        return parser.parse(date_text)
-                    except:
-                        continue
-
-        # Look for date patterns in text
-        date_patterns = [
-            r'(\w+ \d{1,2}, \d{4})', # January 15, 2023
-            r'(\d{1,2}/\d{1,2}/\d{4})', # 01/15/2023
-            r'(\d{4}-\d{2}-\d{2})' # 2023-01-15
-        ]
-
-        text = soup.get_text()[:2000] # First 2000 chars
-        for pattern in date_patterns:
-            matches = re.findall(pattern, text)
-            if matches:
-                try:
-                    return parser.parse(matches[0])
-                except:
-                    continue
-
-    except Exception as e:
-        print(f"Date extraction error for {url}: {e}")
-
-    return None
-
-def get_full_article(url):
-    try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1'
-        }
-
-        response = requests.get(url, headers=headers, timeout=20, verify=True)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Extract publication date
-        pub_date = extract_publication_date(soup, url)
-
-        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
-            element.decompose()
-
-        article_selectors = [
-            'article', '.article-content', '.post-content', '.story-body', '.story-content',
-            '.entry-content', '.content-body', '.article-body', 'main article', 'main .content', 'main',
-            '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
-        ]
-
-        for selector in article_selectors:
-            content = soup.select_one(selector)
-            if content:
-                paragraphs = content.find_all(['p', 'div'], string=True)
-                if paragraphs:
-                    text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
-                    full_text = '\n\n'.join(text_parts)
-                    if len(full_text) > 300:
-                        return full_text[:10000], pub_date
-
-        body_text = soup.get_text(separator='\n\n', strip=True)
-        body_text = re.sub(r'\n{3,}', '\n\n', body_text)
-        return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
-
-    except requests.exceptions.Timeout:
-        return "[WARNING] Article fetch timeout - using snippet instead", None
-    except requests.exceptions.RequestException:
-        return "[ERROR] Could not fetch article: Network error", None
-    except Exception as e:
-        return f"[ERROR] Could not fetch article: {str(e)}", None
-
-def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
-    """Search for articles in specific timeframe"""
-
-    # Define search queries based on timeframe
-    if timeframe == "recent":
-        # Recent articles (news, updates, current events)
-        search_queries = [
-            f'"{name}" founder news 2024 2025',
-            f'"{name}" CEO founder recent',
-            f'"{name}" founder update latest'
-        ]
-    else: # historical
-        # Historical articles (founding, establishment, origin stories)
-        search_queries = [
-            f'"{name}" founded established history',
-            f'"{name}" founder origin story',
-            f'"{name}" started began founder',
-            f'"{name}" founder early days'
-        ]
-
-    all_results = []
-    max_retries = 2
-    base_delay = 3
-
-    for query_idx, search_query in enumerate(search_queries):
-        if len(all_results) >= max_articles:
-            break
-
-        for attempt in range(max_retries):
-            try:
-                print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
-                time.sleep(base_delay * (attempt + 1))
-
-                configs = [
-                    {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
-                    {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
-                ]
-
-                config = configs[min(attempt, len(configs)-1)]
-
-                with DDGS(timeout=config['timeout']) as ddgs:
-                    search_params = {
-                        'keywords': search_query,
-                        'max_results': max_articles - len(all_results) + 2, # Get a few extra to filter
-                        'safesearch': config['safesearch']
-                    }
-                    if config['region']:
-                        search_params['region'] = config['region']
-
-                    results = list(ddgs.text(**search_params))
-                    print(f"Found {len(results)} results for query {query_idx + 1}")
-
-                    if results:
-                        # Add unique results (avoid duplicates)
-                        existing_urls = {r.get('url', '') for r in all_results}
-                        for result in results:
-                            if len(all_results) >= max_articles:
-                                break
-                            url = result.get('href', '')
-                            if url and url not in existing_urls:
-                                all_results.append(result)
-                                existing_urls.add(url)
-                        break
-
-            except Exception as e:
-                print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
-                if attempt < max_retries - 1:
-                    time.sleep(base_delay * (attempt + 2))
-
-    return all_results[:max_articles]
-
-def categorize_article_by_date(pub_date):
-    """Categorize article as recent or historical based on publication date"""
-    if not pub_date:
-        return "unknown"
-
-    one_year_ago = datetime.now() - timedelta(days=365)
-
-    if pub_date >= one_year_ago:
-        return "recent"
-    else:
-        return "historical"
-
-def search_articles(name: str, max_articles: int = 4) -> str:
-    """Enhanced search that ensures both recent and historical articles"""
-
-    # Split articles between recent and historical
-    recent_count = max_articles // 2
-    historical_count = max_articles - recent_count
-
-    print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
-
-    # Search for recent articles
-    recent_results = search_articles_by_timeframe(name, "recent", recent_count)
-    time.sleep(2) # Brief pause between timeframe searches
-
-    # Search for historical articles
-    historical_results = search_articles_by_timeframe(name, "historical", historical_count)
-
-    # Combine and process all results
-    all_results = []
-
-    # Process recent articles
-    for result in recent_results:
-        result['expected_timeframe'] = 'recent'
-        all_results.append(result)
-
-    # Process historical articles
-    for result in historical_results:
-        result['expected_timeframe'] = 'historical'
-        all_results.append(result)
-
-    if not all_results:
-        return f"[INFO] No articles found for {name}"
-
-    # Fetch and categorize articles
-    articles = []
-    recent_found = 0
-    historical_found = 0
-
-    for i, result in enumerate(all_results, 1):
-        url = result.get('href', 'No URL')
-        title = result.get('title', 'No Title')
-        snippet = result.get('body', 'No snippet available')
-        expected_timeframe = result.get('expected_timeframe', 'unknown')
-
-        if i > 1:
-            time.sleep(2)
-
-        full_text, pub_date = get_full_article(url)
-        actual_timeframe = categorize_article_by_date(pub_date)
-
-        # Count articles by actual timeframe
-        if actual_timeframe == "recent":
-            recent_found += 1
-        elif actual_timeframe == "historical":
-            historical_found += 1
-
-        if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
-            print(f"Using snippet fallback for article {i}")
-            content = f"[SNIPPET ONLY]\n{snippet}"
-        else:
-            content = full_text
-
-        # Create timeframe indicator
-        timeframe_indicator = ""
-        if pub_date:
-            date_str = pub_date.strftime("%B %d, %Y")
-            timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
-        else:
-            timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
-
-        article = f"### {i}. {title}\n"
-        article += f"[Source]({url})\n"
-        article += f"{timeframe_indicator}\n\n"
-        article += f"{content}\n"
-        articles.append(article)
-
-    # Add summary of coverage
-    summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
-
-    return summary + "\n---\n".join(articles)
-
-def extract_entities(search_results: str, company_name: str) -> str:
-    """Extract entities using Claude 4"""
-    MAX_CHARS = 15000
-    if len(search_results) > MAX_CHARS:
-        trunc = search_results[:MAX_CHARS]
-        last_period = trunc.rfind('. ')
-        search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
-
-    prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
-Only include founders who are explicitly mentioned as founders of {company_name}.
-Ignore founders of other companies that may be mentioned in the text.
-
-Return a JSON object with the following structure:
-{{
-  "founders": [
-    {{
-      "name": "Founder Name",
-      "evidence": ["brief quote or context where they were mentioned as founder"]
-    }}
-  ]
-}}
-
-Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
-
-You have to examine every article available in the search results below.
-
-Text:
-{search_results}"""
-
-    try:
-        message = client.messages.create(
-            model="claude-sonnet-4-20250514",
-            max_tokens=1500,
-            temperature=0.1,
-            messages=[
-                {
-                    "role": "user",
-                    "content": prompt
-                }
-            ]
-        )
-        return message.content[0].text
-
-    except Exception as e:
-        return f"[ERROR] Extraction failed: {str(e)}"
-
-# === Gradio interface functions ===
-
-def search_only(name: str, article_count: int):
-    if not name.strip():
-        return "No name provided", ""
-
-    try:
-        start = time.time()
-        articles_output = search_articles(name.strip(), max_articles=article_count)
-        elapsed = time.time() - start
-
-        results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
-        results += articles_output
-
-        return results, articles_output
-    except Exception as e:
-        return f"[ERROR] Search failed: {str(e)}", ""
-
-def extract_only(stored_results: str, company_name: str):
-    if not stored_results.strip():
-        return "No search results available. Please search first."
-
-    if not company_name.strip():
-        return "No company name provided. Please search first."
-
-    try:
-        start = time.time()
-        entities = extract_entities(stored_results, company_name.strip())
-        elapsed = time.time() - start
-
-        # Try to format JSON for better readability
-        try:
-            parsed = json.loads(entities)
-            formatted = json.dumps(parsed, indent=2)
-            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
-        except:
-            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
-    except Exception as e:
-        return f"[ERROR] Extraction failed: {str(e)}"
-
-# === Gradio UI ===
-
-with gr.Blocks(title="Enhanced Founder Finder") as demo:
-    gr.Markdown("# 🔎 Enhanced Founder Finder")
-    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
-    gr.Markdown("*🚀 **New**: Automatically searches for both recent news AND historical founding information*")
-    gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")
-
-    search_state = gr.State("")
-
-    with gr.Row():
-        name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
-        article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
-        with gr.Column():
-            search_btn = gr.Button("🔍 Enhanced Temporal Search", variant="primary")
-            extract_btn = gr.Button("📊 Extract Founder Intelligence", variant="secondary")
-
-    output1 = gr.Markdown(label="Search Results with Temporal Analysis")
-    output2 = gr.Textbox(
-        label="Founder Intelligence Report",
-        lines=15,
-        max_lines=25,
-        show_copy_button=True
-    )
-
-    search_btn.click(
-        fn=search_only,
-        inputs=[name_input, article_count_slider],
-        outputs=[output1, search_state]
-    )
-
-    extract_btn.click(
-        fn=extract_only,
-        inputs=[search_state, name_input],
-        outputs=[output2]
-    )
-
-if __name__ == "__main__":
-    demo.launch()
-
-'''