dygoo committed on
Commit 3238b9e · verified · 1 Parent(s): 65d6d08

Update app.py

Files changed (1)
  1. app.py +407 -1
app.py CHANGED
@@ -6,6 +6,412 @@ from duckduckgo_search import DDGS
  from bs4 import BeautifulSoup
  import anthropic
  import os
+ from datetime import datetime, timedelta
+ from dateutil import parser
+ import json
+
+ # Initialize Anthropic client
+ client = anthropic.Anthropic(
+     api_key=os.getenv("ANTHROPIC_API_KEY")  # Set as secret in HF Space settings
+ )
+
+ # === Model functions ===
+
+ def extract_publication_date(soup, url):
+     """Extract publication date from article HTML"""
+     try:
+         # Common date selectors
+         date_selectors = [
+             'time[datetime]',
+             '.date', '.publish-date', '.published', '.post-date',
+             '[class*="date"]', '[class*="time"]',
+             'meta[property="article:published_time"]',
+             'meta[name="publishdate"]',
+             'meta[name="date"]'
+         ]
+
+         for selector in date_selectors:
+             element = soup.select_one(selector)
+             if element:
+                 date_text = element.get('datetime') or element.get('content') or element.get_text()
+                 if date_text:
+                     try:
+                         return parser.parse(date_text)
+                     except:
+                         continue
+
+         # Look for date patterns in text
+         date_patterns = [
+             r'(\w+ \d{1,2}, \d{4})',     # January 15, 2023
+             r'(\d{1,2}/\d{1,2}/\d{4})',  # 01/15/2023
+             r'(\d{4}-\d{2}-\d{2})'       # 2023-01-15
+         ]
+
+         text = soup.get_text()[:2000]  # First 2000 chars
+         for pattern in date_patterns:
+             matches = re.findall(pattern, text)
+             if matches:
+                 try:
+                     return parser.parse(matches[0])
+                 except:
+                     continue
+
+     except Exception as e:
+         print(f"Date extraction error for {url}: {e}")
+
+     return None
+
+ def get_full_article(url):
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Connection': 'keep-alive',
+             'Upgrade-Insecure-Requests': '1'
+         }
+
+         response = requests.get(url, headers=headers, timeout=20, verify=True)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Extract publication date
+         pub_date = extract_publication_date(soup, url)
+
+         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
+             element.decompose()
+
+         article_selectors = [
+             'article', '.article-content', '.post-content', '.story-body', '.story-content',
+             '.entry-content', '.content-body', '.article-body', 'main article', 'main .content', 'main',
+             '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
+         ]
+
+         for selector in article_selectors:
+             content = soup.select_one(selector)
+             if content:
+                 paragraphs = content.find_all(['p', 'div'], string=True)
+                 if paragraphs:
+                     text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
+                     full_text = '\n\n'.join(text_parts)
+                     if len(full_text) > 300:
+                         return full_text[:10000], pub_date
+
+         body_text = soup.get_text(separator='\n\n', strip=True)
+         body_text = re.sub(r'\n{3,}', '\n\n', body_text)
+         return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date
+
+     except requests.exceptions.Timeout:
+         return "[WARNING] Article fetch timeout - using snippet instead", None
+     except requests.exceptions.RequestException:
+         return "[ERROR] Could not fetch article: Network error", None
+     except Exception as e:
+         return f"[ERROR] Could not fetch article: {str(e)}", None
+
+ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
+     """Search for articles in specific timeframe"""
+
+     # Define search queries based on timeframe
+     if timeframe == "recent":
+         # Recent articles (news, updates, current events)
+         search_queries = [
+             f'"{name}" founder news 2024 2025',
+             f'"{name}" CEO founder recent',
+             f'"{name}" founder update latest'
+         ]
+     else:  # historical
+         # Historical articles (founding, establishment, origin stories)
+         search_queries = [
+             f'"{name}" founded established history',
+             f'"{name}" founder origin story',
+             f'"{name}" started began founder',
+             f'"{name}" founder early days'
+         ]
+
+     all_results = []
+     max_retries = 2
+     base_delay = 3
+
+     for query_idx, search_query in enumerate(search_queries):
+         if len(all_results) >= max_articles:
+             break
+
+         for attempt in range(max_retries):
+             try:
+                 print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
+                 time.sleep(base_delay * (attempt + 1))
+
+                 configs = [
+                     {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
+                     {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
+                 ]
+
+                 config = configs[min(attempt, len(configs)-1)]
+
+                 with DDGS(timeout=config['timeout']) as ddgs:
+                     search_params = {
+                         'keywords': search_query,
+                         'max_results': max_articles - len(all_results) + 2,  # Get a few extra to filter
+                         'safesearch': config['safesearch']
+                     }
+                     if config['region']:
+                         search_params['region'] = config['region']
+
+                     results = list(ddgs.text(**search_params))
+                     print(f"Found {len(results)} results for query {query_idx + 1}")
+
+                     if results:
+                         # Add unique results (avoid duplicates); DDGS results keep the URL under 'href'
+                         existing_urls = {r.get('href', '') for r in all_results}
+                         for result in results:
+                             if len(all_results) >= max_articles:
+                                 break
+                             url = result.get('href', '')
+                             if url and url not in existing_urls:
+                                 all_results.append(result)
+                                 existing_urls.add(url)
+                         break
+
+             except Exception as e:
+                 print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
+                 if attempt < max_retries - 1:
+                     time.sleep(base_delay * (attempt + 2))
+
+     return all_results[:max_articles]
+
+ def categorize_article_by_date(pub_date):
+     """Categorize article as recent or historical based on publication date"""
+     if not pub_date:
+         return "unknown"
+
+     one_year_ago = datetime.now() - timedelta(days=365)
+
+     if pub_date >= one_year_ago:
+         return "recent"
+     else:
+         return "historical"
+
+ def search_articles(name: str, max_articles: int = 4) -> str:
+     """Enhanced search that ensures both recent and historical articles"""
+
+     # Split articles between recent and historical
+     recent_count = max_articles // 2
+     historical_count = max_articles - recent_count
+
+     print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
+
+     # Search for recent articles
+     recent_results = search_articles_by_timeframe(name, "recent", recent_count)
+     time.sleep(2)  # Brief pause between timeframe searches
+
+     # Search for historical articles
+     historical_results = search_articles_by_timeframe(name, "historical", historical_count)
+
+     # Combine and process all results
+     all_results = []
+
+     # Process recent articles
+     for result in recent_results:
+         result['expected_timeframe'] = 'recent'
+         all_results.append(result)
+
+     # Process historical articles
+     for result in historical_results:
+         result['expected_timeframe'] = 'historical'
+         all_results.append(result)
+
+     if not all_results:
+         return f"[INFO] No articles found for {name}"
+
+     # Fetch and categorize articles
+     articles = []
+     recent_found = 0
+     historical_found = 0
+
+     for i, result in enumerate(all_results, 1):
+         url = result.get('href', 'No URL')
+         title = result.get('title', 'No Title')
+         snippet = result.get('body', 'No snippet available')
+         expected_timeframe = result.get('expected_timeframe', 'unknown')
+
+         if i > 1:
+             time.sleep(2)
+
+         full_text, pub_date = get_full_article(url)
+         actual_timeframe = categorize_article_by_date(pub_date)
+
+         # Count articles by actual timeframe
+         if actual_timeframe == "recent":
+             recent_found += 1
+         elif actual_timeframe == "historical":
+             historical_found += 1
+
+         if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
+             print(f"Using snippet fallback for article {i}")
+             content = f"[SNIPPET ONLY]\n{snippet}"
+         else:
+             content = full_text
+
+         # Create timeframe indicator
+         timeframe_indicator = ""
+         if pub_date:
+             date_str = pub_date.strftime("%B %d, %Y")
+             timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
+         else:
+             timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
+
+         article = f"### {i}. {title}\n"
+         article += f"[Source]({url})\n"
+         article += f"{timeframe_indicator}\n\n"
+         article += f"{content}\n"
+         articles.append(article)
+
+     # Add summary of coverage
+     summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
+
+     return summary + "\n---\n".join(articles)
+
+ def extract_entities(search_results: str, company_name: str) -> str:
+     """Extract entities using Claude 4"""
+     MAX_CHARS = 12000  # Increased to handle more content
+     if len(search_results) > MAX_CHARS:
+         trunc = search_results[:MAX_CHARS]
+         last_period = trunc.rfind('. ')
+         search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
+
+     prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
+ Only include founders who are explicitly mentioned as founders of {company_name}.
+ Ignore founders of other companies that may be mentioned in the text.
+
+ Also identify the temporal context for each founder mention (recent news vs historical founding information).
+
+ Return a JSON object with the following structure:
+ {{
+     "founders": [
+         {{
+             "name": "Founder Name",
+             "type": "person" or "organization",
+             "context": "recent" or "historical" or "both",
+             "evidence": ["brief quote or context where they were mentioned as founder"]
+         }}
+     ],
+     "founding_timeline": {{
+         "founding_date": "date if mentioned",
+         "key_events": ["important founding milestones mentioned"]
+     }},
+     "confidence": "high/medium/low based on clarity of founder information"
+ }}
+
+ Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
+
+ Text:
+ {search_results}"""
+
+     try:
+         message = client.messages.create(
+             model="claude-sonnet-4-20250514",
+             max_tokens=1500,
+             temperature=0.1,
+             messages=[
+                 {
+                     "role": "user",
+                     "content": prompt
+                 }
+             ]
+         )
+         return message.content[0].text
+
+     except Exception as e:
+         return f"[ERROR] Extraction failed: {str(e)}"
+
+ # === Gradio interface functions ===
+
+ def search_only(name: str, article_count: int):
+     if not name.strip():
+         return "No name provided", ""
+
+     try:
+         start = time.time()
+         articles_output = search_articles(name.strip(), max_articles=article_count)
+         elapsed = time.time() - start
+
+         results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
+         results += articles_output
+
+         return results, articles_output
+     except Exception as e:
+         return f"[ERROR] Search failed: {str(e)}", ""
+
+ def extract_only(stored_results: str, company_name: str):
+     if not stored_results.strip():
+         return "No search results available. Please search first."
+
+     if not company_name.strip():
+         return "No company name provided. Please search first."
+
+     try:
+         start = time.time()
+         entities = extract_entities(stored_results, company_name.strip())
+         elapsed = time.time() - start
+
+         # Try to format JSON for better readability
+         try:
+             parsed = json.loads(entities)
+             formatted = json.dumps(parsed, indent=2)
+             return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
+         except:
+             return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
+     except Exception as e:
+         return f"[ERROR] Extraction failed: {str(e)}"
+
+ # === Gradio UI ===
+
+ with gr.Blocks(title="Enhanced Founder Finder") as demo:
+     gr.Markdown("# 🔎 Enhanced Founder Finder")
+     gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
+     gr.Markdown("*🚀 **New**: Automatically searches for both recent news AND historical founding information*")
+     gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")
+
+     search_state = gr.State("")
+
+     with gr.Row():
+         name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
+         article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
+         with gr.Column():
+             search_btn = gr.Button("🔍 Enhanced Temporal Search", variant="primary")
+             extract_btn = gr.Button("📊 Extract Founder Intelligence", variant="secondary")
+
+     output1 = gr.Markdown(label="Search Results with Temporal Analysis")
+     output2 = gr.Textbox(
+         label="Founder Intelligence Report",
+         lines=15,
+         max_lines=25,
+         show_copy_button=True
+     )
+
+     search_btn.click(
+         fn=search_only,
+         inputs=[name_input, article_count_slider],
+         outputs=[output1, search_state]
+     )
+
+     extract_btn.click(
+         fn=extract_only,
+         inputs=[search_state, name_input],
+         outputs=[output2]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
+
+ ''' import gradio as gr
+ import requests
+ import time
+ import re
+ from duckduckgo_search import DDGS
+ from bs4 import BeautifulSoup
+ import anthropic
+ import os

  # Initialize Anthropic client
  client = anthropic.Anthropic(
@@ -231,5 +637,5 @@ with gr.Blocks(title="Founder Finder") as demo:

  if __name__ == "__main__":
      demo.launch()
-
+ '''
