Update app.py
app.py
CHANGED

Removed from the previous app.py (grouped by diff hunk):

@@ -9,18 +9,31 @@ import os
-    api_key=os.getenv("ANTHROPIC_API_KEY")
-    """Extract publication date from article HTML"""
-        # Common date selectors

@@ -40,14 +53,13 @@ def extract_publication_date(soup, url):
-        # Look for date patterns in text
-            r'(\w+ \d{1,2}, \d{4})',
-            r'(\d{1,2}/\d{1,2}/\d{4})',
-            r'(\d{4}-\d{2}-\d{2})'
-        text = soup.get_text()[:2000]

@@ -61,7 +73,11 @@ def extract_publication_date(soup, url):

@@ -71,11 +87,10 @@ def get_full_article(url):
-        response = requests.get(url, headers=headers, timeout=
-        # Extract publication date

@@ -103,24 +118,24 @@ def get_full_article(url):
-    except requests.exceptions.RequestException:
-        return "[ERROR]
-    # Define search queries based on timeframe
-        # Recent articles (news, updates, current events)
-    else:
-        # Historical articles (founding, establishment, origin stories)

@@ -130,20 +145,31 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
-    base_delay =
-        if len(all_results) >= max_articles:
-                    {'timeout':
-                    {'timeout':

@@ -151,17 +177,15 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
-                        'max_results': max_articles - len(all_results) + 2,
-                    print(f"Found {len(results)} results for query {query_idx + 1}")
-                        # Add unique results (avoid duplicates)

@@ -175,114 +199,193 @@ def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int =
-                    time.sleep(base_delay * (attempt +
-    # Split articles between recent and historical
-    historical_results = search_articles_by_timeframe(name, "historical", historical_count)
-    # Process historical articles
-        full_text, pub_date = get_full_article(url)
-        actual_timeframe = categorize_article_by_date(pub_date)
-        # Count articles by actual timeframe
-        if actual_timeframe == "recent":
-            recent_found += 1
-        elif actual_timeframe == "historical":
-            historical_found += 1

@@ -292,15 +395,15 @@ Return a JSON object with the following structure:

@@ -312,39 +415,64 @@ Text:
-# === Gradio interface functions ===
-        return "No name provided", ""
-        return "No search results available. Please search first."
-        return "No company name provided. Please search first."

@@ -354,50 +482,103 @@ def extract_only(stored_results: str, company_name: str):
-    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
-    gr.Markdown("*π **New**:
-    gr.Markdown("*⏱️ Note: Enhanced search
-        max_lines=25,
-        show_copy_button=True
-    )
-        outputs=[output1, search_state]
-        outputs=[output2]
-    demo.launch(
-'''

@@ -405,6 +586,9 @@ from duckduckgo_search import DDGS

@@ -413,6 +597,50 @@ client = anthropic.Anthropic(

@@ -427,6 +655,9 @@ def get_full_article(url):

@@ -444,89 +675,185 @@ def get_full_article(url):
-                return full_text[:10000]
-        return body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"
-        return "[WARNING] Article fetch timeout - using snippet instead"
-        return "[ERROR] Could not fetch article: Network error"
-        return f"[ERROR] Could not fetch article: {str(e)}"
-        for i, result in enumerate(results, 1):
-            url = result.get('href', 'No URL')
-            title = result.get('title', 'No Title')
-            snippet = result.get('body', 'No snippet available')
-                print(f"Using snippet fallback for article {i}")
-                content = f"[SNIPPET ONLY]\n{snippet}"
-            else:
-                content = full_text
-        return f"[ERROR] Search failed after {max_retries} attempts. Last error: {str(e)}"
-    MAX_CHARS =

@@ -535,18 +862,29 @@ def extract_entities(search_results: str, company_name: str) -> str:
-            max_tokens=
-            temperature=0.

@@ -570,7 +908,7 @@ def search_only(name: str, article_count: int):
-        results = f"✅ Search completed for **{name}** in {elapsed:.1f}s\n\n"

@@ -588,31 +926,39 @@ def extract_only(stored_results: str, company_name: str):
-with gr.Blocks(title="Founder Finder") as demo:
-    gr.Markdown("# π Founder Finder")
-    gr.Markdown("Enter a business or project name to search for its founder
-        article_count_slider = gr.Slider(
-        search_btn = gr.Button("π Search
-    output1 = gr.Markdown(label="Search Results")

@@ -630,5 +976,5 @@ with gr.Blocks(title="Founder Finder") as demo:

Current app.py (lines added by this commit are marked "+"; unmarked lines are unchanged context; "..." marks elided unchanged code):

...
from datetime import datetime, timedelta
from dateutil import parser
import json
+import threading
+from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
+import signal

# Initialize Anthropic client
client = anthropic.Anthropic(
+    api_key=os.getenv("ANTHROPIC_API_KEY")
)

+# Global variable to track cancellation
+cancel_operation = threading.Event()
+
+def reset_cancellation():
+    """Reset the cancellation flag"""
+    cancel_operation.clear()
+
+def check_cancellation():
+    """Check if operation should be cancelled"""
+    return cancel_operation.is_set()
+
+# === Enhanced Model functions with progress tracking ===

def extract_publication_date(soup, url):
+    """Extract publication date from article HTML - same as before"""
    try:
        date_selectors = [
            'time[datetime]',
            '.date', '.publish-date', '.published', '.post-date',
...
            except:
                continue

        date_patterns = [
+            r'(\w+ \d{1,2}, \d{4})',
+            r'(\d{1,2}/\d{1,2}/\d{4})',
+            r'(\d{4}-\d{2}-\d{2})'
        ]

+        text = soup.get_text()[:2000]
        for pattern in date_patterns:
            matches = re.findall(pattern, text)
            if matches:
...
    return None

+def get_full_article_with_timeout(url, timeout=15):
+    """Enhanced article fetching with timeout and better error handling"""
+    if check_cancellation():
+        return "[CANCELLED] Operation was cancelled", None
+
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
...
            'Upgrade-Insecure-Requests': '1'
        }

+        response = requests.get(url, headers=headers, timeout=timeout, verify=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        pub_date = extract_publication_date(soup, url)

        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
...
    except requests.exceptions.Timeout:
        return "[WARNING] Article fetch timeout - using snippet instead", None
+    except requests.exceptions.RequestException as e:
+        return f"[ERROR] Network error: {str(e)}", None
    except Exception as e:
        return f"[ERROR] Could not fetch article: {str(e)}", None

+def search_articles_by_timeframe_enhanced(name: str, timeframe: str, max_articles: int = 3, progress=None) -> list:
+    """Enhanced search with progress tracking and better error handling"""
+
+    if check_cancellation():
+        return []

    if timeframe == "recent":
        search_queries = [
            f'"{name}" founder news 2024 2025',
            f'"{name}" CEO founder recent',
            f'"{name}" founder update latest'
        ]
+    else:
        search_queries = [
            f'"{name}" founded established history',
            f'"{name}" founder origin story',
...

    all_results = []
    max_retries = 2
+    base_delay = 2  # Reduced delay

+    total_queries = len(search_queries)
+
    for query_idx, search_query in enumerate(search_queries):
+        if len(all_results) >= max_articles or check_cancellation():
            break

+        if progress:
+            query_progress = (query_idx / total_queries) * 0.3  # 30% of progress for queries
+            progress(query_progress, desc=f"Searching {timeframe} articles ({query_idx + 1}/{total_queries})")
+
        for attempt in range(max_retries):
+            if check_cancellation():
+                return all_results
+
            try:
                print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
+
+                if attempt > 0:
+                    time.sleep(base_delay * attempt)

                configs = [
+                    {'timeout': 15, 'region': 'us-en', 'safesearch': 'moderate'},
+                    {'timeout': 20, 'region': 'wt-wt', 'safesearch': 'off'}
                ]

                config = configs[min(attempt, len(configs)-1)]
...
                with DDGS(timeout=config['timeout']) as ddgs:
                    search_params = {
                        'keywords': search_query,
+                        'max_results': max_articles - len(all_results) + 2,
                        'safesearch': config['safesearch']
                    }
                    if config['region']:
                        search_params['region'] = config['region']

                    results = list(ddgs.text(**search_params))

                    if results:
                        existing_urls = {r.get('url', '') for r in all_results}
                        for result in results:
                            if len(all_results) >= max_articles:
...
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
                if attempt < max_retries - 1:
+                    time.sleep(base_delay * (attempt + 1))

    return all_results[:max_articles]

def categorize_article_by_date(pub_date):
+    """Same as before"""
    if not pub_date:
        return "unknown"

    one_year_ago = datetime.now() - timedelta(days=365)
+    return "recent" if pub_date >= one_year_ago else "historical"
+
+def fetch_article_parallel(result, article_num, total_articles, progress=None):
+    """Fetch single article with progress update"""
+    if check_cancellation():
+        return None
+
+    url = result.get('href', 'No URL')
+    title = result.get('title', 'No Title')
+    snippet = result.get('body', 'No snippet available')
+    expected_timeframe = result.get('expected_timeframe', 'unknown')
+
+    if progress:
+        fetch_progress = 0.4 + (article_num / total_articles) * 0.5  # 40-90% of total progress
+        progress(fetch_progress, desc=f"Fetching article {article_num + 1}/{total_articles}: {title[:50]}...")
+
+    full_text, pub_date = get_full_article_with_timeout(url, timeout=12)

+    if check_cancellation():
+        return None
+
+    actual_timeframe = categorize_article_by_date(pub_date)
+
+    if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error", "[CANCELLED]"]):
+        content = f"[SNIPPET ONLY]\n{snippet}"
    else:
+        content = full_text

+    timeframe_indicator = ""
+    if pub_date:
+        date_str = pub_date.strftime("%B %d, %Y")
+        timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
+    else:
+        timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"
+
+    article = f"### {article_num + 1}. {title}\n"
+    article += f"[Source]({url})\n"
+    article += f"{timeframe_indicator}\n\n"
+    article += f"{content}\n"
+
+    return {
+        'article': article,
+        'timeframe': actual_timeframe,
+        'url': url,
+        'title': title
+    }
+
+def search_articles_enhanced(name: str, max_articles: int = 4, progress=None) -> str:
+    """Enhanced search with progress tracking and parallel processing"""
+
+    reset_cancellation()  # Reset cancellation flag
+
+    if progress:
+        progress(0, desc="Initializing enhanced search...")

    recent_count = max_articles // 2
    historical_count = max_articles - recent_count

+    if progress:
+        progress(0.05, desc=f"Planning search: {recent_count} recent + {historical_count} historical articles")

    # Search for recent articles
+    if progress:
+        progress(0.1, desc="Searching for recent articles...")

+    recent_results = search_articles_by_timeframe_enhanced(name, "recent", recent_count, progress)

+    if check_cancellation():
+        return "[CANCELLED] Search was cancelled by user"

+    if progress:
+        progress(0.3, desc="Searching for historical articles...")
+
+    # Brief pause between searches
+    time.sleep(1)
+
+    historical_results = search_articles_by_timeframe_enhanced(name, "historical", historical_count, progress)
+
+    if check_cancellation():
+        return "[CANCELLED] Search was cancelled by user"
+
+    # Combine results
+    all_results = []
    for result in recent_results:
        result['expected_timeframe'] = 'recent'
        all_results.append(result)

    for result in historical_results:
        result['expected_timeframe'] = 'historical'
        all_results.append(result)

    if not all_results:
+        if progress:
+            progress(1.0, desc="Search completed - no results found")
        return f"[INFO] No articles found for {name}"

+    if progress:
+        progress(0.4, desc=f"Found {len(all_results)} articles, now fetching content...")
+
+    # Fetch articles with parallel processing (but limited concurrency)
    articles = []
    recent_found = 0
    historical_found = 0

+    # Use ThreadPoolExecutor for controlled parallel fetching
+    max_workers = min(3, len(all_results))  # Limit concurrent requests
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all tasks
+        future_to_result = {
+            executor.submit(fetch_article_parallel, result, i, len(all_results), progress): (result, i)
+            for i, result in enumerate(all_results)
+        }

+        # Collect results as they complete
+        for future in as_completed(future_to_result, timeout=60):  # 60 second timeout
+            if check_cancellation():
+                # Cancel remaining futures
+                for f in future_to_result:
+                    f.cancel()
+                return "[CANCELLED] Search was cancelled by user"
+
+            try:
+                result_data = future.result(timeout=15)
+                if result_data:
+                    articles.append(result_data)
+
+                    # Count by timeframe
+                    if result_data['timeframe'] == "recent":
+                        recent_found += 1
+                    elif result_data['timeframe'] == "historical":
+                        historical_found += 1
+
+            except TimeoutError:
+                print("Article fetch timed out")
+                continue
+            except Exception as e:
+                print(f"Error fetching article: {e}")
+                continue
+
+    if check_cancellation():
+        return "[CANCELLED] Search was cancelled by user"
+
+    if progress:
+        progress(0.95, desc="Formatting results...")
+
+    # Sort articles to maintain order
+    articles.sort(key=lambda x: all_results.index(next(r for r in all_results if r.get('href', '') == x['url'])))
+
+    # Create final output
    summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"

+    article_texts = [article_data['article'] for article_data in articles]
+
+    if progress:
+        progress(1.0, desc=f"Search completed! Found {len(articles)} articles")
+
+    return summary + "\n---\n".join(article_texts)

+def extract_entities_enhanced(search_results: str, company_name: str, progress=None) -> str:
+    """Enhanced entity extraction with progress tracking"""
+
+    if progress:
+        progress(0, desc="Preparing text for analysis...")
+
    MAX_CHARS = 15000
    if len(search_results) > MAX_CHARS:
        trunc = search_results[:MAX_CHARS]
        last_period = trunc.rfind('. ')
        search_results = trunc[:last_period + 1] if last_period > 3000 else trunc

+    if progress:
+        progress(0.2, desc="Analyzing articles with AI...")
+
    prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
Only include founders who are explicitly mentioned as founders of {company_name}.
Ignore founders of other companies that may be mentioned in the text.
Return a JSON object with the following structure:
{{
  "founders": [
...
    }}
  ]
}}
Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
You have to examine every article available in the search results below.
Text:
{search_results}"""

    try:
+        if progress:
+            progress(0.5, desc="Sending request to AI model...")
+
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1500,
...
                }
            ]
        )
+
+        if progress:
+            progress(0.9, desc="Processing AI response...")
+
+        result = message.content[0].text
+
+        if progress:
+            progress(1.0, desc="Analysis completed!")
+
+        return result

    except Exception as e:
+        if progress:
+            progress(1.0, desc="Analysis failed")
        return f"[ERROR] Extraction failed: {str(e)}"

+# === Enhanced Gradio interface functions ===

+def search_only_enhanced(name: str, article_count: int, progress=gr.Progress()):
+    """Enhanced search with progress tracking"""
    if not name.strip():
+        return "❌ No name provided", ""

    try:
        start = time.time()
+        progress(0, desc="Starting enhanced temporal search...")
+
+        articles_output = search_articles_enhanced(name.strip(), max_articles=article_count, progress=progress)
+
+        if "[CANCELLED]" in articles_output:
+            return articles_output, ""
+
        elapsed = time.time() - start
+        progress(1.0, desc=f"Search completed in {elapsed:.1f}s")

        results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
        results += articles_output

        return results, articles_output
+
    except Exception as e:
+        progress(1.0, desc="Search failed")
+        return f"❌ **Search failed**: {str(e)}", ""

+def extract_only_enhanced(stored_results: str, company_name: str, progress=gr.Progress()):
+    """Enhanced extraction with progress tracking"""
    if not stored_results.strip():
+        return "❌ No search results available. Please search first."

    if not company_name.strip():
+        return "❌ No company name provided. Please search first."
+
+    if "[CANCELLED]" in stored_results:
+        return "❌ Cannot extract from cancelled search results. Please search again."

    try:
        start = time.time()
+        entities = extract_entities_enhanced(stored_results, company_name.strip(), progress)
        elapsed = time.time() - start

        # Try to format JSON for better readability
...
            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
        except:
            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
+
    except Exception as e:
+        progress(1.0, desc="Extraction failed")
+        return f"❌ **Extraction failed**: {str(e)}"

+def cancel_search():
+    """Cancel the current search operation"""
+    cancel_operation.set()
+    return "π **Cancellation requested** - stopping current operation..."

+# === Enhanced Gradio UI ===
+
+with gr.Blocks(title="Enhanced Founder Finder", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π Enhanced Founder Finder")
+    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy** with **real-time progress tracking**.")
+    gr.Markdown("*π **New Features**: Progress bars, cancellation support, parallel processing, better error handling*")
+    gr.Markdown("*⏱️ Note: Enhanced search typically takes 30–60 seconds with full progress visibility.*")

    search_state = gr.State("")

    with gr.Row():
+        with gr.Column(scale=2):
+            name_input = gr.Textbox(
+                label="Company Name",
+                placeholder="Enter business name (e.g., 'Tesla', 'SpaceX', 'Microsoft')",
+                lines=1
+            )
+        with gr.Column(scale=1):
+            article_count_slider = gr.Slider(
+                2, 12,
+                value=4,
+                step=2,
+                label="Total Articles",
+                info="Split between recent/historical"
+            )

+    with gr.Row():
+        search_btn = gr.Button("π Enhanced Temporal Search", variant="primary", size="lg")
+        cancel_btn = gr.Button("π Cancel Search", variant="secondary", size="lg")
+        extract_btn = gr.Button("π Extract Founder Intelligence", variant="secondary", size="lg")

+    with gr.Row():
+        status_output = gr.Markdown("Ready to search...")
+
+    with gr.Row():
+        with gr.Column():
+            output1 = gr.Markdown(label="Search Results with Temporal Analysis", height=400)
+        with gr.Column():
+            output2 = gr.Textbox(
+                label="Founder Intelligence Report",
+                lines=15,
+                max_lines=25,
+                show_copy_button=True
+            )
+
+    # Event handlers
+    search_click = search_btn.click(
+        fn=search_only_enhanced,
        inputs=[name_input, article_count_slider],
+        outputs=[output1, search_state],
+        show_progress=True
+    )
+
+    cancel_btn.click(
+        fn=cancel_search,
+        outputs=[status_output]
    )

    extract_btn.click(
+        fn=extract_only_enhanced,
        inputs=[search_state, name_input],
+        outputs=[output2],
+        show_progress=True
+    )
+
+    # Add some example companies
+    gr.Examples(
+        examples=[
+            ["Tesla", 4],
+            ["SpaceX", 6],
+            ["Microsoft", 4],
+            ["Apple", 6],
+            ["OpenAI", 4]
+        ],
+        inputs=[name_input, article_count_slider],
    )

if __name__ == "__main__":
+    demo.launch(
+        share=False,
+        show_error=True,
+        show_tips=True,
+        height=800
+    )

+'''
+import gradio as gr
import requests
import time
import re
...
from bs4 import BeautifulSoup
import anthropic
import os
+from datetime import datetime, timedelta
+from dateutil import parser
+import json

# Initialize Anthropic client
client = anthropic.Anthropic(
...
# === Model functions ===

+def extract_publication_date(soup, url):
+    """Extract publication date from article HTML"""
+    try:
+        # Common date selectors
+        date_selectors = [
+            'time[datetime]',
+            '.date', '.publish-date', '.published', '.post-date',
+            '[class*="date"]', '[class*="time"]',
+            'meta[property="article:published_time"]',
+            'meta[name="publishdate"]',
+            'meta[name="date"]'
+        ]
+
+        for selector in date_selectors:
+            element = soup.select_one(selector)
+            if element:
+                date_text = element.get('datetime') or element.get('content') or element.get_text()
+                if date_text:
+                    try:
+                        return parser.parse(date_text)
+                    except:
+                        continue
+
+        # Look for date patterns in text
+        date_patterns = [
+            r'(\w+ \d{1,2}, \d{4})',      # January 15, 2023
+            r'(\d{1,2}/\d{1,2}/\d{4})',   # 01/15/2023
+            r'(\d{4}-\d{2}-\d{2})'        # 2023-01-15
+        ]
+
+        text = soup.get_text()[:2000]  # First 2000 chars
+        for pattern in date_patterns:
+            matches = re.findall(pattern, text)
+            if matches:
+                try:
+                    return parser.parse(matches[0])
+                except:
+                    continue
+
+    except Exception as e:
+        print(f"Date extraction error for {url}: {e}")
+
+    return None
+
def get_full_article(url):
    try:
        headers = {
...
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

+        # Extract publication date
+        pub_date = extract_publication_date(soup, url)
+
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
            element.decompose()

...
            text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
            full_text = '\n\n'.join(text_parts)
            if len(full_text) > 300:
+                return full_text[:10000], pub_date

        body_text = soup.get_text(separator='\n\n', strip=True)
        body_text = re.sub(r'\n{3,}', '\n\n', body_text)
+        return (body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"), pub_date

    except requests.exceptions.Timeout:
+        return "[WARNING] Article fetch timeout - using snippet instead", None
    except requests.exceptions.RequestException:
+        return "[ERROR] Could not fetch article: Network error", None
    except Exception as e:
+        return f"[ERROR] Could not fetch article: {str(e)}", None

+def search_articles_by_timeframe(name: str, timeframe: str, max_articles: int = 3) -> list:
+    """Search for articles in specific timeframe"""
+
+    # Define search queries based on timeframe
+    if timeframe == "recent":
+        # Recent articles (news, updates, current events)
+        search_queries = [
+            f'"{name}" founder news 2024 2025',
+            f'"{name}" CEO founder recent',
+            f'"{name}" founder update latest'
+        ]
+    else:  # historical
+        # Historical articles (founding, establishment, origin stories)
+        search_queries = [
+            f'"{name}" founded established history',
+            f'"{name}" founder origin story',
+            f'"{name}" started began founder',
+            f'"{name}" founder early days'
+        ]

+    all_results = []
+    max_retries = 2
    base_delay = 3

+    for query_idx, search_query in enumerate(search_queries):
+        if len(all_results) >= max_articles:
+            break
+
+        for attempt in range(max_retries):
+            try:
+                print(f"Search attempt {attempt + 1} for query {query_idx + 1} ({timeframe}): {search_query}")
+                time.sleep(base_delay * (attempt + 1))

+                configs = [
+                    {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
+                    {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'}
+                ]

+                config = configs[min(attempt, len(configs)-1)]

+                with DDGS(timeout=config['timeout']) as ddgs:
+                    search_params = {
+                        'keywords': search_query,
+                        'max_results': max_articles - len(all_results) + 2,  # Get a few extra to filter
+                        'safesearch': config['safesearch']
+                    }
+                    if config['region']:
+                        search_params['region'] = config['region']

+                    results = list(ddgs.text(**search_params))
+                    print(f"Found {len(results)} results for query {query_idx + 1}")

+                    if results:
+                        # Add unique results (avoid duplicates)
+                        existing_urls = {r.get('url', '') for r in all_results}
+                        for result in results:
+                            if len(all_results) >= max_articles:
+                                break
+                            url = result.get('href', '')
+                            if url and url not in existing_urls:
+                                all_results.append(result)
+                                existing_urls.add(url)
+                        break
+
+            except Exception as e:
+                print(f"Attempt {attempt + 1} failed for {timeframe} query {query_idx + 1}: {str(e)}")
+                if attempt < max_retries - 1:
+                    time.sleep(base_delay * (attempt + 2))

+    return all_results[:max_articles]

+def categorize_article_by_date(pub_date):
+    """Categorize article as recent or historical based on publication date"""
+    if not pub_date:
+        return "unknown"
+
+    one_year_ago = datetime.now() - timedelta(days=365)
+
+    if pub_date >= one_year_ago:
+        return "recent"
+    else:
+        return "historical"
+
+def search_articles(name: str, max_articles: int = 4) -> str:
+    """Enhanced search that ensures both recent and historical articles"""
+
+    # Split articles between recent and historical
+    recent_count = max_articles // 2
+    historical_count = max_articles - recent_count
+
+    print(f"Searching for {recent_count} recent and {historical_count} historical articles about {name}")
+
+    # Search for recent articles
+    recent_results = search_articles_by_timeframe(name, "recent", recent_count)
+    time.sleep(2)  # Brief pause between timeframe searches
+
+    # Search for historical articles
+    historical_results = search_articles_by_timeframe(name, "historical", historical_count)
+
+    # Combine and process all results
+    all_results = []
+
+    # Process recent articles
+    for result in recent_results:
+        result['expected_timeframe'] = 'recent'
+        all_results.append(result)
+
+    # Process historical articles
+    for result in historical_results:
+        result['expected_timeframe'] = 'historical'
+        all_results.append(result)
+
+    if not all_results:
+        return f"[INFO] No articles found for {name}"
+
+    # Fetch and categorize articles
+    articles = []
+    recent_found = 0
+    historical_found = 0
+
+    for i, result in enumerate(all_results, 1):
+        url = result.get('href', 'No URL')
+        title = result.get('title', 'No Title')
+        snippet = result.get('body', 'No snippet available')
+        expected_timeframe = result.get('expected_timeframe', 'unknown')

+        if i > 1:
+            time.sleep(2)

+        full_text, pub_date = get_full_article(url)
+        actual_timeframe = categorize_article_by_date(pub_date)
+
+        # Count articles by actual timeframe
+        if actual_timeframe == "recent":
+            recent_found += 1
+        elif actual_timeframe == "historical":
+            historical_found += 1
+
+        if any(error in str(full_text) for error in ["[ERROR]", "timeout", "Network error"]):
+            print(f"Using snippet fallback for article {i}")
+            content = f"[SNIPPET ONLY]\n{snippet}"
+        else:
+            content = full_text

+        # Create timeframe indicator
+        timeframe_indicator = ""
+        if pub_date:
+            date_str = pub_date.strftime("%B %d, %Y")
+            timeframe_indicator = f"📅 **Published**: {date_str} ({actual_timeframe.title()})"
+        else:
+            timeframe_indicator = f"📅 **Timeframe**: {expected_timeframe.title()} (estimated)"

+        article = f"### {i}. {title}\n"
+        article += f"[Source]({url})\n"
+        article += f"{timeframe_indicator}\n\n"
+        article += f"{content}\n"
+        articles.append(article)

+    # Add summary of coverage
+    summary = f"**Search Summary**: Found {len(articles)} articles total - {recent_found} recent, {historical_found} historical, {len(articles) - recent_found - historical_found} unknown timeframe\n\n"
+
+    return summary + "\n---\n".join(articles)

def extract_entities(search_results: str, company_name: str) -> str:
    """Extract entities using Claude 4"""
+    MAX_CHARS = 15000
    if len(search_results) > MAX_CHARS:
        trunc = search_results[:MAX_CHARS]
        last_period = trunc.rfind('. ')
...
    prompt = f"""Extract all named entities that are described as founders of "{company_name}" specifically from the following text.
Only include founders who are explicitly mentioned as founders of {company_name}.
Ignore founders of other companies that may be mentioned in the text.
+
+Return a JSON object with the following structure:
+{{
+  "founders": [
+    {{
+      "name": "Founder Name",
+      "evidence": ["brief quote or context where they were mentioned as founder"]
+    }}
+  ]
+}}
+
+Respond only with valid JSON. Do not include any explanations, comments, or additional formatting.
+
+You have to examine every article available in the search results below.
+
Text:
{search_results}"""

    try:
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
+            max_tokens=1500,
+            temperature=0.1,
            messages=[
                {
                    "role": "user",
...
        articles_output = search_articles(name.strip(), max_articles=article_count)
        elapsed = time.time() - start

+        results = f"✅ **Enhanced Temporal Search** completed for **{name}** in {elapsed:.1f}s\n\n"
        results += articles_output

        return results, articles_output
...
        start = time.time()
        entities = extract_entities(stored_results, company_name.strip())
        elapsed = time.time() - start
+
+        # Try to format JSON for better readability
+        try:
+            parsed = json.loads(entities)
+            formatted = json.dumps(parsed, indent=2)
+            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n```json\n{formatted}\n```"
+        except:
+            return f"✅ **Enhanced Extraction** completed in {elapsed:.1f}s\n\n{entities}"
    except Exception as e:
        return f"[ERROR] Extraction failed: {str(e)}"

# === Gradio UI ===

+with gr.Blocks(title="Enhanced Founder Finder") as demo:
+    gr.Markdown("# π Enhanced Founder Finder")
+    gr.Markdown("Enter a business or project name to search for its founder using **temporal search strategy**.")
+    gr.Markdown("*π **New**: Automatically searches for both recent news AND historical founding information*")
+    gr.Markdown("*⏱️ Note: Enhanced search may take 60–90 seconds for comprehensive results.*")

    search_state = gr.State("")

    with gr.Row():
        name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
+        article_count_slider = gr.Slider(2, 12, value=4, step=2, label="Total Articles (split between recent/historical)")
    with gr.Column():
+        search_btn = gr.Button("π Enhanced Temporal Search", variant="primary")
+        extract_btn = gr.Button("π Extract Founder Intelligence", variant="secondary")

+    output1 = gr.Markdown(label="Search Results with Temporal Analysis")
    output2 = gr.Textbox(
+        label="Founder Intelligence Report",
+        lines=15,
+        max_lines=25,
        show_copy_button=True
    )
...
if __name__ == "__main__":
    demo.launch()
+
+'''