import gradio as gr
import requests
import time
import re
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
import anthropic
import os
# Initialize Anthropic client
client = anthropic.Anthropic(
api_key=os.getenv("ANTHROPIC_API_KEY") # Set as secret in HF Space settings
)
# === Model functions ===
def get_full_article(url):
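    """Fetch `url` and return up to 10,000 characters of readable article text,
    or a bracketed status string ([INFO]/[WARNING]/[ERROR]) on failure."""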
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
response = requests.get(url, headers=headers, timeout=20, verify=True)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
        # Remove elements that never carry article text.
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'noscript', 'form']):
element.decompose()
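        # Likely containers for the main article body, most specific first.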
article_selectors = [
'article', '.article-content', '.post-content', '.story-body', '.story-content',
'.entry-content', '.content-body', '.article-body', 'main article', 'main .content', 'main',
'[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
]
for selector in article_selectors:
content = soup.select_one(selector)
if content:
                # find_all(..., string=True) only matches tags with a single text
                # child, which skips paragraphs containing links or other inline
                # markup; match all <p>/<div> tags instead, dropping wrapper
                # elements whose text would otherwise be duplicated.
                paragraphs = [
                    tag for tag in content.find_all(['p', 'div'])
                    if not tag.find(['p', 'div'])
                ]
if paragraphs:
text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
full_text = '\n\n'.join(text_parts)
if len(full_text) > 300:
return full_text[:10000]
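        # Fallback: no selector produced enough text, so use the whole page body.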
body_text = soup.get_text(separator='\n\n', strip=True)
body_text = re.sub(r'\n{3,}', '\n\n', body_text)
return body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"
except requests.exceptions.Timeout:
return "[WARNING] Article fetch timeout - using snippet instead"
except requests.exceptions.RequestException:
return "[ERROR] Could not fetch article: Network error"
except Exception as e:
return f"[ERROR] Could not fetch article: {str(e)}"
def search_articles(name: str, max_articles: int = 2) -> str:
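    """Search DuckDuckGo for articles about `name` and return them as one Markdown string,
    falling back to result snippets when a full-page fetch fails."""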
    # DuckDuckGo treats spaces as an implicit AND, and its "site:" operator
    # requires a full domain, so plain keywords bias results toward news coverage.
    keywords = ['founder', 'news']
    search_query = f'"{name}" {" ".join(keywords)}'
max_retries = 3
base_delay = 3
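    # Retry with progressively longer delays, timeouts, and looser search settings.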
for attempt in range(max_retries):
try:
print(f"Search attempt {attempt + 1}: {search_query}")
time.sleep(base_delay * (attempt + 1))
configs = [
{'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
{'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'},
{'timeout': 30, 'region': None, 'safesearch': 'moderate'}
]
config = configs[min(attempt, len(configs)-1)]
with DDGS(timeout=config['timeout']) as ddgs:
search_params = {
'keywords': search_query,
'max_results': max_articles,
'safesearch': config['safesearch']
}
if config['region']:
search_params['region'] = config['region']
results = list(ddgs.text(**search_params))
print(f"Found {len(results)} results on attempt {attempt + 1}")
if not results:
continue
articles = []
for i, result in enumerate(results, 1):
url = result.get('href', 'No URL')
title = result.get('title', 'No Title')
snippet = result.get('body', 'No snippet available')
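                # Brief pause between article fetches to avoid hammering hosts.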
if i > 1:
time.sleep(2)
full_text = get_full_article(url)
                # Fall back to the search snippet whenever full-page extraction
                # failed or returned no substantial content.
                if any(marker in full_text for marker in ("[ERROR]", "[WARNING]", "[INFO]")):
print(f"Using snippet fallback for article {i}")
content = f"[SNIPPET ONLY]\n{snippet}"
else:
content = full_text
article = f"### {i}. {title}\n"
article += f"[Source]({url})\n\n"
article += f"{content}\n"
articles.append(article)
return "\n---\n".join(articles)
except Exception as e:
print(f"Attempt {attempt + 1} failed: {str(e)}")
if attempt < max_retries - 1:
time.sleep(base_delay * (attempt + 2))
else:
return f"[ERROR] Search failed after {max_retries} attempts. Last error: {str(e)}"
return f"[INFO] No articles found for {name}"
def extract_entities(search_results: str) -> str:
"""Extract entities using Claude 4"""
MAX_CHARS = 8000
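    # Truncate oversized input at a sentence boundary to keep the prompt compact.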
if len(search_results) > MAX_CHARS:
trunc = search_results[:MAX_CHARS]
last_period = trunc.rfind('. ')
search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
prompt = f"""Extract all named entities that are described as founders of the searched business from the following text.
Return a JSON object with the following two keys:
- "people": a list of names of people mentioned
- "organizations": a list of organization names mentioned
Respond with valid JSON only, and include no explanations, comments, or extra formatting. Double-check that every founder is included and that each listed founder actually founded the business described in the search results.
Text:
{search_results}"""
try:
message = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=1000,
temperature=0.15,
messages=[
{
"role": "user",
"content": prompt
}
]
)
return message.content[0].text
except Exception as e:
return f"[ERROR] Extraction failed: {str(e)}"
# === Gradio interface functions ===
def search_only(name: str, article_count: int):
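    """Run the article search; returns (display Markdown, raw text for state)."""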
if not name.strip():
return "No name provided", ""
try:
start = time.time()
articles_output = search_articles(name.strip(), max_articles=article_count)
elapsed = time.time() - start
results = f"βœ… Search completed for **{name}** in {elapsed:.1f}s\n\n"
results += articles_output
return results, articles_output
except Exception as e:
return f"[ERROR] Search failed: {str(e)}", ""
def extract_only(stored_results: str):
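    """Run entity extraction over the search results stored in state."""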
if not stored_results.strip():
return "No search results available. Please search first."
try:
start = time.time()
entities = extract_entities(stored_results)
elapsed = time.time() - start
return f"βœ… Extraction completed in {elapsed:.1f}s\n\n{entities}"
except Exception as e:
return f"[ERROR] Extraction failed: {str(e)}"
# === Gradio UI ===
with gr.Blocks(title="Founder Finder") as demo:
gr.Markdown("# πŸ”Ž Founder Finder")
gr.Markdown("Enter a business or project name to search for its founder.")
gr.Markdown("*Note: Full article extraction may take 30–60 seconds.")
search_state = gr.State("")
with gr.Row():
name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
article_count_slider = gr.Slider(1, 10, value=2, step=1, label="Number of Articles")
with gr.Column():
        search_btn = gr.Button("🔍 Search Articles", variant="primary")
        extract_btn = gr.Button("📋 Extract Entities", variant="secondary")
output1 = gr.Markdown(label="Search Results")
output2 = gr.Textbox(
label="Extracted Entities and Relationships",
lines=10,
max_lines=20,
show_copy_button=True
)
search_btn.click(
fn=search_only,
inputs=[name_input, article_count_slider],
outputs=[output1, search_state]
)
extract_btn.click(
fn=extract_only,
inputs=[search_state],
outputs=[output2]
)
if __name__ == "__main__":
demo.launch()