import gradio as gr
import requests
import time
import re
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
import anthropic
import os
# Initialize Anthropic client
client = anthropic.Anthropic(
api_key=os.getenv("ANTHROPIC_API_KEY") # Set as secret in HF Space settings
)
# === Model functions ===
def get_full_article(url):
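    """Fetch `url` and return up to 10,000 characters of readable article text,
    or a bracketed status string ([INFO]/[WARNING]/[ERROR]) on failure."""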
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
response = requests.get(url, headers=headers, timeout=20, verify=True)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
        # Remove elements that never carry article text.
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'noscript', 'form']):
element.decompose()
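        # Likely containers for the main article body, most specific first.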
article_selectors = [
'article', '.article-content', '.post-content', '.story-body', '.story-content',
'.entry-content', '.content-body', '.article-body', 'main article', 'main .content', 'main',
'[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
]
for selector in article_selectors:
content = soup.select_one(selector)
if content:
                # find_all(..., string=True) only matches tags with a single text
                # child, which skips paragraphs containing links or other inline
                # markup; match all <p>/<div> tags instead, dropping wrapper
                # elements whose text would otherwise be duplicated.
                paragraphs = [
                    tag for tag in content.find_all(['p', 'div'])
                    if not tag.find(['p', 'div'])
                ]
if paragraphs:
text_parts = [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30]
full_text = '\n\n'.join(text_parts)
if len(full_text) > 300:
return full_text[:10000]
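        # Fallback: no selector produced enough text, so use the whole page body.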
body_text = soup.get_text(separator='\n\n', strip=True)
body_text = re.sub(r'\n{3,}', '\n\n', body_text)
return body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"
except requests.exceptions.Timeout:
return "[WARNING] Article fetch timeout - using snippet instead"
except requests.exceptions.RequestException:
return "[ERROR] Could not fetch article: Network error"
except Exception as e:
return f"[ERROR] Could not fetch article: {str(e)}"
def search_articles(name: str, max_articles: int = 2) -> str:
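    """Search DuckDuckGo for articles about `name` and return them as one Markdown string,
    falling back to result snippets when a full-page fetch fails."""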
    # DuckDuckGo treats spaces as an implicit AND, and its "site:" operator
    # requires a full domain, so plain keywords bias results toward news coverage.
    keywords = ['founder', 'news']
    search_query = f'"{name}" {" ".join(keywords)}'
max_retries = 3
base_delay = 3
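    # Retry with progressively longer delays, timeouts, and looser search settings.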
for attempt in range(max_retries):
try:
print(f"Search attempt {attempt + 1}: {search_query}")
time.sleep(base_delay * (attempt + 1))
configs = [
{'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
{'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'},
{'timeout': 30, 'region': None, 'safesearch': 'moderate'}
]
config = configs[min(attempt, len(configs)-1)]
with DDGS(timeout=config['timeout']) as ddgs:
search_params = {
'keywords': search_query,
'max_results': max_articles,
'safesearch': config['safesearch']
}
if config['region']:
search_params['region'] = config['region']
results = list(ddgs.text(**search_params))
print(f"Found {len(results)} results on attempt {attempt + 1}")
if not results:
continue
articles = []
for i, result in enumerate(results, 1):
url = result.get('href', 'No URL')
title = result.get('title', 'No Title')
snippet = result.get('body', 'No snippet available')
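                # Brief pause between article fetches to avoid hammering hosts.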
if i > 1:
time.sleep(2)
full_text = get_full_article(url)
                # Fall back to the search snippet whenever full-page extraction
                # failed or returned no substantial content.
                if any(marker in full_text for marker in ("[ERROR]", "[WARNING]", "[INFO]")):
print(f"Using snippet fallback for article {i}")
content = f"[SNIPPET ONLY]\n{snippet}"
else:
content = full_text
article = f"### {i}. {title}\n"
article += f"[Source]({url})\n\n"
article += f"{content}\n"
articles.append(article)
return "\n---\n".join(articles)
except Exception as e:
print(f"Attempt {attempt + 1} failed: {str(e)}")
if attempt < max_retries - 1:
time.sleep(base_delay * (attempt + 2))
else:
return f"[ERROR] Search failed after {max_retries} attempts. Last error: {str(e)}"
return f"[INFO] No articles found for {name}"
def extract_entities(search_results: str) -> str:
"""Extract entities using Claude 4"""
MAX_CHARS = 8000
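    # Truncate oversized input at a sentence boundary to keep the prompt compact.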
if len(search_results) > MAX_CHARS:
trunc = search_results[:MAX_CHARS]
last_period = trunc.rfind('. ')
search_results = trunc[:last_period + 1] if last_period > 3000 else trunc
prompt = f"""Extract all named entities that are described as founders of the searched business from the following text.
Return a JSON object with the following two keys:
- "people": a list of names of people mentioned
- "organizations": a list of organization names mentioned
Respond with valid JSON only, and include no explanations, comments, or extra formatting. Double-check that every founder is included and that each listed founder actually founded the business described in the search results.
Text:
{search_results}"""
try:
message = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=1000,
temperature=0.15,
messages=[
{
"role": "user",
"content": prompt
}
]
)
return message.content[0].text
except Exception as e:
return f"[ERROR] Extraction failed: {str(e)}"
# === Gradio interface functions ===
def search_only(name: str, article_count: int):
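    """Run the article search; returns (display Markdown, raw text for state)."""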
if not name.strip():
return "No name provided", ""
try:
start = time.time()
articles_output = search_articles(name.strip(), max_articles=article_count)
elapsed = time.time() - start
results = f"βœ… Search completed for **{name}** in {elapsed:.1f}s\n\n"
results += articles_output
return results, articles_output
except Exception as e:
return f"[ERROR] Search failed: {str(e)}", ""
def extract_only(stored_results: str):
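    """Run entity extraction over the search results stored in state."""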
if not stored_results.strip():
return "No search results available. Please search first."
try:
start = time.time()
entities = extract_entities(stored_results)
elapsed = time.time() - start
return f"βœ… Extraction completed in {elapsed:.1f}s\n\n{entities}"
except Exception as e:
return f"[ERROR] Extraction failed: {str(e)}"
# === Gradio UI ===
with gr.Blocks(title="Founder Finder") as demo:
gr.Markdown("# πŸ”Ž Founder Finder")
gr.Markdown("Enter a business or project name to search for its founder.")
gr.Markdown("*Note: Full article extraction may take 30–60 seconds.")
search_state = gr.State("")
with gr.Row():
name_input = gr.Textbox(label="Company Name", placeholder="Enter business name")
article_count_slider = gr.Slider(1, 10, value=2, step=1, label="Number of Articles")
with gr.Column():
        search_btn = gr.Button("🔍 Search Articles", variant="primary")
        extract_btn = gr.Button("📋 Extract Entities", variant="secondary")
output1 = gr.Markdown(label="Search Results")
output2 = gr.Textbox(
label="Extracted Entities and Relationships",
lines=10,
max_lines=20,
show_copy_button=True
)
search_btn.click(
fn=search_only,
inputs=[name_input, article_count_slider],
outputs=[output1, search_state]
)
extract_btn.click(
fn=extract_only,
inputs=[search_state],
outputs=[output2]
)
if __name__ == "__main__":
demo.launch()