dygoo committed on
Commit
aa62c4f
·
verified ·
1 Parent(s): 570841b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -33
app.py CHANGED
@@ -2,50 +2,181 @@ import gradio as gr
2
  import requests
3
  import time
4
  from duckduckgo_search import DDGS
 
5
 
6
  # === Model functions ===
7
 
8
def search_articles(name: str) -> str:
    """Search DuckDuckGo for news articles mentioning *name* alongside
    ownership/funding keywords.

    Returns a markdown-formatted string of up to 2 results (title, source
    URL, snippet), a "No articles found" message when the search is empty,
    or an "[ERROR] ..." string when the search itself fails.
    """
    keywords = ['founders', 'partners', 'funders', 'owners']
    # Quoted name must co-occur with the keywords; site:news biases results
    # toward news pages.
    search_query = f'"{name}" ({" AND ".join(keywords)}) site:news'
    try:
        print(f"Search query: {search_query}")
        with DDGS() as ddgs:
            results = list(ddgs.text(search_query, max_results=2))
        print(f"Raw results: {results}")
        if not results:
            return f"No articles found for {name}"

        articles = []
        for i, result in enumerate(results, 1):
            article = f"**{i}. {result.get('title', 'No Title')}**\n"
            article += f"Source: {result.get('href', 'No URL')}\n"
            # BUGFIX: the original referenced an undefined name `full_text`,
            # raising NameError on every successful search (masked by the
            # broad except below, which turned it into an "[ERROR]" string).
            # Use the DDG result snippet instead.
            article += f"{result.get('body', 'No snippet available')}\n"
            articles.append(article)

        return "\n\n".join(articles)
    except Exception as e:
        return f"[ERROR] Search failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
 
32
def extract_entities(search_results: str) -> str:
    """Extract person/organization names and their relationships from
    *search_results* by prompting a Mistral 7B model served on Modal.

    Returns the model's text response, or an "[ERROR] ..." string on HTTP
    or network failure.
    """
    # Public Modal FastAPI endpoint wrapping the LLM.
    modal_endpoint = "https://msoaresdiego--mistral-llm-endpoint-fastapi-app.modal.run/generate"
    # NOTE: the prompt text (including its missing spaces) is sent verbatim
    # to the model; do not reformat it casually.
    prompt = f"""Extract all person names and organization names from the following text.Do not extract products and service names. Only individuals and organizations.Bring the full details of the name in the newspaper article. For example, if only ACME is mentioned as company name, bring only ACME. IF ACME Inc is mentioned as company name, then you have to extract ACME Inc. In addition, define the relationship between the entity and the company that is being searched. For example, is ACME Inc an owner of the company being searched? Then write 'owner'. Is ACME Inc. a funder of the company being searched? Then write 'funder'
Format as:
PERSON: [name]
ORG: [organization name]
Text: {search_results}"""
    try:
        # Low temperature (0.20) keeps the extraction deterministic-ish.
        # NOTE(review): no request timeout is set — a stalled endpoint
        # blocks this call indefinitely.
        response = requests.post(
            modal_endpoint,
            json={"prompt": prompt, "max_tokens": 1000, "temperature": 0.20}
        )
        if response.status_code == 200:
            return response.json().get("response", "No entities extracted")
        else:
            return f"[ERROR] API Error: {response.status_code}"
    except Exception as e:
        return f"[ERROR] Extraction failed: {str(e)}"
51
 
@@ -62,7 +193,7 @@ def search_only(name: str):
62
  articles_output = search_articles(name.strip())
63
  search_time = time.time() - search_start
64
 
65
- search_results = f"Search completed for: {name} in {search_time:.2f}s\n\n"
66
  search_results += articles_output
67
 
68
  return search_results, articles_output # Return both display and raw results
@@ -81,7 +212,7 @@ def extract_only(stored_search_results: str):
81
  entities = extract_entities(stored_search_results)
82
  extract_time = time.time() - extract_start
83
 
84
- extraction_results = f"Entity extraction completed in {extract_time:.2f}s\n\n"
85
  extraction_results += entities
86
 
87
  return extraction_results
@@ -94,19 +225,30 @@ def extract_only(stored_search_results: str):
94
  with gr.Blocks(title="Related Entities Finder") as demo:
95
  gr.Markdown("# 🔎 Related Entities Finder")
96
  gr.Markdown("Enter a business or project name to search for related articles and extract key entities.")
 
97
 
98
  # State to store search results between operations
99
  search_state = gr.State("")
100
 
101
  with gr.Row():
102
- name_input = gr.Textbox(label="Name", placeholder="Enter business or project name")
103
  with gr.Column():
104
- search_btn = gr.Button("Search Articles", variant="primary")
105
- extract_btn = gr.Button("Extract Entities", variant="secondary")
106
 
107
  with gr.Column():
108
- output1 = gr.Textbox(label="Search Results", lines=50, max_lines=100)
109
- output2 = gr.Textbox(label="Extracted Entities and Relationships", lines=5, max_lines=10)
 
 
 
 
 
 
 
 
 
 
110
 
111
  # Search button click
112
  search_btn.click(
@@ -124,4 +266,4 @@ with gr.Blocks(title="Related Entities Finder") as demo:
124
 
125
 
126
if __name__ == "__main__":
    # Launch the Gradio UI with default settings (local-only, default port).
    demo.launch()
 
2
  import requests
3
  import time
4
  from duckduckgo_search import DDGS
5
+ from bs4 import BeautifulSoup
6
 
7
  # === Model functions ===
8
 
9
def get_full_article(url):
    """Fetch *url* and extract the main article text.

    Returns the extracted text capped at 10,000 characters on success, or a
    human-readable error string on failure. Callers (search_articles) detect
    failure by scanning the return value for the markers "Could not fetch",
    "timeout", or "Network error" — keep those phrases stable.
    """
    import re  # local import: only needed for the whitespace-cleanup fallback

    try:
        # Browser-like headers: many news sites reject obviously scripted requests.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

        # SECURITY FIX: the original passed verify=False, silently disabling
        # TLS certificate verification for every fetch; default verification
        # is restored (failures are caught below and fall back to snippets).
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Strip boilerplate/noise elements before extracting text.
        for element in soup(['script', 'style', 'nav', 'header', 'footer',
                             'aside', 'ads', 'noscript', 'form']):
            element.decompose()

        # Common article containers, most specific first.
        article_selectors = [
            'article',
            '.article-content', '.post-content', '.story-body', '.story-content',
            '.entry-content', '.content-body', '.article-body',
            'main article', 'main .content', 'main',
            '[role="main"]', '.main-content', '.page-content',
            '.text', '.article-text'
        ]

        for selector in article_selectors:
            content = soup.select_one(selector)
            if content:
                # BUGFIX: the original used find_all(['p', 'div'], string=True),
                # which only matches elements whose *sole* child is a text node —
                # any paragraph containing a link or inline markup was skipped,
                # so extraction almost always fell through to the noisy body
                # fallback. get_text() on each <p> captures nested text too.
                paragraphs = content.find_all('p')
                if paragraphs:
                    text_parts = []
                    for p in paragraphs:
                        text = p.get_text(strip=True)
                        if len(text) > 30:  # filter out short/empty paragraphs
                            text_parts.append(text)

                    full_text = '\n\n'.join(text_parts)
                    if len(full_text) > 300:  # only return substantial content
                        return full_text[:10000]  # limit to 10000 chars

        # Fallback: whole-page text with runs of blank lines collapsed.
        body_text = soup.get_text(separator='\n\n', strip=True)
        body_text = re.sub(r'\n{3,}', '\n\n', body_text)

        return body_text[:10000] if len(body_text) > 300 else "Could not extract substantial content"

    except requests.exceptions.Timeout:
        return "Article fetch timeout - using snippet instead"
    except requests.exceptions.RequestException:
        # Plain string (the original was an f-string with no placeholder).
        return "Could not fetch article: Network error"
    except Exception as e:
        return f"Could not fetch article: {str(e)}"
70
+
71
+
72
def search_articles(name: str) -> str:
    """Search DuckDuckGo (with retries) for news articles mentioning *name*
    alongside ownership/funding keywords, fetching full article text when
    possible.

    Makes up to 3 attempts with progressively longer delays and different
    region/safesearch configurations. For each result, tries
    get_full_article() and falls back to the search snippet if full-text
    extraction fails. Returns formatted articles separated by a "=" rule,
    a "No articles found" message, or an "[ERROR] ..." string.
    """
    keywords = ['founders', 'partners', 'funders', 'owners']
    search_query = f'"{name}" ({" OR ".join(keywords)}) site:news'

    max_retries = 3
    base_delay = 3

    for attempt in range(max_retries):
        try:
            print(f"Search attempt {attempt + 1}: {search_query}")

            # Progressive delay (3s, 6s, 9s) to avoid DDG rate limiting.
            delay = base_delay * (attempt + 1)
            time.sleep(delay)

            # Vary timeout/region/safesearch per attempt to dodge flaky blocks.
            configs = [
                {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
                {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'},
                {'timeout': 30, 'region': None, 'safesearch': 'moderate'}
            ]
            config = configs[min(attempt, len(configs) - 1)]

            with DDGS(timeout=config['timeout']) as ddgs:
                search_params = {
                    'keywords': search_query,
                    'max_results': 2,
                    'safesearch': config['safesearch']
                }
                if config['region']:
                    search_params['region'] = config['region']

                results = list(ddgs.text(**search_params))
                print(f"Found {len(results)} results on attempt {attempt + 1}")

                if not results:
                    if attempt < max_retries - 1:
                        print(f"No results found, retrying with different parameters...")
                        continue
                    return f"No articles found for {name} after {max_retries} attempts"

                articles = []
                for i, result in enumerate(results, 1):
                    url = result.get('href', 'No URL')
                    title = result.get('title', 'No Title')
                    snippet = result.get('body', 'No snippet available')

                    print(f"Processing article {i}: {title}")
                    print(f"URL: {url}")

                    # Be polite between article fetches.
                    if i > 1:
                        time.sleep(2)

                    full_text = get_full_article(url)

                    # get_full_article signals failure via these markers; fall
                    # back to the DDG snippet in that case.
                    if any(error in full_text for error in ["Could not fetch", "timeout", "Network error"]):
                        print(f"Using snippet fallback for article {i}")
                        content = f"[SNIPPET ONLY - Full article unavailable]\n{snippet}"
                    else:
                        content = full_text

                    article = f"**{i}. {title}**\n"
                    article += f"Source: {url}\n\n"
                    article += f"{content}\n"
                    articles.append(article)

                # BUGFIX: the original evaluated
                #   "\n" + "="*80 + "\n".join(articles)
                # (precedence: "\n".join binds first), so the "=" rule appeared
                # once at the top and articles were joined by a bare newline.
                # Parenthesize so the rule separates the articles as intended.
                return ("\n" + "=" * 80 + "\n").join(articles)

        except Exception as e:
            error_msg = f"Attempt {attempt + 1} failed: {str(e)}"
            print(error_msg)

            if attempt < max_retries - 1:
                wait_time = base_delay * (attempt + 2)
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
            else:
                return f"[ERROR] Search failed after {max_retries} attempts. Last error: {str(e)}"
155
 
156
 
157
def extract_entities(search_results: str) -> str:
    """Extract person/organization names and their relationships from
    *search_results* by prompting a Mistral 7B model served on Modal.

    Returns the model's text response, or an "[ERROR] ..." string on HTTP
    error, timeout, or network failure.
    """
    # Public Modal FastAPI endpoint wrapping the LLM.
    modal_endpoint = "https://msoaresdiego--mistral-llm-endpoint-fastapi-app.modal.run/generate"
    # NOTE: this prompt is sent verbatim to the model; keep wording stable.
    prompt = f"""Extract all person names and organization names from the following text. Do not extract products and service names. Only individuals and organizations. Bring the full details of the name in the newspaper article. For example, if only ACME is mentioned as company name, bring only ACME. IF ACME Inc is mentioned as company name, then you have to extract ACME Inc. In addition, define the relationship between the entity and the company that is being searched. For example, is ACME Inc an owner of the company being searched? Then write 'owner'. Is ACME Inc. a funder of the company being searched? Then write 'funder'

Format as:
PERSON: [name] - [relationship]
ORG: [organization name] - [relationship]

Text: {search_results}"""

    try:
        # Low temperature (0.15) keeps extraction near-deterministic;
        # timeout=30 bounds the wait on a stalled endpoint.
        response = requests.post(
            modal_endpoint,
            json={"prompt": prompt, "max_tokens": 1500, "temperature": 0.15},
            timeout=30
        )
        if response.status_code == 200:
            return response.json().get("response", "No entities extracted")
        else:
            # Include the body to aid debugging endpoint-side failures.
            return f"[ERROR] API Error: {response.status_code} - {response.text}"
    except requests.exceptions.Timeout:
        return "[ERROR] Entity extraction timeout - please try again"
    except Exception as e:
        return f"[ERROR] Extraction failed: {str(e)}"
182
 
 
193
  articles_output = search_articles(name.strip())
194
  search_time = time.time() - search_start
195
 
196
+ search_results = f"Search completed for: {name} in {search_time:.1f}s\n\n"
197
  search_results += articles_output
198
 
199
  return search_results, articles_output # Return both display and raw results
 
212
  entities = extract_entities(stored_search_results)
213
  extract_time = time.time() - extract_start
214
 
215
+ extraction_results = f"Entity extraction completed in {extract_time:.1f}s\n\n"
216
  extraction_results += entities
217
 
218
  return extraction_results
 
225
  with gr.Blocks(title="Related Entities Finder") as demo:
226
  gr.Markdown("# 🔎 Related Entities Finder")
227
  gr.Markdown("Enter a business or project name to search for related articles and extract key entities.")
228
+ gr.Markdown("*Note: Full article extraction may take 30-60 seconds. Snippets will be used as fallback if needed.*")
229
 
230
  # State to store search results between operations
231
  search_state = gr.State("")
232
 
233
  with gr.Row():
234
+ name_input = gr.Textbox(label="Company/Project Name", placeholder="Enter business or project name")
235
  with gr.Column():
236
+ search_btn = gr.Button("🔍 Search Articles", variant="primary", size="lg")
237
+ extract_btn = gr.Button("📋 Extract Entities", variant="secondary", size="lg")
238
 
239
  with gr.Column():
240
+ output1 = gr.Textbox(
241
+ label="Search Results",
242
+ lines=40,
243
+ max_lines=100,
244
+ show_copy_button=True
245
+ )
246
+ output2 = gr.Textbox(
247
+ label="Extracted Entities and Relationships",
248
+ lines=10,
249
+ max_lines=20,
250
+ show_copy_button=True
251
+ )
252
 
253
  # Search button click
254
  search_btn.click(
 
266
 
267
 
268
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container
    # (e.g. a Hugging Face Space); share=False disables the public Gradio tunnel.
    demo.launch(share=False, server_name="0.0.0.0")