dygoo committed on
Commit 64dcaa8 · verified · 1 parent: fede833

Update app.py

Files changed (1): app.py (+80, -130)
app.py CHANGED
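Before (lines removed in this commit are prefixed with "-"):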
@@ -1,6 +1,7 @@
 import gradio as gr
 import requests
 import time
 from duckduckgo_search import DDGS
 from bs4 import BeautifulSoup
 
@@ -13,163 +14,131 @@ def get_full_article(url):
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         }
-
-        response = requests.get(url, headers=headers, timeout=15, verify=False)
         response.raise_for_status()
-
         soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Remove unwanted elements
         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
             element.decompose()
-
-        # Try common article selectors in order of preference
         article_selectors = [
-            'article',
-            '.article-content', '.post-content', '.story-body', '.story-content',
-            '.entry-content', '.content-body', '.article-body',
-            'main article', 'main .content', 'main',
-            '[role="main"]', '.main-content', '.page-content',
-            '.text', '.article-text'
         ]
-
         for selector in article_selectors:
             content = soup.select_one(selector)
             if content:
-                # Extract paragraphs for better formatting
                 paragraphs = content.find_all(['p', 'div'], string=True)
                 if paragraphs:
                     text_parts = []
                     for p in paragraphs:
                         text = p.get_text(strip=True)
-                        if len(text) > 30:  # Filter out short/empty paragraphs
                             text_parts.append(text)
-
                     full_text = '\n\n'.join(text_parts)
-                    if len(full_text) > 300:  # Only return if substantial content
-                        return full_text[:10000]  # Limit to 10000 chars
-
-        # Fallback to body text
         body_text = soup.get_text(separator='\n\n', strip=True)
-        # Clean up multiple newlines
-        import re
         body_text = re.sub(r'\n{3,}', '\n\n', body_text)
-
-        return body_text[:10000] if len(body_text) > 300 else "Could not extract substantial content"
-
     except requests.exceptions.Timeout:
-        return "Article fetch timeout - using snippet instead"
-    except requests.exceptions.RequestException as e:
-        return f"Could not fetch article: Network error"
     except Exception as e:
-        return f"Could not fetch article: {str(e)}"
 
 
-def search_articles(name: str) -> str:
-    """Search for newspaper articles containing the name and keywords using DuckDuckGo"""
     keywords = ['founders', 'partners', 'funders', 'owners']
     search_query = f'"{name}" ({" OR ".join(keywords)}) site:news'
-
     max_retries = 3
     base_delay = 3
-
     for attempt in range(max_retries):
         try:
             print(f"Search attempt {attempt + 1}: {search_query}")
-
-            # Progressive delay
-            delay = base_delay * (attempt + 1)
-            time.sleep(delay)
-
-            # Use different configurations for each attempt
             configs = [
                 {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
                 {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'},
                 {'timeout': 30, 'region': None, 'safesearch': 'moderate'}
             ]
-
             config = configs[min(attempt, len(configs)-1)]
-
             with DDGS(timeout=config['timeout']) as ddgs:
                 search_params = {
                     'keywords': search_query,
-                    'max_results': 2,
                     'safesearch': config['safesearch']
                 }
                 if config['region']:
                     search_params['region'] = config['region']
-
                 results = list(ddgs.text(**search_params))
                 print(f"Found {len(results)} results on attempt {attempt + 1}")
-
                 if not results:
-                    if attempt < max_retries - 1:
-                        print(f"No results found, retrying with different parameters...")
-                        continue
-                    return f"No articles found for {name} after {max_retries} attempts"
 
                 articles = []
                 for i, result in enumerate(results, 1):
                     url = result.get('href', 'No URL')
                     title = result.get('title', 'No Title')
                     snippet = result.get('body', 'No snippet available')
-
-                    print(f"Processing article {i}: {title}")
-                    print(f"URL: {url}")
-
-                    # Add delay between article fetches
                     if i > 1:
                         time.sleep(2)
-
-                    # Try to get full article
                     full_text = get_full_article(url)
-
-                    # Use snippet as fallback if full article extraction fails
-                    if any(error in full_text for error in ["Could not fetch", "timeout", "Network error"]):
                         print(f"Using snippet fallback for article {i}")
-                        content = f"[SNIPPET ONLY - Full article unavailable]\n{snippet}"
                     else:
                         content = full_text
-
-                    article = f"**{i}. {title}**\n"
-                    article += f"Source: {url}\n\n"
                     article += f"{content}\n"
                     articles.append(article)
 
-                return "\n" + "="*80 + "\n".join(articles)
-
         except Exception as e:
-            error_msg = f"Attempt {attempt + 1} failed: {str(e)}"
-            print(error_msg)
-
             if attempt < max_retries - 1:
-                wait_time = base_delay * (attempt + 2)
-                print(f"Waiting {wait_time} seconds before retry...")
-                time.sleep(wait_time)
             else:
                 return f"[ERROR] Search failed after {max_retries} attempts. Last error: {str(e)}"
 
 
 def extract_entities(search_results: str) -> str:
-    """Extract entities using Mistral 7B endpoint"""
     modal_endpoint = "https://msoaresdiego--mistral-llm-endpoint-fastapi-app.modal.run/generate"
     prompt = f"""Extract all person names and organization names from the following text. Do not extract products and service names. Only individuals and organizations. Bring the full details of the name in the newspaper article. For example, if only ACME is mentioned as company name, bring only ACME. IF ACME Inc is mentioned as company name, then you have to extract ACME Inc. In addition, define the relationship between the entity and the company that is being searched. For example, is ACME Inc an owner of the company being searched? Then write 'owner'. Is ACME Inc. a funder of the company being searched? Then write 'funder'
-
 Format as:
 PERSON: [name] - [relationship]
 ORG: [organization name] - [relationship]
-
 Text: {search_results}"""
-
     try:
         response = requests.post(
             modal_endpoint,
             json={"prompt": prompt, "max_tokens": 1500, "temperature": 0.15},
-            timeout=30
         )
         if response.status_code == 200:
             return response.json().get("response", "No entities extracted")
@@ -180,90 +149,71 @@ Text: {search_results}"""
     except Exception as e:
         return f"[ERROR] Extraction failed: {str(e)}"
 
-
 # === Gradio interface functions ===
 
-def search_only(name: str):
-    """Perform search only and return results"""
     if not name.strip():
         return "No name provided", ""
-
     try:
-        search_start = time.time()
-        articles_output = search_articles(name.strip())
-        search_time = time.time() - search_start
-
-        search_results = f"Search completed for: {name} in {search_time:.1f}s\n\n"
-        search_results += articles_output
-
-        return search_results, articles_output  # Return both display and raw results
-    except Exception as e:
-        error_msg = f"[ERROR] Search failed: {str(e)}"
-        return error_msg, ""
 
 
-def extract_only(stored_search_results: str):
-    """Extract entities from stored search results"""
-    if not stored_search_results.strip():
         return "No search results available. Please search first."
-
     try:
-        extract_start = time.time()
-        entities = extract_entities(stored_search_results)
-        extract_time = time.time() - extract_start
-
-        extraction_results = f"Entity extraction completed in {extract_time:.1f}s\n\n"
-        extraction_results += entities
-
-        return extraction_results
     except Exception as e:
         return f"[ERROR] Extraction failed: {str(e)}"
 
-
 # === Gradio UI ===
 
 with gr.Blocks(title="Related Entities Finder") as demo:
     gr.Markdown("# 🔎 Related Entities Finder")
     gr.Markdown("Enter a business or project name to search for related articles and extract key entities.")
-    gr.Markdown("*Note: Full article extraction may take 30-60 seconds. Snippets will be used as fallback if needed.*")
 
-    # State to store search results between operations
     search_state = gr.State("")
 
     with gr.Row():
         name_input = gr.Textbox(label="Company/Project Name", placeholder="Enter business or project name")
     with gr.Column():
-        search_btn = gr.Button("🔍 Search Articles", variant="primary", size="lg")
-        extract_btn = gr.Button("📋 Extract Entities", variant="secondary", size="lg")
-
-    with gr.Column():
-        output1 = gr.Textbox(
-            label="Search Results",
-            lines=40,
-            max_lines=100,
-            show_copy_button=True
-        )
-        output2 = gr.Textbox(
-            label="Extracted Entities and Relationships",
-            lines=10,
-            max_lines=20,
-            show_copy_button=True
-        )
-
-    # Search button click
     search_btn.click(
         fn=search_only,
-        inputs=[name_input],
         outputs=[output1, search_state]
     )
-
-    # Extract button click
     extract_btn.click(
         fn=extract_only,
         inputs=[search_state],
         outputs=[output2]
     )
 
-
 if __name__ == "__main__":
-    demo.launch(share=False, server_name="0.0.0.0")
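After (lines added in this commit are prefixed with "+"):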
 
@@ -1,6 +1,7 @@
 import gradio as gr
 import requests
 import time
+import re
 from duckduckgo_search import DDGS
 from bs4 import BeautifulSoup
 
@@ -13,163 +14,131 @@ def get_full_article(url):
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         }
+
+        response = requests.get(url, headers=headers, timeout=20, verify=True)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
+
         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'ads', 'noscript', 'form']):
             element.decompose()
+
         article_selectors = [
+            'article', '.article-content', '.post-content', '.story-body', '.story-content',
+            '.entry-content', '.content-body', '.article-body', 'main article', 'main .content', 'main',
+            '[role="main"]', '.main-content', '.page-content', '.text', '.article-text'
         ]
+
         for selector in article_selectors:
             content = soup.select_one(selector)
             if content:
                 paragraphs = content.find_all(['p', 'div'], string=True)
                 if paragraphs:
                     text_parts = []
                     for p in paragraphs:
                         text = p.get_text(strip=True)
+                        if len(text) > 30:
                             text_parts.append(text)
                     full_text = '\n\n'.join(text_parts)
+                    if len(full_text) > 300:
+                        return full_text[:10000]
+
         body_text = soup.get_text(separator='\n\n', strip=True)
         body_text = re.sub(r'\n{3,}', '\n\n', body_text)
+
+        return body_text[:10000] if len(body_text) > 300 else "[INFO] Could not extract substantial content"
+
     except requests.exceptions.Timeout:
+        return "[WARNING] Article fetch timeout - using snippet instead"
+    except requests.exceptions.RequestException:
+        return "[ERROR] Could not fetch article: Network error"
     except Exception as e:
+        return f"[ERROR] Could not fetch article: {str(e)}"
 
 
+def search_articles(name: str, max_articles: int = 2) -> str:
     keywords = ['founders', 'partners', 'funders', 'owners']
     search_query = f'"{name}" ({" OR ".join(keywords)}) site:news'
+
     max_retries = 3
     base_delay = 3
+
     for attempt in range(max_retries):
         try:
             print(f"Search attempt {attempt + 1}: {search_query}")
+            time.sleep(base_delay * (attempt + 1))
+
             configs = [
                 {'timeout': 20, 'region': 'us-en', 'safesearch': 'moderate'},
                 {'timeout': 25, 'region': 'wt-wt', 'safesearch': 'off'},
                 {'timeout': 30, 'region': None, 'safesearch': 'moderate'}
             ]
+
             config = configs[min(attempt, len(configs)-1)]
+
             with DDGS(timeout=config['timeout']) as ddgs:
                 search_params = {
                     'keywords': search_query,
+                    'max_results': max_articles,
                     'safesearch': config['safesearch']
                 }
                 if config['region']:
                     search_params['region'] = config['region']
+
                 results = list(ddgs.text(**search_params))
                 print(f"Found {len(results)} results on attempt {attempt + 1}")
+
                 if not results:
+                    continue
 
                 articles = []
                 for i, result in enumerate(results, 1):
                     url = result.get('href', 'No URL')
                     title = result.get('title', 'No Title')
                     snippet = result.get('body', 'No snippet available')
+
                     if i > 1:
                         time.sleep(2)
+
                     full_text = get_full_article(url)
+                    if any(error in full_text for error in ["[ERROR]", "timeout", "Network error"]):
                         print(f"Using snippet fallback for article {i}")
+                        content = f"[SNIPPET ONLY]\n{snippet}"
                     else:
                         content = full_text
+
+                    article = f"### {i}. {title}\n"
+                    article += f"[Source]({url})\n\n"
                     article += f"{content}\n"
                     articles.append(article)
 
+                return "\n---\n".join(articles)
+
         except Exception as e:
+            print(f"Attempt {attempt + 1} failed: {str(e)}")
             if attempt < max_retries - 1:
+                time.sleep(base_delay * (attempt + 2))
             else:
                 return f"[ERROR] Search failed after {max_retries} attempts. Last error: {str(e)}"
 
+    return f"[INFO] No articles found for {name}"
+
  def extract_entities(search_results: str) -> str:
 
130
  modal_endpoint = "https://msoaresdiego--mistral-llm-endpoint-fastapi-app.modal.run/generate"
131
  prompt = f"""Extract all person names and organization names from the following text. Do not extract products and service names. Only individuals and organizations. Bring the full details of the name in the newspaper article. For example, if only ACME is mentioned as company name, bring only ACME. IF ACME Inc is mentioned as company name, then you have to extract ACME Inc. In addition, define the relationship between the entity and the company that is being searched. For example, is ACME Inc an owner of the company being searched? Then write 'owner'. Is ACME Inc. a funder of the company being searched? Then write 'funder'
 
132
  Format as:
133
  PERSON: [name] - [relationship]
134
  ORG: [organization name] - [relationship]
 
135
  Text: {search_results}"""
136
+
137
  try:
138
  response = requests.post(
139
  modal_endpoint,
140
  json={"prompt": prompt, "max_tokens": 1500, "temperature": 0.15},
141
+ timeout=60
142
  )
143
  if response.status_code == 200:
144
  return response.json().get("response", "No entities extracted")
 
149
  except Exception as e:
150
  return f"[ERROR] Extraction failed: {str(e)}"
151
 
 
152
  # === Gradio interface functions ===
153
 
154
+ def search_only(name: str, article_count: int):
 
155
  if not name.strip():
156
  return "No name provided", ""
157
+
158
  try:
159
+ start = time.time()
160
+ articles_output = search_articles(name.strip(), max_articles=article_count)
161
+ elapsed = time.time() - start
 
 
 
 
 
 
 
 
162
 
163
+ results = f"✅ Search completed for **{name}** in {elapsed:.1f}s\n\n"
164
+ results += articles_output
165
+
166
+ return results, articles_output
167
+ except Exception as e:
168
+ return f"[ERROR] Search failed: {str(e)}", ""
169
 
170
+ def extract_only(stored_results: str):
171
+ if not stored_results.strip():
 
172
  return "No search results available. Please search first."
173
+
174
  try:
175
+ start = time.time()
176
+ entities = extract_entities(stored_results)
177
+ elapsed = time.time() - start
178
+ return f"✅ Extraction completed in {elapsed:.1f}s\n\n{entities}"
 
 
 
 
179
  except Exception as e:
180
  return f"[ERROR] Extraction failed: {str(e)}"
181
 
 
182
  # === Gradio UI ===
183
 
184
  with gr.Blocks(title="Related Entities Finder") as demo:
185
  gr.Markdown("# 🔎 Related Entities Finder")
186
  gr.Markdown("Enter a business or project name to search for related articles and extract key entities.")
187
+ gr.Markdown("*Note: Full article extraction may take 3060 seconds. Snippets will be used as fallback if needed.*")
188
 
 
189
  search_state = gr.State("")
190
 
191
  with gr.Row():
192
  name_input = gr.Textbox(label="Company/Project Name", placeholder="Enter business or project name")
193
+ article_count_slider = gr.Slider(1, 10, value=2, step=1, label="Number of Articles")
194
  with gr.Column():
195
+ search_btn = gr.Button("🔍 Search Articles", variant="primary")
196
+ extract_btn = gr.Button("📋 Extract Entities", variant="secondary")
197
+
198
+ output1 = gr.Markdown(label="Search Results")
199
+ output2 = gr.Textbox(
200
+ label="Extracted Entities and Relationships",
201
+ lines=10,
202
+ max_lines=20,
203
+ show_copy_button=True
204
+ )
205
+
 
 
 
 
 
 
 
206
  search_btn.click(
207
  fn=search_only,
208
+ inputs=[name_input, article_count_slider],
209
  outputs=[output1, search_state]
210
  )
211
+
 
212
  extract_btn.click(
213
  fn=extract_only,
214
  inputs=[search_state],
215
  outputs=[output2]
216
  )
217
 
 
218
  if __name__ == "__main__":
219
+ demo.launch()
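
For a quick check of the updated flow outside the Gradio UI, here is a minimal smoke-test sketch. It is not part of this commit; it assumes app.py and its dependencies are importable locally, and "ACME Inc" is a hypothetical placeholder query.

# smoke_test.py - a sketch for local verification, not part of this commit.
from app import search_articles, extract_entities

if __name__ == "__main__":
    # "ACME Inc" is a hypothetical placeholder name.
    results = search_articles("ACME Inc", max_articles=2)
    print(results)
    # Only call the Modal extraction endpoint if the search returned articles.
    if not results.startswith(("[ERROR]", "[INFO]")):
        print(extract_entities(results))

Note that search_articles sleeps between attempts and between article fetches, so even a successful run can take tens of seconds.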