cyberandy committed
Commit dfbd641 · verified · 1 Parent(s): 75c2c2b

Create app.py

Files changed (1):
  1. app.py (added, +319 -0)
import gradio as gr
import requests
from bs4 import BeautifulSoup
import os
import json
import logging
import pandas as pd  # Used for building the summary dataframe output
from typing import Any, Dict, List, Optional  # Required by the type hints below

# ------------------------
# Configuration
# ------------------------
WORDLIFT_API_URL = "https://api.wordlift.io/content-evaluations"
WORDLIFT_API_KEY = os.getenv("WORDLIFT_API_KEY")  # Get API key from environment variable

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ------------------------
# Custom CSS & Theme
# ------------------------

css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');
body {
    font-family: 'Open Sans', sans-serif !important;
}
.primary-btn {
    background-color: #3452db !important;
    color: white !important;
}
.primary-btn:hover {
    background-color: #2a41af !important;
}
.gradio-container {
    max-width: 1200px; /* Limit width for better readability */
    margin: auto;
}
"""

theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

# ------------------------
# Content Fetching Logic
# ------------------------

def fetch_content_from_url(url: str, timeout: int = 15) -> Optional[str]:
    """Fetches the main text content from a URL, or returns None on failure."""
    logger.info(f"Fetching content from: {url}")
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')

        # Attempt to find the main content block
        main_content = soup.find('main') or soup.find('article')
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote']

        if main_content:
            # Extract text from common text-containing tags within the main block
            text_elements = main_content.find_all(text_tags)
        else:
            # Fall back to the <body> (guarding against a missing body tag) if no main block is found
            logger.warning(f"No <main> or <article> found for {url}, extracting from body.")
            text_elements = (soup.body or soup).find_all(text_tags)
        text = ' '.join(elem.get_text() for elem in text_elements)

        # Clean up extra whitespace
        text = ' '.join(text.split())

        # Limit text length to avoid excessively large API calls (adjust as needed)
        max_text_length = 150000  # approx 25k words; adjust based on API limits/cost
        if len(text) > max_text_length:
            logger.warning(f"Content for {url} is too long ({len(text)} chars), truncating to {max_text_length} chars.")
            text = text[:max_text_length] + "..."  # Indicate truncation

        return text

    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to fetch content from {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"Error processing content from {url}: {e}")
        return None

# ------------------------
# WordLift API Call Logic
# ------------------------

def call_wordlift_api(text: str, keywords: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
    """Calls the WordLift Content Evaluation API."""
    if not WORDLIFT_API_KEY:
        logger.error("WORDLIFT_API_KEY environment variable not set.")
        return {"error": "API key not configured."}

    if not text:
        return {"error": "No content provided or fetched."}

    payload = {
        "text": text,
        "keywords": keywords if keywords else []
    }

    headers = {
        'Authorization': f'Key {WORDLIFT_API_KEY}',
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }

    logger.info(f"Calling WordLift API with text length {len(text)} and {len(keywords or [])} keywords.")

    try:
        response = requests.post(WORDLIFT_API_URL, headers=headers, json=payload, timeout=60)  # Increased timeout
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return response.json()

    except requests.exceptions.HTTPError as e:
        logger.error(f"WordLift API HTTP error: {e.response.status_code} - {e.response.text}")
        try:
            error_detail = e.response.json()
        except json.JSONDecodeError:
            error_detail = e.response.text
        return {"error": f"API returned status code {e.response.status_code}", "details": error_detail}
    except requests.exceptions.RequestException as e:
        logger.error(f"WordLift API request error: {e}")
        return {"error": f"API request failed: {e}"}
    except Exception as e:
        logger.error(f"Unexpected error during API call: {e}")
        return {"error": f"An unexpected error occurred: {e}"}

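
# The response shape assumed by the parsing in evaluate_urls_batch below
# (inferred from that parsing code, not from the API documentation):
#
#   {
#     "quality_score": {
#       "overall": <float>,
#       "breakdown": {
#         "content": {"purpose": ..., "accuracy": ..., "depth": ...},
#         "readability": {"score": ..., "grade_level": ...},
#         "seo": {"score": ...}
#       }
#     },
#     "metadata": {"word_count": <int>}
#   }
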
# ------------------------
# Main Evaluation Batch Function
# ------------------------

def evaluate_urls_batch(url_data: pd.DataFrame):
    """
    Evaluates a batch of URLs using the WordLift API.

    Args:
        url_data: A pandas DataFrame with columns ['URL', 'Target Keywords (comma-separated)'].

    Returns:
        A tuple containing:
        - A pandas DataFrame with the summary results.
        - A dictionary containing the full results (including errors) keyed by URL.
    """
    summary_columns = [
        'URL', 'Status', 'Overall Score', 'Content Purpose',
        'Content Accuracy', 'Content Depth', 'Readability Score (API)',
        'Readability Grade Level', 'SEO Score', 'Word Count', 'Error/Details'
    ]
    # Note: `not url_data` would raise on a DataFrame (ambiguous truth value),
    # so check for None/empty explicitly.
    if url_data is None or url_data.empty:
        return pd.DataFrame(columns=summary_columns), {}

    summary_results = []
    full_results = {}

    for index, row in url_data.iterrows():
        # Cells may arrive as None/NaN from the Gradio dataframe, so coerce safely.
        url = str(row['URL']).strip() if pd.notna(row['URL']) else ""
        keywords_str = str(row['Target Keywords (comma-separated)']).strip() if pd.notna(row['Target Keywords (comma-separated)']) else ""
        keywords = [kw.strip() for kw in keywords_str.split(',') if kw.strip()]

        if not url:
            summary_results.append([url, "Skipped", None, None, None, None, None, None, None, None, "Empty URL"])
            full_results[url if url else f"Row_{index}"] = {"status": "Skipped", "error": "Empty URL input."}
            continue

        logger.info(f"Processing URL: {url} with keywords: {keywords}")

        # 1. Fetch content
        content = fetch_content_from_url(url)

        if content is None or not content.strip():
            status = "Failed"
            error_msg = "Failed to fetch or extract content."
            summary_results.append([url, status, None, None, None, None, None, None, None, None, error_msg])
            full_results[url] = {"status": status, "error": error_msg}
            logger.error(f"Processing failed for {url}: {error_msg}")
            continue  # Move to the next URL

        # 2. Call the WordLift API
        api_result = call_wordlift_api(content, keywords)

        # 3. Process the API result
        summary_row = [url]
        if api_result and "error" not in api_result:
            status = "Success"
            qs = api_result.get('quality_score', {})
            breakdown = qs.get('breakdown', {})
            content_breakdown = breakdown.get('content', {})
            readability_breakdown = breakdown.get('readability', {})
            seo_breakdown = breakdown.get('seo', {})
            metadata = api_result.get('metadata', {})

            summary_row.extend([
                status,
                qs.get('overall'),
                content_breakdown.get('purpose'),
                content_breakdown.get('accuracy'),
                content_breakdown.get('depth'),
                readability_breakdown.get('score'),  # The API's own readability score (e.g. 2.5)
                readability_breakdown.get('grade_level'),
                seo_breakdown.get('score'),
                metadata.get('word_count'),
                None  # No error
            ])
            full_results[url] = api_result  # Store the full API result

        else:
            status = "Failed"
            error_msg = api_result.get("error", "Unknown API error.") if api_result else "API call failed."
            details = api_result.get("details", "") if api_result else ""
            summary_row.extend([
                status,
                None, None, None, None, None, None, None, None,
                f"{error_msg} {details}"
            ])
            full_results[url] = {"status": status, "error": error_msg, "details": details}
            logger.error(f"API call failed for {url}: {error_msg} {details}")

        summary_results.append(summary_row)

    # Create the pandas DataFrame for the summary output
    summary_df = pd.DataFrame(summary_results, columns=summary_columns)

    # Format numeric columns for display where values are present
    numeric_cols = ['Overall Score', 'Content Purpose', 'Content Accuracy', 'Content Depth',
                    'Readability Score (API)', 'Readability Grade Level', 'SEO Score', 'Word Count']
    one_decimal_cols = {'Overall Score', 'Readability Score (API)', 'SEO Score'}
    for col in numeric_cols:
        # Convert to numeric, coercing errors, then render missing values as '-'
        summary_df[col] = pd.to_numeric(summary_df[col], errors='coerce')
        if col in one_decimal_cols:
            summary_df[col] = summary_df[col].apply(lambda x: f'{x:.1f}' if pd.notna(x) else '-')
        else:
            summary_df[col] = summary_df[col].apply(lambda x: f'{int(x)}' if pd.notna(x) else '-')

    return summary_df, full_results

# ------------------------
# Gradio Blocks Interface Setup
# ------------------------

with gr.Blocks(css=css, theme=theme) as demo:
    gr.Markdown("# WordLift Multi-URL Content Evaluator")
    gr.Markdown(
        "Enter up to 30 URLs in the table below. "
        "Optionally, provide comma-separated target keywords for each URL. "
        "The app will fetch content from each URL and evaluate it using the WordLift API."
    )

    with gr.Row():
        with gr.Column():
            url_input_df = gr.Dataframe(
                headers=["URL", "Target Keywords (comma-separated)"],
                datatype=["str", "str"],
                row_count=(1, 30),  # Allow adding rows, up to 30
                col_count=(2, "fixed"),
                value=[["https://example.com/article1", "keyword A, keyword B"], ["https://example.com/article2", ""]],  # Default examples
                label="URLs and Keywords"
            )
            submit_button = gr.Button("Evaluate All URLs", elem_classes=["primary-btn"])

    gr.Markdown("## Evaluation Results")

    with gr.Column():
        summary_output_df = gr.DataFrame(
            label="Summary Results",
            headers=['URL', 'Status', 'Overall Score', 'Content Purpose', 'Content Accuracy', 'Content Depth', 'Readability Score (API)', 'Readability Grade Level', 'SEO Score', 'Word Count', 'Error/Details'],
            datatype=["str"] * 11,  # Use str so '-' can stand in for missing values
            wrap=True  # Wrap text in columns
        )
        with gr.Accordion("Full JSON Results", open=False):
            full_results_json = gr.JSON(label="Raw API Results per URL")

    submit_button.click(
        fn=evaluate_urls_batch,
        inputs=[url_input_df],
        outputs=[summary_output_df, full_results_json]
    )

# Launch the app
if __name__ == "__main__":
    if not WORDLIFT_API_KEY:
        logger.error("\n----------------------------------------------------------")
        logger.error("WORDLIFT_API_KEY environment variable is not set.")
        logger.error("Please set it before running the script:")
        logger.error("  export WORDLIFT_API_KEY='YOUR_API_KEY'")
        logger.error("Or, if using a .env file and python-dotenv:")
        logger.error("  pip install python-dotenv")
        logger.error("  # Add WORDLIFT_API_KEY=YOUR_API_KEY to a .env file")
        logger.error("  # import dotenv; dotenv.load_dotenv()")
        logger.error("  # in your script before reading the key.")
        logger.error("----------------------------------------------------------\n")
        # Optionally exit or raise an error here if the key is strictly required to launch
        # exit()
        pass  # Allow launching, but API calls will fail

    logger.info("Launching Gradio app...")
    demo.launch()
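
The startup hints above point to python-dotenv. A minimal sketch of that approach (hypothetical, not part of this commit; assumes a local .env file containing WORDLIFT_API_KEY, placed near the top of app.py before os.getenv is called):

    # Hypothetical snippet, not in app.py as committed.
    # Requires: pip install python-dotenv
    from dotenv import load_dotenv

    load_dotenv()  # copies key=value pairs from a local .env file into os.environ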