apexherbert200 committed
Commit d3fac2e
1 Parent(s): 1aef688

Worked on get_page function

Files changed (2)
  1. dashboard.py +392 -0
  2. webrify2.py +33 -3
dashboard.py ADDED
@@ -0,0 +1,392 @@
+# enhanced_dashboard.py
+import streamlit as st
+import requests
+import base64
+import json
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from datetime import datetime
+import time
+
+# Page configuration
+st.set_page_config(
+    page_title="Website Intelligence Dashboard",
+    page_icon="🚀",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
+# Custom CSS for better styling
+st.markdown("""
+<style>
+    .main-header {
+        font-size: 3rem;
+        color: #1f77b4;
+        text-align: center;
+        margin-bottom: 2rem;
+    }
+    .metric-card {
+        background-color: #f0f2f6;
+        padding: 1rem;
+        border-radius: 0.5rem;
+        border-left: 4px solid #1f77b4;
+    }
+    .success-metric {
+        border-left-color: #28a745;
+    }
+    .warning-metric {
+        border-left-color: #ffc107;
+    }
+    .danger-metric {
+        border-left-color: #dc3545;
+    }
+    .sidebar-info {
+        background-color: #e8f4fd;
+        padding: 1rem;
+        border-radius: 0.5rem;
+        margin-bottom: 1rem;
+    }
+</style>
+""", unsafe_allow_html=True)
+
+# API Configuration
+API_BASE = "https://apexherbert200-playwright-scraper-clean.hf.space"
+
+# Sidebar configuration
+st.sidebar.markdown('<div class="sidebar-info"><h3>🚀 Website Intelligence</h3><p>Comprehensive website analysis and monitoring platform</p></div>', unsafe_allow_html=True)
+
+# API endpoint selection
+analysis_type = st.sidebar.selectbox(
+    "Choose Analysis Type",
+    ["Complete Analysis", "SEO Only", "Performance Only", "Metadata Only", "Screenshot Only"]
+)
+
+# Advanced options
+st.sidebar.markdown("### ⚙️ Advanced Options")
+screenshot_width = st.sidebar.slider("Screenshot Width", 800, 1920, 1200)
+screenshot_height = st.sidebar.slider("Screenshot Height", 600, 1080, 800)
+full_page_screenshot = st.sidebar.checkbox("Full Page Screenshot", value=True)
+
+# Main dashboard
+st.markdown('<h1 class="main-header">🚀 Website Intelligence Dashboard</h1>', unsafe_allow_html=True)
+
+# URL input with validation
+col1, col2 = st.columns([3, 1])
+with col1:
+    url = st.text_input(
+        "🌐 Enter Website URL",
+        value="https://www.example.com",
+        placeholder="https://www.yourwebsite.com"
+    )
+with col2:
+    st.markdown("<br>", unsafe_allow_html=True)
+    analyze_button = st.button("🔍 Analyze Website", type="primary")
+
+# URL validation
+def validate_url(url):
+    if not url:
+        return False, "Please enter a URL"
+    if not url.startswith(('http://', 'https://')):
+        return False, "URL must start with http:// or https://"
+    return True, ""
+
+# API request function with error handling
+def make_api_request(endpoint, params):
+    try:
+        response = requests.get(f"{API_BASE}/{endpoint}", params=params, timeout=30)
+        response.raise_for_status()
+        return response.json(), None
+    except requests.exceptions.Timeout:
+        return None, "Request timed out. Please try again."
+    except requests.exceptions.ConnectionError:
+        return None, "Connection error. Please check your internet connection."
+    except requests.exceptions.HTTPError as e:
+        return None, f"HTTP error: {e.response.status_code}"
+    except Exception as e:
+        return None, f"Unexpected error: {str(e)}"
+
+# Main analysis logic
+if analyze_button:
+    is_valid, error_msg = validate_url(url)
+
+    if not is_valid:
+        st.error(f"❌ {error_msg}")
+    else:
+        # Progress tracking
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+
+        # Initialize data containers
+        seo_data = None
+        perf_data = None
+        meta_data = None
+        screenshot_data = None
+
+        try:
+            # Metadata Analysis
+            if analysis_type in ["Complete Analysis", "Metadata Only"]:
+                status_text.text("📄 Analyzing metadata...")
+                progress_bar.progress(20)
+                meta_data, error = make_api_request("metadata", {"url": url})
+                if error:
+                    st.error(f"Metadata error: {error}")
+
+            # SEO Analysis
+            if analysis_type in ["Complete Analysis", "SEO Only"]:
+                status_text.text("🔍 Performing SEO audit...")
+                progress_bar.progress(40)
+                seo_data, error = make_api_request("seo", {"url": url})
+                if error:
+                    st.error(f"SEO error: {error}")
+
+            # Performance Analysis
+            if analysis_type in ["Complete Analysis", "Performance Only"]:
+                status_text.text("⚡ Measuring performance...")
+                progress_bar.progress(60)
+                perf_data, error = make_api_request("performance", {"url": url})
+                if error:
+                    st.error(f"Performance error: {error}")
+
+            # Screenshot
+            if analysis_type in ["Complete Analysis", "Screenshot Only"]:
+                status_text.text("📸 Capturing screenshot...")
+                progress_bar.progress(80)
+                screenshot_params = {
+                    "url": url,
+                    "width": screenshot_width,
+                    "height": screenshot_height,
+                    "full_page": full_page_screenshot
+                }
+                screenshot_response, error = make_api_request("screenshot", screenshot_params)
+                if error:
+                    st.error(f"Screenshot error: {error}")
+                else:
+                    screenshot_data = screenshot_response.get("screenshot")
+
+            progress_bar.progress(100)
+            status_text.text("✅ Analysis complete!")
+            time.sleep(1)
+            progress_bar.empty()
+            status_text.empty()
+
+        except Exception as e:
+            st.error(f"❌ Analysis failed: {str(e)}")
+            st.stop()
+
+        # Display Results
+        st.markdown("---")
+
+        # Overview Section
+        if any([meta_data, seo_data, perf_data]):
+            st.header("📊 Website Overview")
+
+            col1, col2, col3, col4 = st.columns(4)
+
+            with col1:
+                if meta_data and meta_data.get('title'):
+                    st.metric("📄 Page Title", "✅ Found" if meta_data['title'] else "❌ Missing")
+
+            with col2:
+                if seo_data:
+                    h1_count = seo_data.get('h1_count', 0)
+                    h1_status = "✅ Good" if h1_count == 1 else f"⚠️ {h1_count} H1s"
+                    st.metric("🏷️ H1 Tags", h1_status)
+
+            with col3:
+                if seo_data:
+                    missing_alts = len(seo_data.get('missing_image_alts', []))
+                    alt_status = "✅ All Good" if missing_alts == 0 else f"❌ {missing_alts} Missing"
+                    st.metric("🖼️ Image Alt Tags", alt_status)
+
+            with col4:
+                if perf_data and perf_data.get('page_load_time_ms'):
+                    load_time = perf_data['page_load_time_ms']
+                    if load_time < 2000:
+                        load_status = "🚀 Fast"
+                    elif load_time < 4000:
+                        load_status = "⚠️ Moderate"
+                    else:
+                        load_status = "🐌 Slow"
+                    st.metric("⚡ Load Time", f"{load_time:.0f}ms", delta=load_status)
+
+        # Metadata Section
+        if meta_data:
+            st.header("📄 Metadata Analysis")
+
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.subheader("Basic Information")
+                st.write(f"**Title:** {meta_data.get('title', 'Not found')}")
+                st.write(f"**Description:** {meta_data.get('description', 'Not found')}")
+                st.write(f"**Canonical URL:** {meta_data.get('canonical', 'Not found')}")
+                if meta_data.get('favicon'):
+                    st.write(f"**Favicon:** ✅ Found")
+                    st.image(meta_data['favicon'], width=32)
+
+            with col2:
+                st.subheader("Social Media")
+                og_data = meta_data.get('og', {})
+                twitter_data = meta_data.get('twitter', {})
+
+                if og_data.get('og:title'):
+                    st.write(f"**OG Title:** {og_data['og:title']}")
+                if og_data.get('og:description'):
+                    st.write(f"**OG Description:** {og_data['og:description']}")
+                if twitter_data.get('twitter:title'):
+                    st.write(f"**Twitter Title:** {twitter_data['twitter:title']}")
+
+        # SEO Section
+        if seo_data:
+            st.header("🔍 SEO Analysis")
+
+            col1, col2, col3 = st.columns(3)
+
+            with col1:
+                st.markdown('<div class="metric-card">', unsafe_allow_html=True)
+                st.metric("H1 Tags Count", seo_data.get('h1_count', 0))
+                if seo_data.get('h1_count', 0) != 1:
+                    st.warning("⚠️ Should have exactly 1 H1 tag")
+                st.markdown('</div>', unsafe_allow_html=True)
+
+            with col2:
+                st.markdown('<div class="metric-card">', unsafe_allow_html=True)
+                internal_links = seo_data.get('internal_links', 0)
+                external_links = seo_data.get('external_links', 0)
+                st.metric("Internal Links", internal_links)
+                st.metric("External Links", external_links)
+                st.markdown('</div>', unsafe_allow_html=True)
+
+            with col3:
+                st.markdown('<div class="metric-card">', unsafe_allow_html=True)
+                missing_alts = seo_data.get('missing_image_alts', [])
+                st.metric("Missing Alt Tags", len(missing_alts))
+                if missing_alts:
+                    st.warning(f"⚠️ {len(missing_alts)} images missing alt text")
+                st.markdown('</div>', unsafe_allow_html=True)
+
+            # SEO Details
+            st.subheader("SEO Details")
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.write(f"**Robots Meta:** {seo_data.get('robots_meta', 'Not found')}")
+                st.write(f"**Has Canonical:** {'✅ Yes' if seo_data.get('has_canonical') else '❌ No'}")
+                st.write(f"**Meta Keywords:** {seo_data.get('meta_keywords', 'Not found')}")
+
+            with col2:
+                if missing_alts:
+                    st.write("**Images Missing Alt Text:**")
+                    for img in missing_alts[:5]:  # Show first 5
+                        st.write(f"- {img}")
+                    if len(missing_alts) > 5:
+                        st.write(f"... and {len(missing_alts) - 5} more")
+
+        # Performance Section
+        if perf_data:
+            st.header("⚡ Performance Metrics")
+
+            # Create performance chart
+            metrics = []
+            values = []
+            colors = []
+
+            if perf_data.get('page_load_time_ms'):
+                metrics.append('Page Load Time (ms)')
+                values.append(perf_data['page_load_time_ms'])
+                colors.append('#1f77b4')
+
+            if perf_data.get('first_contentful_paint'):
+                metrics.append('First Contentful Paint (ms)')
+                values.append(perf_data['first_contentful_paint'])
+                colors.append('#ff7f0e')
+
+            if perf_data.get('largest_contentful_paint'):
+                metrics.append('Largest Contentful Paint (ms)')
+                values.append(perf_data['largest_contentful_paint'])
+                colors.append('#2ca02c')
+
+            if metrics:
+                fig = px.bar(
+                    x=metrics,
+                    y=values,
+                    title="Performance Metrics",
+                    color=metrics,
+                    color_discrete_sequence=colors
+                )
+                fig.update_layout(showlegend=False)
+                st.plotly_chart(fig, use_container_width=True)
+
+            # Performance details
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.subheader("Core Web Vitals")
+                if perf_data.get('first_contentful_paint'):
+                    fcp = perf_data['first_contentful_paint']
+                    fcp_status = "🟢 Good" if fcp < 1800 else "🟡 Needs Improvement" if fcp < 3000 else "🔴 Poor"
+                    st.metric("First Contentful Paint", f"{fcp:.0f}ms", delta=fcp_status)
+
+                if perf_data.get('largest_contentful_paint'):
+                    lcp = perf_data['largest_contentful_paint']
+                    lcp_status = "🟢 Good" if lcp < 2500 else "🟡 Needs Improvement" if lcp < 4000 else "🔴 Poor"
+                    st.metric("Largest Contentful Paint", f"{lcp:.0f}ms", delta=lcp_status)
+
+            with col2:
+                st.subheader("Additional Metrics")
+                if perf_data.get('cumulative_layout_shift'):
+                    cls = perf_data['cumulative_layout_shift']
+                    cls_status = "🟢 Good" if cls < 0.1 else "🟡 Needs Improvement" if cls < 0.25 else "🔴 Poor"
+                    st.metric("Cumulative Layout Shift", f"{cls:.3f}", delta=cls_status)
+
+                if perf_data.get('page_load_time_ms'):
+                    load_time = perf_data['page_load_time_ms']
+                    st.metric("Total Load Time", f"{load_time:.0f}ms")
+
+        # Screenshot Section
+        if screenshot_data:
+            st.header("📸 Website Screenshot")
+            try:
+                screenshot_bytes = base64.b64decode(screenshot_data)
+                st.image(screenshot_bytes, caption=f"Screenshot of {url}", use_column_width=True)
+
+                # Download button for screenshot
+                st.download_button(
+                    label="📥 Download Screenshot",
+                    data=screenshot_bytes,
+                    file_name=f"screenshot_{url.replace('https://', '').replace('http://', '').replace('/', '_')}.png",
+                    mime="image/png"
+                )
+            except Exception as e:
+                st.error(f"Failed to display screenshot: {str(e)}")
+
+# Footer
+st.markdown("---")
+st.markdown("""
+<div style='text-align: center; color: #666; padding: 2rem;'>
+    <p>🚀 <strong>Website Intelligence Dashboard</strong> | Powered by Advanced Web Analysis APIs</p>
+    <p>Built with ❤️ using Streamlit | © 2024</p>
+</div>
+""", unsafe_allow_html=True)
+
+# Sidebar additional info
+st.sidebar.markdown("---")
+st.sidebar.markdown("### 📊 Analysis Features")
+st.sidebar.markdown("""
+- **SEO Audit**: H1 tags, meta data, links analysis
+- **Performance**: Core Web Vitals, load times
+- **Metadata**: Social media tags, canonical URLs
+- **Screenshots**: Visual website capture
+- **Real-time**: Live website analysis
+""")

+st.sidebar.markdown("### 🔧 API Status")
+try:
+    health_response = requests.get(f"{API_BASE}/health", timeout=5)
+    if health_response.status_code == 200:
+        st.sidebar.success("🟢 API Online")
+    else:
+        st.sidebar.error("🔴 API Issues")
+except:
+    st.sidebar.warning("🟡 API Status Unknown")
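
The dashboard drives every analysis through make_api_request against the scraper API above. For a quick sanity check outside Streamlit, the same endpoints can be hit directly with requests. This is a minimal sketch and not part of the commit; it assumes /health and /metadata accept the same parameters the dashboard sends:

# smoke_test.py - hypothetical standalone check, not part of this commit
import requests

API_BASE = "https://apexherbert200-playwright-scraper-clean.hf.space"

def check_api(url="https://www.example.com"):
    # Health probe, mirroring the dashboard's sidebar status check
    health = requests.get(f"{API_BASE}/health", timeout=5)
    print("health:", health.status_code)

    # Metadata fetch, using the same "url" query parameter the dashboard sends
    meta = requests.get(f"{API_BASE}/metadata", params={"url": url}, timeout=30)
    meta.raise_for_status()
    print("title:", meta.json().get("title"))

if __name__ == "__main__":
    check_api()
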
webrify2.py CHANGED
@@ -20,14 +20,44 @@ class MetadataResponse(BaseModel):
     canonical: Optional[str]
 
 
+# async def get_page(url):
+#     pw = await async_playwright().start()
+#     browser = await pw.chromium.launch(headless=True)
+#     page = await browser.new_page()
+#     try:
+#         await page.goto(url, timeout=30000)
+#     except PlaywrightTimeoutError:
+#         raise HTTPException(status_code=504, detail="Page load timed out")
+#     return page, browser, pw
+
+
+
 async def get_page(url):
     pw = await async_playwright().start()
     browser = await pw.chromium.launch(headless=True)
-    page = await browser.new_page()
+    context = await browser.new_context()
+
+    # Stealth: hide headless detection
+    await context.add_init_script(
+        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
+    )
+
+    page = await context.new_page()
+    page.set_default_timeout(90000)  # Apply to all waits
+
     try:
-        await page.goto(url, timeout=30000)
+        # Try networkidle first (wait for full load)
+        await page.goto(url, timeout=90000, wait_until="networkidle")
+        await page.wait_for_selector("body", timeout=10000)  # Ensure DOM is visible
     except PlaywrightTimeoutError:
-        raise HTTPException(status_code=504, detail="Page load timed out")
+        try:
+            # Fallback to lighter load event
+            await page.goto(url, timeout=90000, wait_until="load")
+        except Exception as e:
+            await browser.close()
+            await pw.stop()
+            raise HTTPException(status_code=504, detail=f"Page load failed: {str(e)}")
+
     return page, browser, pw
 
 
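
Since get_page still returns the page together with the browser and Playwright handles, callers remain responsible for closing both on the success path (the new code only cleans up when the fallback navigation fails). A minimal sketch of a hypothetical FastAPI route consuming get_page, assuming the FastAPI app and Playwright imports already present in webrify2.py:

# Hypothetical caller - a sketch, not part of this commit
from fastapi import FastAPI

app = FastAPI()

@app.get("/title")
async def page_title(url: str):
    page, browser, pw = await get_page(url)
    try:
        # Use the stealth page returned by get_page
        return {"title": await page.title()}
    finally:
        # Always release the browser and the Playwright driver
        await browser.close()
        await pw.stop()
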