apexherbert200 committed on
Commit e736965 · 1 Parent(s): b55a15f

Building new logic

Files changed (2)
  1. Dockerfile +1 -1
  2. test1.py +228 -136
Dockerfile CHANGED
@@ -53,4 +53,4 @@ RUN python -m playwright install chromium
  EXPOSE 7860

  # Run the FastAPI application
- CMD ["python", "-m", "uvicorn", "scrape:app", "--host", "0.0.0.0", "--port", "7860"]
+ CMD ["python", "-m", "uvicorn", "test1:app", "--host", "0.0.0.0", "--port", "7860"]
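The only Dockerfile change is the uvicorn entrypoint, which now serves test1:app instead of scrape:app, so the container boots the analyzer API defined in test1.py on port 7860. A minimal smoke test of the renamed entrypoint, assuming the image has been built and is running locally with the port published (e.g. docker run -p 7860:7860 <image>); the module name and port come from the Dockerfile above, everything else here is illustrative:

# check_entrypoint.py - hypothetical smoke test against a locally running container (stdlib only)
import json
import urllib.request

# FastAPI serves its OpenAPI schema at /openapi.json by default;
# the title should match the FastAPI(...) call added in test1.py.
with urllib.request.urlopen("http://localhost:7860/openapi.json", timeout=10) as resp:
    spec = json.load(resp)

print(spec["info"]["title"])  # expected: "Website Quality & Compliance Analyzer"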
test1.py CHANGED
@@ -5,161 +5,253 @@ import asyncio
  import base64
  import logging
  from typing import List, Optional
- from urllib.parse import urlparse, parse_qs

- # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- app = FastAPI(title="Query-Based Web Scraper", description="Scrape websites based on search queries")

- # ... (Keep all your Pydantic models unchanged) ...

- @app.get("/")
- async def root():
-     return {
-         "message": "🚀 Query-Based Web Scraper API",
-         "tagline": "Search and scrape websites based on queries",
-         "endpoints": {
-             "/scrape": "Search Google for the query and scrape the top result",
-             "/docs": "API documentation"
-         },
-         "example": "/scrape?query=plumbers+near+me&lead_generation=true&screenshot=true",
-         "note": "Now accepts search queries instead of direct URLs"
      }

- async def get_top_search_result(query: str):
-     """Perform Google search and return top result URL with CAPTCHA handling"""
-     user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
      async with async_playwright() as p:
-         # Use a proxy to avoid CAPTCHAs
-         proxy_server = "us.proxyrack.net:10000"
-         browser = await p.chromium.launch(
-             headless=True,
-             proxy={
-                 "server": f"http://{proxy_server}",
-                 "username": "your-proxy-username", # Replace with actual credentials
-                 "password": "your-proxy-password"
-             },
-             args=[
-                 '--no-sandbox',
-                 '--disable-setuid-sandbox',
-                 '--disable-dev-shm-usage',
-                 '--disable-accelerated-2d-canvas',
-                 '--no-first-run',
-                 '--no-zygote',
-                 '--disable-gpu'
-             ]
-         )
-         context = await browser.new_context(
-             user_agent=user_agent,
-             locale='en-US',
-             viewport={'width': 1920, 'height': 1080},
-             # Bypass automation detection
-             java_script_enabled=True,
-             bypass_csp=True
-         )
-         page = await context.new_page()
-
          try:
-             logger.info(f"Searching Google for: {query}")
-             await page.goto("https://www.google.com", timeout=60000)

-             # Handle consent form if it appears
-             try:
-                 consent_button = await page.wait_for_selector('button:has-text("Accept all"), button:has-text("I agree")', timeout=5000)
-                 if consent_button:
-                     await consent_button.click()
-                     logger.info("Accepted Google consent form")
-                     await asyncio.sleep(1) # Small delay for consent to apply
-             except:
-                 pass # Consent form didn't appear

-             # Perform search
-             search_box = await page.wait_for_selector('textarea[name="q"]', timeout=10000)
-             await search_box.fill(query)
-             await page.keyboard.press("Enter")
-
-             # Wait for search results - use more reliable method
-             try:
-                 # Check if CAPTCHA appeared
-                 captcha = await page.query_selector('form#captcha-form, div#recaptcha')
-                 if captcha:
-                     logger.error("CAPTCHA encountered during search")
-                     raise Exception("Google CAPTCHA encountered. Cannot proceed with search.")
-
-                 # Wait for search results to appear
-                 await page.wait_for_selector('.g, .tF2Cxc', timeout=30000)
-             except:
-                 # Try alternative search result container
-                 try:
-                     await page.wait_for_selector('#search', timeout=10000)
-                 except:
-                     logger.error("Search results not found")
-                     raise Exception("Search results not found")

-             # Extract top results
-             results = await page.query_selector_all('.g, .tF2Cxc')
-             if not results:
-                 results = await page.query_selector_all('div[data-snf]')

-             if not results:
-                 raise Exception("No search results found")

-             urls = []
-             for result in results[:3]: # Check top 3 results
-                 try:
-                     link = await result.query_selector('a')
-                     if not link:
-                         continue
-
-                     # Extract both data-href and href attributes
-                     data_href = await link.get_attribute('data-href')
-                     href = await link.get_attribute('href')
-                     target_url = data_href or href
-
-                     if target_url and target_url.startswith('/url?q='):
-                         target_url = f"https://www.google.com{target_url}"
-
-                     if target_url and target_url.startswith('https://www.google.com/url?'):
-                         parsed = urlparse(target_url)
-                         qs = parse_qs(parsed.query)
-                         target_url = qs.get('q', [target_url])[0]
-
-                     if target_url and target_url.startswith('http'):
-                         urls.append(target_url)
-                         logger.info(f"Found search result: {target_url}")
-                 except Exception as e:
-                     logger.warning(f"Error processing result: {str(e)}")
-
-             if not urls:
-                 raise Exception("No valid URLs found in search results")

              await browser.close()
-             return urls[0] # Return top result
-
          except Exception as e:
-             logger.error(f"Search failed: {str(e)}")
-             await page.screenshot(path="search_error.png")
              await browser.close()
-             raise

- @app.get("/scrape")
- async def scrape_page(
-     query: str = Query(..., description="Search query to find a website"),
-     lead_generation: bool = Query(True, description="Extract lead generation data"),
-     screenshot: bool = Query(True, description="Take a full page screenshot"),
-     get_links: bool = Query(True, description="Extract all links from the page"),
-     get_body: bool = Query(False, description="Extract body tag content")
  ):
-     logger.info(f"Starting scrape for query: {query}")
-
-     try:
-         # Get top search result URL
-         target_url = await get_top_search_result(query)
-         logger.info(f"Scraping top result: {target_url}")
-     except Exception as e:
-         logger.error(f"Search error: {str(e)}")
-         raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")

- # ... (keep the rest of the scraping function unchanged) ...

  import base64
  import logging
  from typing import List, Optional
+ from urllib.parse import urlparse

+ app = FastAPI(
+     title="Website Quality & Compliance Analyzer",
+     description="API that analyzes websites for SEO, accessibility, compliance and technical quality",
+     version="1.0.0"
+ )
+
+ # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ class SEOResult(BaseModel):
+     title: Optional[str] = None
+     meta_description: Optional[str] = None
+     h1_tags: List[str] = []
+     canonical_url: Optional[str] = None
+     robots_txt_present: bool = False
+     sitemap_present: bool = False
+
+ class AccessibilityResult(BaseModel):
+     missing_alt_tags: int = 0
+     images_without_alt: List[str] = []
+     aria_roles: List[str] = []
+     contrast_issues: List[str] = []
+
+ class ComplianceResult(BaseModel):
+     has_cookie_banner: bool = False
+     gdpr_compliant: Optional[bool] = None
+     has_privacy_policy: bool = False
+     has_terms_of_service: bool = False
+
+ class TechnicalResult(BaseModel):
+     tech_stack: List[str] = []
+     viewport_meta: Optional[str] = None
+     doctype: Optional[str] = None
+     is_https: bool = False
+     has_analytics: bool = False
+
+ class BrokenLink(BaseModel):
+     url: str
+     status: Optional[int] = None
+     text: Optional[str] = None

+ class AnalysisRequest(BaseModel):
+     url: str
+     screenshot: bool = False
+     mobile_test: bool = False
+     check_broken_links: bool = False
+     depth: int = 1 # How many levels deep to check broken links

+ class AnalysisResponse(BaseModel):
+     url: str
+     seo: SEOResult
+     accessibility: AccessibilityResult
+     compliance: ComplianceResult
+     technical: TechnicalResult
+     broken_links: List[BrokenLink] = []
+     mobile_friendly: Optional[bool] = None
+     screenshot_base64: Optional[str] = None
+     load_time: Optional[float] = None
+     success: bool
+     error: Optional[str] = None
+
+ async def analyze_page(page, url: str, options: AnalysisRequest):
+     result = {
+         "url": url,
+         "seo": {},
+         "accessibility": {},
+         "compliance": {},
+         "technical": {},
+         "broken_links": [],
+         "success": True
+     }
+
+     # Basic SEO checks
+     title = await page.title()
+     meta_description = await page.evaluate('''() => {
+         const meta = document.querySelector('meta[name="description"]');
+         return meta ? meta.content : null;
+     }''')
+
+     h1_tags = await page.evaluate('''() => {
+         return Array.from(document.querySelectorAll('h1')).map(h => h.textContent.trim());
+     }''')
+
+     result["seo"] = {
+         "title": title,
+         "meta_description": meta_description,
+         "h1_tags": h1_tags
      }
+
+     # Accessibility checks
+     images_without_alt = await page.evaluate('''() => {
+         return Array.from(document.querySelectorAll('img:not([alt])'))
+             .map(img => img.src);
+     }''')
+
+     result["accessibility"] = {
+         "missing_alt_tags": len(images_without_alt),
+         "images_without_alt": images_without_alt
+     }
+
+     # Compliance checks
+     has_cookie_banner = await page.evaluate('''() => {
+         const keywords = ['cookie', 'gdpr', 'privacy', 'consent'];
+         const elements = document.querySelectorAll('*');
+         for (let el of elements) {
+             const text = el.textContent.toLowerCase();
+             if (keywords.some(kw => text.includes(kw))) {
+                 return true;
+             }
+         }
+         return false;
+     }''')
+
+     result["compliance"] = {
+         "has_cookie_banner": has_cookie_banner
+     }
+
+     # Technical checks
+     tech_stack = []
+
+     # Check for common JS libraries
+     libraries = await page.evaluate('''() => {
+         const libs = [];
+         if (window.jQuery) libs.push('jQuery');
+         if (window.React) libs.push('React');
+         if (window.Vue) libs.push('Vue');
+         if (window.angular) libs.push('Angular');
+         return libs;
+     }''')
+
+     tech_stack.extend(libraries)
+
+     # Check for analytics
+     has_analytics = await page.evaluate('''() => {
+         return !!document.querySelector('script[src*="google-analytics.com"], script[src*="googletagmanager.com"]');
+     }''')
+
+     is_https = url.startswith('https://')
+
+     result["technical"] = {
+         "tech_stack": tech_stack,
+         "is_https": is_https,
+         "has_analytics": has_analytics
+     }
+
+     # Broken links check (if requested)
+     if options.check_broken_links and options.depth > 0:
+         links = await page.evaluate('''() => {
+             return Array.from(document.querySelectorAll('a[href]')).map(a => ({
+                 href: a.href,
+                 text: a.textContent.trim()
+             }));
+         }''')
+
+         # Filter out external links and non-http links
+         domain = urlparse(url).netloc
+         internal_links = [
+             link for link in links
+             if link['href'].startswith('http') and domain in link['href']
+         ][:10] # Limit to 10 links for demo purposes
+
+         broken_links = []
+         for link in internal_links:
+             try:
+                 response = await page.goto(link['href'], wait_until="domcontentloaded")
+                 status = response.status if response else None
+                 if status and status >= 400:
+                     broken_links.append({
+                         "url": link['href'],
+                         "status": status,
+                         "text": link['text']
+                     })
+             except Exception as e:
+                 broken_links.append({
+                     "url": link['href'],
+                     "status": None,
+                     "text": link['text']
+                 })
+
+         result["broken_links"] = broken_links
+
+     return result

+ @app.post("/analyze", response_model=AnalysisResponse)
+ async def analyze_website(request: AnalysisRequest):
+     """Analyze a website for quality and compliance metrics"""
      async with async_playwright() as p:
          try:
+             browser = await p.chromium.launch()
+             context = await browser.new_context()
+             page = await context.new_page()

+             # Start timing
+             start_time = asyncio.get_event_loop().time()

+             # Navigate to the page
+             response = await page.goto(request.url, wait_until="domcontentloaded")
+             if not response or response.status >= 400:
+                 raise HTTPException(status_code=400, detail=f"Failed to load page. Status: {response.status if response else 'unknown'}")

+             # Mobile test if requested
+             if request.mobile_test:
+                 mobile_viewport = {'width': 375, 'height': 667}
+                 await page.set_viewport_size(mobile_viewport)
+                 result = await analyze_page(page, request.url, request)
+                 result["mobile_friendly"] = True # Basic check - would need more sophisticated testing
+             else:
+                 result = await analyze_page(page, request.url, request)

+             # Screenshot if requested
+             if request.screenshot:
+                 screenshot = await page.screenshot(full_page=True)
+                 result["screenshot_base64"] = base64.b64encode(screenshot).decode('utf-8')

+             # Calculate load time
+             end_time = asyncio.get_event_loop().time()
+             result["load_time"] = end_time - start_time

              await browser.close()
+             return result
+
          except Exception as e:
+             logger.error(f"Error analyzing website: {str(e)}")
              await browser.close()
+             raise HTTPException(status_code=500, detail=str(e))

+ @app.get("/analyze", response_model=AnalysisResponse)
+ async def analyze_website_get(
+     url: str = Query(..., description="URL to analyze"),
+     screenshot: bool = Query(False, description="Include screenshot"),
+     mobile_test: bool = Query(False, description="Test mobile responsiveness"),
+     check_broken_links: bool = Query(False, description="Check for broken links"),
+     depth: int = Query(1, description="Depth for broken links check")
  ):
+     """GET endpoint for website analysis"""
+     request = AnalysisRequest(
+         url=url,
+         screenshot=screenshot,
+         mobile_test=mobile_test,
+         check_broken_links=check_broken_links,
+         depth=depth
+     )
+     return await analyze_website(request)

+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
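With the new module in place, the API can be exercised through either endpoint: POST /analyze with a JSON body matching AnalysisRequest, or GET /analyze with query parameters. A minimal client sketch for the GET variant, assuming the server was started via the __main__ block above (port 8000) and using only the standard library; the target URL and parameter values are illustrative:

# query_analyzer.py - hypothetical client for the GET /analyze endpoint
import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({
    "url": "https://example.com",   # site to analyze (example value)
    "screenshot": "false",
    "mobile_test": "false",
    "check_broken_links": "true",
    "depth": 1,
})

with urllib.request.urlopen(f"http://localhost:8000/analyze?{params}", timeout=60) as resp:
    report = json.load(resp)

# Fields mirror the AnalysisResponse model defined above.
print(report["seo"]["title"])
print(report["technical"]["tech_stack"])
print(len(report["broken_links"]))

When the service runs from the container built with the Dockerfile above, the same request goes to port 7860 instead of 8000.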