Spaces:

Sameercodes
/

Amazon_web_scraper

Sleeping

App Files Files Community

Amazon_web_scraper / Scrapper.py

Sameercodes

Update Scrapper.py

bdbe8e1 verified 4 months ago

raw

history blame

5.95 kB

	import time
	import random
	import re
	from datetime import datetime
	import pandas as pd
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.chrome.options import Options
	from selenium.webdriver.chrome.service import Service

	def scrape_amazon(search_term, pincode, num_pages=5):
	    """Scrape Amazon.in search results for ``search_term`` into an Excel file.

	    Starts a headless Chrome session, visits result pages 1..``num_pages``,
	    collects one row per distinct product title and writes all rows to
	    ``<search_term>_scrape_<YYYY-MM-DD>.xlsx`` in the current working
	    directory (characters that are illegal in file names are replaced by
	    ``_``).

	    Args:
	        search_term: Query text. It is URL-encoded before being placed in
	            the search URL and is also stored in the 'Search Term' and
	            'Category' columns.
	        pincode: Stored verbatim in the 'Pincode' column.
	            NOTE(review): it is never applied to the browser session, so
	            the scraped prices are not specific to this pincode.
	        num_pages: Number of result pages to visit, starting at page 1.

	    Returns:
	        The path of the Excel workbook that was written.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import quote_plus

	    options = Options()
	    for flag in (
	        '--headless',
	        '--disable-blink-features=AutomationControlled',
	        '--disable-gpu',
	        '--no-sandbox',
	        '--disable-dev-shm-usage',
	        '--window-size=1920,1080',
	    ):
	        options.add_argument(flag)

	    driver = webdriver.Chrome(service=Service(), options=options)

	    all_products = []
	    seen_titles = set()
	    # Encode once so '&', '#', '+' or spaces in the term cannot corrupt the query.
	    encoded_term = quote_plus(str(search_term))
	    scrape_date = datetime.now().strftime("%d-%m-%Y")

	    try:
	        for page in range(1, num_pages + 1):
	            driver.get(f"https://www.amazon.in/s?k={encoded_term}&page={page}")

	            # Randomised pauses plus a scroll to the bottom before reading results.
	            time.sleep(random.uniform(3, 5))
	            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
	            time.sleep(random.uniform(2, 4))

	            products = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")
	            print(f"Scraping page {page}, found {len(products)} products...")

	            for product in products:
	                title = _element_text(product, ".//h2//span", "No Title")
	                # Rows are de-duplicated by title; results whose title lookup
	                # failed all share the "No Title" placeholder, so only the
	                # first of those is kept.
	                if title in seen_titles:
	                    continue
	                seen_titles.add(title)
	                all_products.append(
	                    _extract_product(product, title, search_term, pincode, scrape_date)
	                )

	            # Pause between pages; nothing follows the last page, so skip it there.
	            if page < num_pages:
	                time.sleep(random.uniform(2, 4))
	    finally:
	        # Always shut the browser down, even if a page load or lookup raised.
	        driver.quit()

	    df = pd.DataFrame(all_products)
	    today_date = datetime.now().strftime("%Y-%m-%d")
	    # Replace characters that cannot appear in a file name on common platforms.
	    safe_term = re.sub(r'[\\/:*?"<>|]+', "_", str(search_term))
	    excel_path = f"{safe_term}_scrape_{today_date}.xlsx"
	    df.to_excel(excel_path, index=False)

	    return excel_path


	# --- private helpers for scrape_amazon -------------------------------------

	# Number followed by an optional space and a unit, e.g. "500ml", "1.5 kg".
	_GRAMMAGE_RE = re.compile(r'(\d+\.?\d*\s?(ml|g|kg|l))')

	# Lower-cases the node text inside XPath 1.0, which has no lower-case() function.
	_XPATH_LOWER_TEXT = "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"

	# Secondary-coloured span whose text mentions any of the deal keywords.
	_DEAL_BADGE_XPATH = (
	    ".//div[contains(@class, 'a-color-secondary')]//span["
	    + " or ".join(
	        f"contains({_XPATH_LOWER_TEXT}, '{keyword}')"
	        for keyword in ("deal", "coupon", "save", "limited")
	    )
	    + "]"
	)


	def _element_text(parent, xpath, default, attribute=None):
	    """Return the stripped text (or ``attribute``) of the first match, else ``default``."""
	    try:
	        element = parent.find_element(By.XPATH, xpath)
	        value = element.text if attribute is None else element.get_attribute(attribute)
	    except Exception:
	        # Best-effort field: a missing or stale element yields the placeholder.
	        return default
	    return default if value is None else value.strip()


	def _has_element(parent, xpath):
	    """Return True when ``xpath`` matches at least one element under ``parent``."""
	    try:
	        parent.find_element(By.XPATH, xpath)
	    except Exception:
	        return False
	    return True


	def _clean_price(raw):
	    """Strip the rupee sign, thousands separators and surrounding whitespace."""
	    return raw.replace('₹', '').replace(',', '').strip()


	def _discount_percent(selling_price, mrp):
	    """Return the discount of ``selling_price`` against ``mrp`` in percent, or 0.0."""
	    try:
	        mrp_value = float(mrp)
	        return round(100 * (mrp_value - float(selling_price)) / mrp_value, 2)
	    except (ValueError, ZeroDivisionError):
	        # Placeholder or non-numeric prices, or an MRP of zero.
	        return 0.0


	def _extract_product(product, title, search_term, pincode, scrape_date):
	    """Build the output row (a dict keyed by column name) for one search result."""
	    link = _element_text(
	        product, ".//a[@class='a-link-normal s-no-outline']", "No Link", attribute='href'
	    )
	    if link.startswith("/"):
	        link = "https://www.amazon.in" + link

	    selling_price = _clean_price(
	        _element_text(product, ".//span[@class='a-price-whole']", "")
	    )
	    if not selling_price:
	        # Read the fallback via textContent, as the MRP lookup does: Selenium's
	        # .text only returns rendered text.
	        selling_price = _clean_price(
	            _element_text(product, ".//span[@class='a-offscreen']", "", attribute="textContent")
	        )
	    selling_price = selling_price or "No Price"

	    mrp = _clean_price(
	        _element_text(
	            product,
	            ".//span[@class='a-price a-text-price']//span[@class='a-offscreen']",
	            "",
	            attribute="textContent",
	        )
	    ) or "No Price"

	    grammage_match = _GRAMMAGE_RE.search(title.lower())
	    grammage = grammage_match.group(0) if grammage_match else "No Grammage"

	    # The rating is the first whitespace-separated token of the icon's alt text.
	    rating_tokens = _element_text(
	        product, ".//span[@class='a-icon-alt']", "", attribute="textContent"
	    ).split()
	    rating = rating_tokens[0] if rating_tokens else "No Rating"

	    is_sponsored = _has_element(
	        product,
	        ".//span[contains(@class, 'a-color-secondary') and contains(text(), 'Sponsored')]",
	    )

	    return {
	        'Title': title,
	        'Grammage': grammage,
	        'Selling Price': selling_price,
	        'MRP': mrp,
	        'Discount %': _discount_percent(selling_price, mrp),
	        'Deal Tags': _element_text(product, _DEAL_BADGE_XPATH, "No Deal"),
	        'Quantity Bought': _element_text(
	            product, ".//span[contains(text(),'bought in past month')]", "No data"
	        ),
	        'Rating': rating,
	        'Reviews': _element_text(
	            product, ".//a[contains(@aria-label,'ratings')]/span", "No Reviews"
	        ),
	        'Link': link,
	        'Ad/Not Ad': "Ad" if is_sponsored else "Not Ad",
	        'Date': scrape_date,
	        'Search Term': search_term,
	        'Pincode': pincode,
	        'Category': search_term,
	    }

	def scrape_amazon_interface(search_term, pincode, num_pages):
	    """Run ``scrape_amazon`` with the given arguments and return its result.

	    Thin pass-through: all three arguments are forwarded positionally and
	    the Excel path returned by ``scrape_amazon`` is handed back unchanged.
	    """
	    return scrape_amazon(search_term, pincode, num_pages)