Spaces:

Sameercodes
/

Amazon_web_scraper

Sleeping

App Files Community

Amazon_web_scraper / Scraper.py

Sameercodes

Update Scraper.py

8573458 verified 4 months ago

raw

history blame

5.95 kB

	import random
	import re
	import time
	from datetime import datetime
	from urllib.parse import quote_plus

	import pandas as pd
	from selenium import webdriver
	from selenium.common.exceptions import WebDriverException
	from selenium.webdriver.chrome.options import Options
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.support.ui import WebDriverWait

	# Quantity-with-unit pattern applied to the lower-cased title. The units are
	# a real alternation; an escaped "\|" would only match a literal pipe.
	_GRAMMAGE_RE = re.compile(r'(\d+\.?\d*\s?(ml|g|kg|l))')

	# XPath 1.0 has no lower-case(); translate() is the usual case-folding idiom.
	_LOWERED_TEXT = "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
	_DEAL_BADGE_XPATH = (
	    ".//div[contains(@class, 'a-color-secondary')]//span["
	    + " or ".join(
	        f"contains({_LOWERED_TEXT}, '{keyword}')"
	        for keyword in ('deal', 'coupon', 'save', 'limited')
	    )
	    + "]"
	)


	def _read_node(parent, xpath, attribute=None):
	    """Return the text (or *attribute*) of the first XPath match under *parent*.

	    Returns None when the lookup or read raises a WebDriver error, or when the
	    requested attribute is absent, so callers can substitute a placeholder.
	    """
	    try:
	        node = parent.find_element(By.XPATH, xpath)
	        return node.text if attribute is None else node.get_attribute(attribute)
	    except WebDriverException:
	        return None


	def _clean_price(raw):
	    """Strip the rupee sign, thousands separators and surrounding whitespace."""
	    return raw.replace('₹', '').replace(',', '').strip()


	def _discount_percent(selling_price, mrp):
	    """Return the percentage discount of *selling_price* against *mrp*, or 0.0."""
	    if selling_price == "No Price" or mrp == "No Price":
	        return 0.0
	    try:
	        mrp_value = float(mrp)
	        return round(100 * (mrp_value - float(selling_price)) / mrp_value, 2)
	    except (ValueError, ZeroDivisionError):
	        # Unparseable price text or a zero MRP: report no discount.
	        return 0.0


	def _product_title(product):
	    """Return the stripped title of a search-result card, or "No Title"."""
	    raw = _read_node(product, ".//h2//span")
	    return raw.strip() if raw is not None else "No Title"


	def _product_row(product, title):
	    """Collect the per-product columns for one search-result card.

	    Every field is best-effort: a missing element yields that column's
	    placeholder string instead of aborting the scrape.
	    """
	    link = _read_node(product, ".//a[@class='a-link-normal s-no-outline']", 'href')
	    if not link:
	        link = "No Link"
	    elif link.startswith("/"):
	        link = "https://www.amazon.in" + link

	    raw_price = _read_node(product, ".//span[@class='a-price-whole']")
	    if raw_price is None:
	        # NOTE(review): read textContent here, as the MRP lookup below does for
	        # the same kind of a-offscreen span; .text looks unreliable for it.
	        raw_price = _read_node(product, ".//span[@class='a-offscreen']", "textContent")
	    selling_price = _clean_price(raw_price) if raw_price is not None else "No Price"

	    raw_mrp = _read_node(
	        product,
	        ".//span[@class='a-price a-text-price']//span[@class='a-offscreen']",
	        "textContent",
	    )
	    mrp = _clean_price(raw_mrp) if raw_mrp is not None else "No Price"

	    grammage_match = _GRAMMAGE_RE.search(title.lower())
	    grammage = grammage_match.group(0) if grammage_match else "No Grammage"

	    raw_badge = _read_node(product, _DEAL_BADGE_XPATH)
	    deal_tag = raw_badge.strip() if raw_badge is not None else "No Deal"

	    raw_qty = _read_node(product, ".//span[contains(text(),'bought in past month')]")
	    qty = raw_qty.strip() if raw_qty is not None else "No data"

	    # The first whitespace-separated token of the icon's alt text is kept as
	    # the rating value.
	    rating_words = (_read_node(product, ".//span[@class='a-icon-alt']", "textContent") or "").split()
	    rating = rating_words[0] if rating_words else "No Rating"

	    raw_reviews = _read_node(product, ".//a[contains(@aria-label,'ratings')]/span")
	    reviews = raw_reviews.strip() if raw_reviews is not None else "No Reviews"

	    sponsored = _read_node(
	        product,
	        ".//span[contains(@class, 'a-color-secondary') and contains(text(), 'Sponsored')]",
	    )
	    ad_status = "Ad" if sponsored is not None else "Not Ad"

	    return {
	        'Title': title,
	        'Grammage': grammage,
	        'Selling Price': selling_price,
	        'MRP': mrp,
	        'Discount %': _discount_percent(selling_price, mrp),
	        'Deal Tags': deal_tag,
	        'Quantity Bought': qty,
	        'Rating': rating,
	        'Reviews': reviews,
	        'Link': link,
	        'Ad/Not Ad': ad_status,
	    }


	def scrape_amazon(search_term, pincode, num_pages=5):
	    """Scrape amazon.in search results into an Excel workbook.

	    Loads ``num_pages`` result pages for ``search_term`` in headless Chrome,
	    extracts one row per distinct product title, and writes the rows to
	    ``"{search_term}_scrape_{YYYY-MM-DD}.xlsx"`` in the working directory.

	    Args:
	        search_term: Query text; also stored in the "Search Term" and
	            "Category" columns and used in the output filename.
	        pincode: Stored in the "Pincode" column only; it is not applied to
	            the browsing session.
	        num_pages: Number of result pages to visit, starting at page 1.

	    Returns:
	        The filename of the workbook that was written.
	    """
	    # Accept whole-number floats as well as ints; range() rejects floats.
	    num_pages = int(num_pages)

	    options = Options()
	    options.add_argument('--headless')
	    options.add_argument('--disable-blink-features=AutomationControlled')
	    options.add_argument('--disable-gpu')
	    options.add_argument('--no-sandbox')

	    # One timestamp per run keeps the row dates and the filename consistent.
	    run_started = datetime.now()
	    # Encode the query so spaces, '&', '#', etc. cannot corrupt the URL.
	    encoded_query = quote_plus(str(search_term))

	    all_products = []
	    seen_titles = set()

	    driver = webdriver.Chrome(service=Service(), options=options)
	    try:
	        for page in range(1, num_pages + 1):
	            driver.get(f"https://www.amazon.in/s?k={encoded_query}&page={page}")

	            # Randomised pauses around the page load and the scroll to the bottom.
	            time.sleep(random.uniform(3, 5))
	            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
	            time.sleep(random.uniform(2, 4))

	            products = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")
	            print(f"Scraping page {page}, found {len(products)} products...")

	            for product in products:
	                title = _product_title(product)
	                # De-duplicate on title before doing the remaining lookups.
	                if title in seen_titles:
	                    continue
	                seen_titles.add(title)

	                row = _product_row(product, title)
	                row.update({
	                    'Date': run_started.strftime("%d-%m-%Y"),
	                    'Search Term': search_term,
	                    'Pincode': pincode,
	                    'Category': search_term,
	                })
	                all_products.append(row)

	            time.sleep(random.uniform(2, 4))
	    finally:
	        # Always shut the browser down, even if a page load or lookup raised.
	        driver.quit()

	    df = pd.DataFrame(all_products)

	    today_date = run_started.strftime("%Y-%m-%d")
	    filename_base = f"{search_term}_scrape_{today_date}.xlsx"
	    df.to_excel(filename_base, index=False)

	    print(f"\nSaved: {filename_base}")
	    return filename_base


	# Gradio-facing entry point: a thin adapter over scrape_amazon.
	def scrape_amazon_interface(search_term, pincode, num_pages):
	    """Run the scraper with the given inputs and return its result.

	    All three arguments are forwarded unchanged to ``scrape_amazon``; the
	    value returned is whatever that call returns.
	    """
	    saved_file = scrape_amazon(search_term, pincode, num_pages)
	    return saved_file