import random
import re
import time
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

BASE_URL = "https://www.amazon.in"

# Sentinel values written to the sheet when a field cannot be extracted.
NO_PRICE = "No Price"

# Quantity + unit as it appears in a lower-cased product title (e.g. "500 ml").
_GRAMMAGE_RE = re.compile(r'(\d+\.?\d*\s?(ml|g|kg|l))')

# Characters that are path separators or illegal in file names on common
# platforms; replaced so an arbitrary search term cannot break the output path.
_UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')

# XPath 1.0 has no lower-case(); translate() is the usual workaround.
_LOWERCASE_TEXT = (
    "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
    "'abcdefghijklmnopqrstuvwxyz')"
)
_DEAL_KEYWORDS = ("deal", "coupon", "save", "limited")
_DEAL_BADGE_XPATH = (
    ".//div[contains(@class, 'a-color-secondary')]//span["
    + " or ".join(
        f"contains({_LOWERCASE_TEXT}, '{keyword}')" for keyword in _DEAL_KEYWORDS
    )
    + "]"
)


def _build_chrome_options() -> Options:
    """Return the headless Chrome options used for every scraping session."""
    options = Options()
    for flag in (
        '--headless',
        '--disable-blink-features=AutomationControlled',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--window-size=1920,1080',
    ):
        options.add_argument(flag)
    return options


def _find_text(product, xpath: str, default: str, *, attribute=None) -> str:
    """Return the stripped text of the first match of *xpath* under *product*.

    When *attribute* is given, that DOM attribute/property is read instead of
    the rendered text (needed for visually hidden elements, whose ``.text`` is
    empty). *default* is returned if the element is missing, stale, or the
    attribute is absent.
    """
    try:
        element = product.find_element(By.XPATH, xpath)
        raw = element.text if attribute is None else element.get_attribute(attribute)
    except WebDriverException:
        return default
    if raw is None:
        return default
    return raw.strip()


def _clean_price(raw: str) -> str:
    """Strip the rupee sign, thousands separators and surrounding whitespace."""
    return raw.replace('₹', '').replace(',', '').strip()


def _extract_link(product) -> str:
    """Return the absolute product URL, or "No Link" if none is available."""
    try:
        link = product.find_element(
            By.XPATH, ".//a[@class='a-link-normal s-no-outline']"
        ).get_attribute('href')
    except WebDriverException:
        return "No Link"
    if not link:
        return "No Link"
    if link.startswith("/"):
        link = BASE_URL + link
    return link


def _extract_selling_price(product) -> str:
    """Return the selling price as a digit string, or "No Price"."""
    price = _clean_price(
        _find_text(product, ".//span[@class='a-price-whole']", "")
    )
    if price:
        return price
    # 'a-offscreen' spans are visually hidden, so their rendered text is
    # empty; read textContent instead (same approach as the MRP lookup).
    price = _clean_price(
        _find_text(
            product, ".//span[@class='a-offscreen']", "", attribute="textContent"
        )
    )
    return price or NO_PRICE


def _extract_mrp(product) -> str:
    """Return the struck-through list price as a digit string, or "No Price"."""
    mrp = _clean_price(
        _find_text(
            product,
            ".//span[@class='a-price a-text-price']//span[@class='a-offscreen']",
            "",
            attribute="textContent",
        )
    )
    return mrp or NO_PRICE


def _discount_percent(selling_price: str, mrp: str) -> float:
    """Return the discount of *selling_price* off *mrp* in percent.

    Returns 0.0 when either price is missing, non-numeric, or the MRP is zero.
    """
    try:
        mrp_value = float(mrp)
        selling_value = float(selling_price)
        return round(100 * (mrp_value - selling_value) / mrp_value, 2)
    except (TypeError, ValueError, ZeroDivisionError):
        return 0.0


def _extract_grammage(title: str) -> str:
    """Return the first quantity/unit found in *title*, or "No Grammage"."""
    match = _GRAMMAGE_RE.search(title.lower())
    return match.group(0) if match else "No Grammage"


def _extract_rating(product) -> str:
    """Return the leading token of the star-rating label, or "No Rating"."""
    label = _find_text(
        product, ".//span[@class='a-icon-alt']", "", attribute="textContent"
    )
    tokens = label.split()
    return tokens[0] if tokens else "No Rating"


def _is_sponsored(product) -> bool:
    """Return True if the result card carries a 'Sponsored' label."""
    try:
        product.find_element(
            By.XPATH,
            ".//span[contains(@class, 'a-color-secondary') "
            "and contains(text(), 'Sponsored')]",
        )
    except WebDriverException:
        return False
    return True


def _extract_product(product, title: str, search_term, pincode) -> dict:
    """Build one output row from a search-result element."""
    selling_price = _extract_selling_price(product)
    mrp = _extract_mrp(product)
    return {
        'Title': title,
        'Grammage': _extract_grammage(title),
        'Selling Price': selling_price,
        'MRP': mrp,
        'Discount %': _discount_percent(selling_price, mrp),
        'Deal Tags': _find_text(product, _DEAL_BADGE_XPATH, "No Deal"),
        'Quantity Bought': _find_text(
            product,
            ".//span[contains(text(),'bought in past month')]",
            "No data",
        ),
        'Rating': _extract_rating(product),
        'Reviews': _find_text(
            product, ".//a[contains(@aria-label,'ratings')]/span", "No Reviews"
        ),
        'Link': _extract_link(product),
        'Ad/Not Ad': "Ad" if _is_sponsored(product) else "Not Ad",
        'Date': datetime.now().strftime("%d-%m-%Y"),
        'Search Term': search_term,
        'Pincode': pincode,
        'Category': search_term,
    }


def scrape_amazon(search_term, pincode, num_pages=5):
    """Scrape amazon.in search results for *search_term* into an Excel file.

    Visits result pages 1..*num_pages* in headless Chrome, extracts one row
    per distinct product title, and writes them to
    ``<search_term>_scrape_<YYYY-MM-DD>.xlsx`` in the current directory.

    Args:
        search_term: Query to search for; also stored in the 'Search Term'
            and 'Category' columns and used in the output file name.
        pincode: Stored in the 'Pincode' column only. NOTE: it is not applied
            to the browser session, so results reflect the default location.
        num_pages: Number of result pages to visit.

    Returns:
        Path of the Excel file that was written.
    """
    driver = webdriver.Chrome(service=Service(), options=_build_chrome_options())
    all_products = []
    seen_titles = set()
    # Encode once so spaces, '&', '#', etc. cannot corrupt the query string.
    encoded_term = quote_plus(str(search_term))

    try:
        for page in range(1, num_pages + 1):
            driver.get(f"{BASE_URL}/s?k={encoded_term}&page={page}")
            time.sleep(random.uniform(3, 5))

            # Scroll to the bottom so lazily loaded results are rendered.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(2, 4))

            products = driver.find_elements(
                By.XPATH, "//div[@data-component-type='s-search-result']"
            )
            print(f"Scraping page {page}, found {len(products)} products...")

            for product in products:
                title = _find_text(product, ".//h2//span", "No Title")
                # Titles are the de-duplication key across pages.
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                all_products.append(
                    _extract_product(product, title, search_term, pincode)
                )

            # Randomised pause between pages.
            time.sleep(random.uniform(2, 4))
    finally:
        # Always release the browser, even if a page load raised.
        driver.quit()

    df = pd.DataFrame(all_products)
    today_date = datetime.now().strftime("%Y-%m-%d")
    safe_term = _UNSAFE_FILENAME_CHARS_RE.sub("_", str(search_term))
    excel_path = f"{safe_term}_scrape_{today_date}.xlsx"
    df.to_excel(excel_path, index=False)
    return excel_path


def scrape_amazon_interface(search_term, pincode, num_pages):
    """Run :func:`scrape_amazon` and return the path of the Excel file.

    Thin wrapper with all three arguments required (no default page count).
    """
    return scrape_amazon(search_term, pincode, num_pages)