import random
import re
import time
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

_BASE_URL = "https://www.amazon.in"

_CHROME_FLAGS = (
    '--headless',
    '--disable-blink-features=AutomationControlled',
    '--disable-gpu',
    '--no-sandbox',
)

# Column order of the exported sheet; also guarantees a header row when a
# scrape returns no products at all.
_COLUMNS = [
    'Title', 'Grammage', 'Selling Price', 'MRP', 'Discount %', 'Deal Tags',
    'Quantity Bought', 'Rating', 'Reviews', 'Link', 'Ad/Not Ad', 'Date',
    'Search Term', 'Pincode', 'Category',
]

_RESULT_XPATH = "//div[@data-component-type='s-search-result']"
_TITLE_XPATH = ".//h2//span"
_LINK_XPATH = ".//a[@class='a-link-normal s-no-outline']"
_PRICE_WHOLE_XPATH = ".//span[@class='a-price-whole']"
_PRICE_OFFSCREEN_XPATH = ".//span[@class='a-offscreen']"
_MRP_XPATH = ".//span[@class='a-price a-text-price']//span[@class='a-offscreen']"
_QTY_XPATH = ".//span[contains(text(),'bought in past month')]"
_RATING_XPATH = ".//span[@class='a-icon-alt']"
_REVIEWS_XPATH = ".//a[contains(@aria-label,'ratings')]/span"
_SPONSORED_XPATH = (
    ".//span[contains(@class, 'a-color-secondary') "
    "and contains(text(), 'Sponsored')]"
)

# XPath 1.0 has no lower-case(); translate() is the usual case-folding trick.
_LOWERCASED_TEXT = (
    "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
    "'abcdefghijklmnopqrstuvwxyz')"
)
_DEAL_KEYWORDS = ('deal', 'coupon', 'save', 'limited')
_DEAL_XPATH = (
    ".//div[contains(@class, 'a-color-secondary')]//span["
    + " or ".join(
        f"contains({_LOWERCASED_TEXT}, '{keyword}')"
        for keyword in _DEAL_KEYWORDS
    )
    + "]"
)

_GRAMMAGE_RE = re.compile(r'(\d+\.?\d*\s?(ml|g|kg|l))')
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]+')


def _read(parent, xpath, attribute=None):
    """Return the stripped text (or *attribute*) of the first XPath match.

    Returns None when nothing matches, when the lookup fails with a
    WebDriver error (missing or stale element, ...), or when the value is
    empty, so callers can substitute their own placeholder.
    """
    try:
        node = parent.find_element(By.XPATH, xpath)
        raw = node.text if attribute is None else node.get_attribute(attribute)
    except WebDriverException:
        return None
    cleaned = (raw or "").strip()
    return cleaned or None


def _exists(parent, xpath):
    """Return True when at least one element under *parent* matches *xpath*."""
    try:
        parent.find_element(By.XPATH, xpath)
    except WebDriverException:
        return False
    return True


def _clean_price(raw):
    """Strip the rupee sign, thousands separators and whitespace from a price."""
    if not raw:
        return None
    return raw.replace('₹', '').replace(',', '').strip() or None


def _discount_percent(selling_price, mrp):
    """Return the discount of *selling_price* against *mrp* in percent.

    Yields 0.0 when either value is missing, non-numeric, or the MRP is zero.
    """
    try:
        mrp_value = float(mrp)
        return round(100 * (mrp_value - float(selling_price)) / mrp_value, 2)
    except (TypeError, ValueError, ZeroDivisionError):
        return 0.0


def _build_row(product, title, search_term, pincode, scrape_date):
    """Collect every exported field of one search-result element into a dict."""
    href = _read(product, _LINK_XPATH, 'href')
    if href and href.startswith("/"):
        href = _BASE_URL + href

    # The off-screen price span is not rendered, so WebElement.text is empty
    # for it; its value has to be read through textContent instead.
    selling_price = (
        _clean_price(_read(product, _PRICE_WHOLE_XPATH))
        or _clean_price(_read(product, _PRICE_OFFSCREEN_XPATH, "textContent"))
    )
    mrp = _clean_price(_read(product, _MRP_XPATH, "textContent"))

    grammage_match = _GRAMMAGE_RE.search((title or "").lower())
    rating_text = _read(product, _RATING_XPATH, "textContent")

    return {
        'Title': title or "No Title",
        'Grammage': grammage_match.group(0) if grammage_match else "No Grammage",
        'Selling Price': selling_price or "No Price",
        'MRP': mrp or "No Price",
        'Discount %': _discount_percent(selling_price, mrp),
        'Deal Tags': _read(product, _DEAL_XPATH) or "No Deal",
        'Quantity Bought': _read(product, _QTY_XPATH) or "No data",
        'Rating': rating_text.split()[0] if rating_text else "No Rating",
        'Reviews': _read(product, _REVIEWS_XPATH) or "No Reviews",
        'Link': href or "No Link",
        'Ad/Not Ad': "Ad" if _exists(product, _SPONSORED_XPATH) else "Not Ad",
        'Date': scrape_date,
        'Search Term': search_term,
        'Pincode': pincode,
        'Category': search_term,
    }


def scrape_amazon(search_term, pincode, num_pages=5):
    """Scrape amazon.in search results for *search_term* into an Excel file.

    Opens headless Chrome, walks result pages 1..num_pages, extracts one row
    per distinct product title and writes the rows to
    ``<search_term>_scrape_<YYYY-MM-DD>.xlsx`` in the working directory.

    Args:
        search_term: Query text; it is URL-encoded before being sent and is
            also stored in the 'Search Term' and 'Category' columns.
        pincode: Stored verbatim in the 'Pincode' column. This function does
            not use it to change the delivery location on the site.
        num_pages: Number of result pages to visit. Converted with ``int()``
            so a float coming from a UI widget is accepted.

    Returns:
        The name of the Excel file that was written.

    Fields that cannot be read (or are empty) fall back to placeholders such
    as "No Price" or "No Rating"; the discount falls back to 0.0.
    """
    num_pages = int(num_pages)
    encoded_term = quote_plus(str(search_term))
    started = datetime.now()
    scrape_date = started.strftime("%d-%m-%Y")

    options = Options()
    for flag in _CHROME_FLAGS:
        options.add_argument(flag)
    driver = webdriver.Chrome(service=Service(), options=options)

    all_products = []
    seen_titles = set()
    try:
        for page in range(1, num_pages + 1):
            driver.get(f"{_BASE_URL}/s?k={encoded_term}&page={page}")
            time.sleep(random.uniform(3, 5))
            # Scroll to the bottom so lazily loaded results get rendered.
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(random.uniform(2, 4))

            products = driver.find_elements(By.XPATH, _RESULT_XPATH)
            print(f"Scraping page {page}, found {len(products)} products...")

            for product in products:
                title = _read(product, _TITLE_XPATH)
                # De-duplicate on real titles only; otherwise every product
                # whose title could not be read would collapse into one row.
                if title is not None:
                    if title in seen_titles:
                        continue
                    seen_titles.add(title)
                all_products.append(
                    _build_row(product, title, search_term, pincode, scrape_date)
                )

            # Pause between pages; pointless after the final one.
            if page < num_pages:
                time.sleep(random.uniform(2, 4))
    finally:
        # Always release the browser, even when a page load or lookup fails.
        driver.quit()

    df = pd.DataFrame(all_products, columns=_COLUMNS)
    today_date = started.strftime("%Y-%m-%d")
    # Keep path separators and other reserved characters out of the filename.
    safe_term = _UNSAFE_FILENAME_CHARS.sub("_", str(search_term))
    filename_base = f"{safe_term}_scrape_{today_date}.xlsx"
    df.to_excel(filename_base, index=False)
    print(f"\nSaved: {filename_base}")
    return filename_base


def scrape_amazon_interface(search_term, pincode, num_pages):
    """Interface function for Gradio: forwards its inputs to scrape_amazon."""
    return scrape_amazon(search_term, pincode, num_pages)