# Amazon.in search-results scraper with a Gradio front end.
# Requires: selenium, pandas, openpyxl (for .xlsx export), gradio.
import time
import random
import re
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import gradio as gr
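
# The UI collects a pincode, but the scrape below never applies it to the
# browser session, so results reflect Amazon's default delivery location.
# What follows is a minimal, optional sketch of setting the pincode via the
# "Deliver to" popover. The element ids used (nav-global-location-popover-link,
# GLUXZipUpdateInput, GLUXZipUpdate) are an assumption based on Amazon's
# current markup, which changes often; treat this as a best-effort starting
# point, not a stable API.
def set_delivery_pincode(driver, pincode, timeout=10):
    """Best-effort attempt to set the delivery pincode on amazon.in."""
    try:
        wait = WebDriverWait(driver, timeout)
        driver.get("https://www.amazon.in")
        # Open the "Deliver to" popover in the nav bar.
        wait.until(EC.element_to_be_clickable(
            (By.ID, "nav-global-location-popover-link"))).click()
        # Type the pincode and apply it.
        zip_input = wait.until(EC.presence_of_element_located(
            (By.ID, "GLUXZipUpdateInput")))
        zip_input.clear()
        zip_input.send_keys(pincode)
        driver.find_element(By.ID, "GLUXZipUpdate").click()
        time.sleep(random.uniform(2, 3))  # let the location refresh
        return True
    except (NoSuchElementException, TimeoutException):
        return False  # fall back to the default location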

def scrape_amazon(search_term, pincode, num_pages=5):
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(), options=options)

    # Optionally apply the delivery pincode (best effort; see sketch above).
    # set_delivery_pincode(driver, pincode)

    all_products = []
    seen_titles = set()

    try:
        for page in range(1, num_pages + 1):
            # Session-specific crid/sprefix parameters are dropped; only the
            # URL-encoded search term and the page number are needed.
            url = f"https://www.amazon.in/s?k={quote_plus(search_term)}&page={page}"
            driver.get(url)

            # Wait for the result grid instead of relying on sleep alone.
            try:
                WebDriverWait(driver, 15).until(EC.presence_of_all_elements_located(
                    (By.XPATH, "//div[@data-component-type='s-search-result']")))
            except TimeoutException:
                print(f"Page {page}: no results loaded, skipping.")
                continue
            time.sleep(random.uniform(1, 2))

            # Scroll down to trigger lazy-loaded content.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(2, 4))

            products = driver.find_elements(
                By.XPATH, "//div[@data-component-type='s-search-result']")
            print(f"Scraping page {page}, found {len(products)} products...")

            for product in products:
                # Title (also used for de-duplication across pages)
                try:
                    title = product.find_element(By.XPATH, ".//h2//span").text.strip()
                except NoSuchElementException:
                    title = "No Title"
                if title in seen_titles:
                    continue
                seen_titles.add(title)

                # Link (relative hrefs resolve against amazon.in, not amazon.com)
                try:
                    link = product.find_element(
                        By.XPATH, ".//a[@class='a-link-normal s-no-outline']"
                    ).get_attribute('href')
                    if link and link.startswith("/"):
                        link = "https://www.amazon.in" + link
                except NoSuchElementException:
                    link = "No Link"

                # Selling price, with the hidden a-offscreen span as fallback
                try:
                    selling_price = product.find_element(
                        By.XPATH, ".//span[@class='a-price-whole']"
                    ).text.replace(',', '').strip()
                except NoSuchElementException:
                    try:
                        selling_price = product.find_element(
                            By.XPATH, ".//span[@class='a-offscreen']"
                        ).text.replace('₹', '').replace(',', '').strip()
                    except NoSuchElementException:
                        selling_price = "No Price"

                # MRP (strike-through price); falls back to the selling price
                try:
                    mrp = product.find_element(
                        By.XPATH,
                        ".//span[@class='a-price a-text-price']"
                        "//span[@class='a-offscreen']"
                    ).text.replace('₹', '').replace(',', '').strip()
                except NoSuchElementException:
                    mrp = selling_price

                # Discount %
                try:
                    if selling_price != "No Price" and mrp != "No Price":
                        discount_percent = round(
                            100 * (float(mrp) - float(selling_price)) / float(mrp), 2)
                    else:
                        discount_percent = 0.0
                except (ValueError, ZeroDivisionError):
                    discount_percent = 0.0

                # Grammage (e.g. "500 g", "1kg", "200ml") parsed from the title;
                # the trailing \b avoids matching units inside longer words.
                grammage_match = re.search(
                    r'(\d+\.?\d*\s?(ml|g|kg|l)\b)', title.lower())
                grammage = grammage_match.group(0) if grammage_match else "No Grammage"

                # Deal tags (XPath 1.0 has no lower-case(), hence translate())
                lower = ("translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
                         "'abcdefghijklmnopqrstuvwxyz')")
                condition = " or ".join(
                    f"contains({lower}, '{kw}')"
                    for kw in ('deal', 'coupon', 'save', 'limited'))
                try:
                    deal_tag = product.find_element(
                        By.XPATH,
                        f".//div[contains(@class, 'a-color-secondary')]//span[{condition}]"
                    ).text.strip()
                except NoSuchElementException:
                    deal_tag = "No Deal"

                # Quantity bought in the past month
                try:
                    qty = product.find_element(
                        By.XPATH, ".//span[contains(text(),'bought in past month')]"
                    ).text.strip()
                except NoSuchElementException:
                    qty = "No data"

                # Rating (first token of e.g. "4.3 out of 5 stars")
                try:
                    rating = product.find_element(
                        By.XPATH, ".//span[contains(@aria-label,'out of 5 stars')]"
                    ).get_attribute("aria-label").split()[0]
                except NoSuchElementException:
                    rating = "No Rating"

                # Review count
                try:
                    reviews = product.find_element(
                        By.XPATH, ".//a[contains(@aria-label,'ratings')]/span"
                    ).text.strip()
                except NoSuchElementException:
                    reviews = "No Reviews"

                # Sponsored / organic detection
                try:
                    product.find_element(
                        By.XPATH,
                        ".//span[contains(@class, 'puis-sponsored-label-text') "
                        "and contains(text(), 'Sponsored')]")
                    ad_status = "Ad"
                except NoSuchElementException:
                    ad_status = "Not Ad"

                all_products.append({
                    'Title': title,
                    'Grammage': grammage,
                    'Selling Price': selling_price,
                    'MRP': mrp,
                    'Discount %': discount_percent,
                    'Deal Tags': deal_tag,
                    'Quantity Bought': qty,
                    'Rating': rating,
                    'Reviews': reviews,
                    'Link': link,
                    'Ad/Not Ad': ad_status,
                    'Date': datetime.now().strftime("%d-%m-%Y"),
                    'Search Term': search_term,
                    'Pincode': pincode,
                    'Category': search_term,
                })

            time.sleep(random.uniform(2, 4))  # pause between pages
    finally:
        driver.quit()  # always release the browser, even on errors

    df = pd.DataFrame(all_products)

    # Save outputs (underscores keep the filenames readable)
    today_date = datetime.now().strftime("%Y-%m-%d")
    filename_base = f"{search_term.replace(' ', '_')}_scrape_{today_date}"
    excel_path = f"{filename_base}.xlsx"
    csv_path = f"{filename_base}.csv"
    json_path = f"{filename_base}.json"
    df.to_excel(excel_path, index=False)
    df.to_csv(csv_path, index=False)
    df.to_json(json_path, orient="records", lines=True)

    return excel_path, csv_path, json_path


def scrape_and_return_files(product_name, pincode, num_pages):
    return scrape_amazon(product_name, pincode, int(num_pages))


with gr.Blocks() as demo:
    gr.Markdown("## 🛒 Amazon Scraper")
    with gr.Row():
        product_name = gr.Textbox(label="Product Name", placeholder="e.g., atta")
        pincode = gr.Textbox(label="Pincode", placeholder="e.g., 400076")
        num_pages = gr.Number(label="Number of Pages", value=2)
    scrape_button = gr.Button("Scrape Amazon!")
    output_excel = gr.File(label="Download Excel (.xlsx)")
    output_csv = gr.File(label="Download CSV (.csv)")
    output_json = gr.File(label="Download JSON (.json)")
    scrape_button.click(
        scrape_and_return_files,
        inputs=[product_name, pincode, num_pages],
        outputs=[output_excel, output_csv, output_json],
    )

demo.launch(share=True)