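"""Flipkart laptop scraper with a Gradio front end.

Loads a Flipkart category page in headless Selenium (so JavaScript-rendered
listings are available), parses the HTML with BeautifulSoup, and offers the
results as a downloadable CSV.
"""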
import random
import time

import chromedriver_autoinstaller
import gradio as gr
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# List of user agents to avoid bot detection
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
]
# Function to initialize a headless Selenium driver
def get_driver():
    # Install a matching chromedriver if one is not already available
    chromedriver_autoinstaller.install()
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run without a visible browser window
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Rotate user agents to reduce the chance of bot detection
    chrome_options.add_argument(f"--user-agent={random.choice(USER_AGENTS)}")
    driver = webdriver.Chrome(options=chrome_options)
    return driver
# Function to scrape Flipkart laptop data
def scrape_flipkart(url):
    try:
        # Set up the Selenium driver (the user agent is applied there)
        driver = get_driver()
        # Load the page
        driver.get(url)
        time.sleep(5)  # Wait for JavaScript to render the listings
        # Get page source and parse with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()  # Close the driver
        # Lists to store scraped data
        products = []
        prices = []
        ratings = []
        # Find laptop items; Flipkart's class names change often, so adjust
        # these selectors to match the site's current HTML structure
        items = soup.find_all("div", class_="_1AtVbE")  # Parent div for each product
        for item in items:
            # Product name
            name_tag = item.find("div", class_="_4rR01T")
            name = name_tag.text.strip() if name_tag else "N/A"
            # Price
            price_tag = item.find("div", class_="_30jeq3")
            price = price_tag.text.strip() if price_tag else "N/A"
            # Rating
            rating_tag = item.find("div", class_="_3LWZlK")
            rating = rating_tag.text.strip() if rating_tag else "N/A"
            if name != "N/A":  # Only append rows that have a product name
                products.append(name)
                prices.append(price)
                ratings.append(rating)
        # Create DataFrame
        df = pd.DataFrame({
            "Product Name": products,
            "Price": prices,
            "Rating": ratings
        })
        # Save to CSV
        csv_path = "flipkart_laptops.csv"
        df.to_csv(csv_path, index=False, encoding="utf-8")
        return f"Scraped {len(products)} laptops successfully!", csv_path
    except Exception as e:
        return f"Error: {str(e)}", None
# Gradio interface
with gr.Blocks(title="Flipkart Laptop Scraper") as demo:
    gr.Markdown("# Flipkart Laptop Scraper")
    gr.Markdown("Enter a Flipkart laptop category URL to scrape data and download as CSV.")
    url_input = gr.Textbox(label="Flipkart URL", placeholder="e.g., https://www.flipkart.com/laptops/pr?sid=6bo,b5g")
    scrape_btn = gr.Button("Scrape Data")
    output_text = gr.Textbox(label="Status")
    output_file = gr.File(label="Download CSV")
    scrape_btn.click(
        fn=scrape_flipkart,
        inputs=url_input,
        outputs=[output_text, output_file]
    )

demo.launch()
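
# To run locally (assuming this file is saved as app.py):
#   pip install gradio pandas beautifulsoup4 selenium chromedriver-autoinstaller
#   python app.py
# Gradio serves the interface at http://127.0.0.1:7860 by default.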