import gradio as gr
from bs4 import BeautifulSoup
import pandas as pd
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import random
import time

# List of user agents to avoid bot detection
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Function to initialize a headless Selenium Chrome driver
def get_driver():
    chromedriver_autoinstaller.install()  # Install a chromedriver matching the local Chrome
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run without a visible browser window
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Apply a random user agent so the traffic looks less like a bot
    chrome_options.add_argument(f"--user-agent={random.choice(USER_AGENTS)}")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

# Function to scrape Flipkart laptop data
def scrape_flipkart(url):
    try:
        # Set up Selenium driver
        driver = get_driver()
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        
        # Load the page
        driver.get(url)
        time.sleep(5)  # Wait for JavaScript to load content
        
        # Get page source and parse with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()  # Close the driver
        
        # Lists to store scraped data
        products = []
        prices = []
        ratings = []
        
        # Find laptop items. Flipkart's class names are obfuscated and change
        # over time; adjust these selectors to match the current HTML structure.
        items = soup.find_all("div", class_="_1AtVbE")  # Parent div for each product card
        for item in items:
            # Product name
            name_tag = item.find("div", class_="_4rR01T")
            name = name_tag.text.strip() if name_tag else "N/A"
            
            # Price
            price_tag = item.find("div", class_="_30jeq3")
            price = price_tag.text.strip() if price_tag else "N/A"
            
            # Rating
            rating_tag = item.find("div", class_="_3LWZlK")
            rating = rating_tag.text.strip() if rating_tag else "N/A"
            
            if name != "N/A":  # Only append valid entries
                products.append(name)
                prices.append(price)
                ratings.append(rating)
        
        # Create DataFrame
        df = pd.DataFrame({
            "Product Name": products,
            "Price": prices,
            "Rating": ratings
        })
        
        # Save to CSV
        csv_path = "flipkart_laptops.csv"
        df.to_csv(csv_path, index=False, encoding="utf-8")
        
        return f"Scraped {len(products)} laptops successfully!", csv_path
    
    except Exception as e:
        return f"Error: {str(e)}", None

# Gradio interface
with gr.Blocks(title="Flipkart Laptop Scraper") as demo:
    gr.Markdown("# Flipkart Laptop Scraper")
    gr.Markdown("Enter a Flipkart laptop category URL to scrape data and download as CSV.")
    
    url_input = gr.Textbox(label="Flipkart URL", placeholder="e.g., https://www.flipkart.com/laptops/pr?sid=6bo,b5g")
    scrape_btn = gr.Button("Scrape Data")
    output_text = gr.Textbox(label="Status")
    output_file = gr.File(label="Download CSV")
    
    scrape_btn.click(
        fn=scrape_flipkart,
        inputs=url_input,
        outputs=[output_text, output_file]
    )

demo.launch()
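
# How to run this app, as a sketch assuming the file is saved as app.py:
#   pip install gradio beautifulsoup4 pandas selenium chromedriver-autoinstaller
#   python app.py
# Gradio prints a local URL; open it, paste a Flipkart laptop category URL,
# click "Scrape Data", and download the resulting flipkart_laptops.csv.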