Spaces:

ZweliM
/

sales_assistant

No application file

App Files Files Community

ZweliM committed 25 days ago

Commit

7a511b0

verified ·

1 Parent(s): e3eda0f

Create web_scraper.py

Browse files

Files changed (1) hide show

web_scraper.py +214 -0

web_scraper.py ADDED Viewed

	@@ -0,0 +1,214 @@

+from playwright.sync_api import sync_playwright
+import urllib.parse
+def scrape_hificorp(page, product_name: str) -> dict | None:
+    """
+    Scrape HiFiCorp for the given product_name.
+    Returns a dict with keys: title, normal_price, promotion_price, source, product_link
+    or None if no product found.
+    """
+    search_url = (
+        "https://www.hificorp.co.za/catalogsearch/result/?q="
+        + urllib.parse.quote_plus(product_name)
+    )
+    page.goto(search_url, timeout=120_000)
+    page.wait_for_selector(".product-item-link", timeout=60_000)
+    product_url = page.locator(
+        ".product-item-link").first.get_attribute("href")
+    if not product_url:
+        return None
+    page.goto(product_url, timeout=120_000)
+    page.wait_for_selector("h1.page-title", timeout=60_000)
+    title = page.locator("h1.page-title").inner_text().strip()
+    # Promotion (final) price
+    try:
+        promotion_price = (
+            page.locator('[data-price-type="finalPrice"] .price')
+            .first.inner_text()
+            .strip()
+        )
+    except Exception:
+        promotion_price = None
+    # Old (normal) price, if present
+    try:
+        old_nodes = page.locator('[data-price-type="oldPrice"] .price')
+        normal_price = (
+            old_nodes.first.inner_text().strip() if old_nodes.count() else None
+        )
+    except Exception:
+        normal_price = None
+    # Fallback if no old price
+    normal_price = normal_price or promotion_price
+    return {
+        "title": title,
+        "normal_price": normal_price,
+        "promotion_price": promotion_price,
+        "source": "HiFiCorp",
+        "product_link": product_url,
+    }
+def scrape_incredible(page, product_name: str) -> dict | None:
+    """
+    Scrape Incredible Connection for the given product_name.
+    Returns a dict with keys: title, normal_price, promotion_price, source, product_link
+    or None if no product found.
+    """
+    search_url = (
+        "https://www.incredible.co.za/catalogsearch/result/?q="
+        + urllib.parse.quote_plus(product_name)
+    )
+    page.goto(search_url, timeout=120_000)
+    page.wait_for_selector(".product-item-link", timeout=60_000)
+    product_url = page.locator(
+        ".product-item-link").first.get_attribute("href")
+    if not product_url:
+        return None
+    page.goto(product_url, timeout=120_000)
+    page.wait_for_selector("h1.page-title", timeout=60_000)
+    title = page.locator("h1.page-title").inner_text().strip()
+    try:
+        promotion_price = (
+            page.locator('[data-price-type="finalPrice"] .price')
+            .first.inner_text()
+            .strip()
+        )
+    except Exception:
+        promotion_price = None
+    try:
+        old_nodes = page.locator('[data-price-type="oldPrice"] .price')
+        normal_price = (
+            old_nodes.first.inner_text().strip() if old_nodes.count() else None
+        )
+    except Exception:
+        normal_price = None
+    normal_price = normal_price or promotion_price
+    return {
+        "title": title,
+        "normal_price": normal_price,
+        "promotion_price": promotion_price,
+        "source": "Incredible Connection",
+        "product_link": product_url,
+    }
+def search_product(product_name: str) -> list[dict]:
+    """
+    Uses Playwright to scrape HiFiCorp and Incredible Connection for product_name.
+    Returns a list of dictionaries, each dict with keys:
+      title, normal_price, promotion_price, source, product_link.
+    If Playwright cannot run or no products found, returns an empty list.
+    """
+    results = []
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch(
+                headless=True,
+                args=["--no-sandbox", "--disable-setuid-sandbox",
+                      "--disable-dev-shm-usage"],
+            )
+            page = browser.new_page()
+            # Scrape HiFiCorp
+            try:
+                hifi_data = scrape_hificorp(page, product_name)
+                if hifi_data:
+                    results.append(hifi_data)
+            except Exception as e:
+                return (r"HiFiCorp scraping error:", type(e).__name__, e)
+            browser.close()
+    except NotImplementedError:
+        # Playwright cannot launch a browser in this environment
+        return ("Playwright NotImplementedError: scraping skipped.")
+    except Exception as e:
+        # Any other Playwright/browser launch error
+        print("Playwright launch error:", type(e).__name__, e)
+        return []
+    return results
+def get_scraped_product_data(product_name: str):
+    """
+    Wrapper function to search for product data.
+    Returns a list of dictionaries with product details.
+    """
+    if not product_name:
+        return []
+    results = search_product(product_name)
+    # def save_df_to_csv(df: pd.DataFrame, filename="shop_out_results.csv"):
+    results.to_csv("scraped.csv", index=False)
+    if not results:
+        return []
+    return results
+def search_your_product(query: str):
+    """Search for a product using the provided query string."""
+    json_out = search_product(query)
+    if not json_out:
+        return "No results found."
+    else:
+        product = []
+        for item in json_out:
+            product.append({
+                "title": item["title"],
+                "normal_price": item["normal_price"],
+                "promotion_price": item["promotion_price"],
+                "source": item["source"],
+                "product_link": item["product_link"]
+            })
+    return product
+# For debugging or manual runs:
+if __name__ == "__main__":
+    query = input("Enter product name: ")
+    json_out = search_product(query)
+    if not json_out:
+        print("No results found.")
+    else:
+        product = []
+        for item in json_out:
+            product.append({
+                "title": item["title"],
+                "normal_price": item["normal_price"],
+                "promotion_price": item["promotion_price"],
+                "source": item["source"],
+                "product_link": item["product_link"]
+            })
+    for items in product:
+        print(f"Title: {items['title']}")
+        print(f"Normal Price: {items['normal_price']}")
+        print(f"Promotion Price: {items['promotion_price']}")
+        print(f"Source: {items['source']}")
+        print(f"Product Link: {items['product_link']}")
+        print("-" * 40)
+    print(f"Found {len(product)} results for '{query}'.")
+    print("Search complete!")