import os
import asyncio
import json
import logging
from typing import Optional

import google.generativeai as genai
import requests
import uvicorn
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from fastapi import FastAPI, Header, HTTPException
from playwright.async_api import async_playwright
from pydantic import BaseModel

# Load environment variables from a local .env file (if present).
load_dotenv()

# Configure Google Generative AI with the server-side API key.
# NOTE: raises KeyError at import time if API_KEY is unset -- intentional fail-fast.
genai.configure(api_key=os.environ["API_KEY"])

# Default Gemini model used by all endpoints below.
GEMINI_MODEL = "gemini-1.5-pro"

# Set up logging to stdout/stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger("ScrapeStructureApp")

# FastAPI app initialization
app = FastAPI()


async def scrape_visible_text(url):
    """Scrape the visible (rendered) text of a web page with headless Chromium.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        str: The page body's ``innerText`` after DOMContentLoaded.

    Raises:
        Exception: re-raised after logging if launch/navigation/extraction fails.
    """
    try:
        logger.info(f"Starting to scrape visible text from URL: {url}")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
            try:
                # Realistic desktop-Chrome fingerprint to reduce bot blocking.
                context = await browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
                    viewport={"width": 1280, "height": 800},
                    extra_http_headers={
                        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                        "accept-encoding": "gzip, deflate, br, zstd",
                        "accept-language": "en-US,en;q=0.9,hi;q=0.8",
                        "cache-control": "max-age=0",
                        "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
                        "sec-ch-ua-mobile": "?0",
                        "sec-ch-ua-platform": '"Windows"',
                        "sec-fetch-dest": "document",
                        "sec-fetch-mode": "navigate",
                        "sec-fetch-site": "none",
                        "sec-fetch-user": "?1",
                        "upgrade-insecure-requests": "1",
                    },
                )
                page = await context.new_page()
                await page.goto(url, wait_until="domcontentloaded")
                visible_text = await page.evaluate("document.body.innerText")
            finally:
                # Always release the browser, even when navigation/evaluate raises.
                await browser.close()
        logger.info(f"Successfully scraped visible text from URL: {url}")
        return visible_text
    except Exception as e:
        logger.error(f"Error while scraping visible text from URL {url}: {e}")
        raise


def structure_data(text, college_name):
    """Turn unstructured scraped text into a structured write-up via Gemini.

    Args:
        text: Raw visible text scraped from a page.
        college_name: College the summary should be written about.

    Returns:
        str: The model's response text, stripped of surrounding whitespace.

    Raises:
        Exception: re-raised after logging on any model/API failure.
    """
    try:
        logger.info(f"Starting to structure data for college: {college_name}")
        prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no formatting! --- \n{text} ---. Use only the text between the '---' markers as input source text. If information is not available about any specific thing dont mention it."
        model = genai.GenerativeModel(GEMINI_MODEL)
        response = model.generate_content(prompt)
        logger.info(f"Successfully structured data for college: {college_name}")
        return response.text.strip()
    except Exception as e:
        logger.error(f"Error while structuring data for college {college_name}: {e}")
        raise


class URLRequest(BaseModel):
    # Request body for /scrape: page to scrape and the college it describes.
    url: str
    college_name: str


class CrawlerRequest(BaseModel):
    # Request body for /crawl: topic to search the web for.
    topic_title: str


def google_search(query, num_results=5):
    """Perform a Google HTML search and return up to ``num_results`` result links.

    Args:
        query: Search query string (appended to the search URL unescaped).
        num_results: Maximum number of links to return.

    Returns:
        list[str]: External https:// links scraped from the results page.
            May be empty if Google serves a consent/captcha page.

    Raises:
        Exception: re-raised after logging on HTTP or parse failure.
    """
    try:
        logger.info(f"Performing Google search for query: {query}")
        search_url = f"https://www.google.com/search?q={query}&num={num_results}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
        }
        # timeout prevents the worker from hanging forever on a stalled response.
        response = requests.get(search_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        links = []
        # Anchors with a jsname attribute correspond to organic results;
        # skip internal google.com navigation links.
        for a in soup.find_all('a', href=True, attrs={'jsname': True}):
            link = a['href']
            if link.startswith("https://") and "google.com" not in link:
                links.append(link)
        logger.info(f"Successfully retrieved {len(links)} links for query: {query}")
        return links[:num_results]
    except Exception as e:
        logger.error(f"Error while performing Google search for query {query}: {e}")
        raise


def advanced_search_on_site(site, topic, num_results=10):
    """Google search restricted to a single site via the ``site:`` operator."""
    query = f"site:{site} {topic}"
    return google_search(query, num_results)


@app.post("/scrape")
async def scrape_and_structure_data(request: URLRequest):
    """Scrape one URL and return a Gemini-structured summary of its text."""
    try:
        logger.info(f"Received scrape request for URL: {request.url}, College Name: {request.college_name}")
        # Scrape visible text from the webpage
        visible_text = await scrape_visible_text(request.url)
        # Structure the data using Google's Gemini model
        structured_data = structure_data(visible_text, request.college_name)
        logger.info(f"Successfully processed scrape request for URL: {request.url}")
        # Return the structured data
        return {"structured_data": structured_data}
    except Exception as e:
        logger.error(f"Error occurred while processing scrape request for URL {request.url}: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/crawl")
async def crawl_web(request: CrawlerRequest):
    """Search Google/Quora/Reddit for a topic and let Gemini filter the links."""
    # Bind before the try block so the except handler can always log it.
    topic_title = request.topic_title
    try:
        logger.info(f"Received crawl request for topic: {topic_title}")
        # Get up to 10 links from a general Google search
        google_links = google_search(topic_title, num_results=10)
        # Get links from Quora
        quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)
        # Additional sites can be added similarly
        other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)
        # Combine all links
        all_links = google_links + quora_links + other_links
        # Use Gemini to filter and list relevant URLs
        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
        model = genai.GenerativeModel(GEMINI_MODEL)
        response = model.generate_content(prompt)
        # One link per line; drop blank lines the model may emit.
        filtered_links = [
            line.strip() for line in response.text.strip().split('\n') if line.strip()
        ]
        logger.info(f"Successfully processed crawl request for topic: {topic_title}")
        # Return both the raw and the model-filtered link lists
        return {"links": all_links, "filtered_links": filtered_links}
    except Exception as e:
        logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
        raise HTTPException(status_code=500, detail=str(e))


class SiteSearch(BaseModel):
    site_url: str  # Website to perform advanced search on
    num_results: Optional[int] = 5  # Optional number of results to fetch, default is 5


class ScrapeAndCrawlRequest(BaseModel):
    college_name: str
    topic_title: str
    model_name: str = GEMINI_MODEL  # Default Gemini model name
    sites: list[SiteSearch]  # List of websites and the number of results for each site


@app.post("/scrape-and-crawl")
async def scrape_and_crawl(
    request: ScrapeAndCrawlRequest,
    x_api_key: Optional[str] = Header(None),  # API key to be passed in the request header
):
    """Search the given sites for a topic, scrape every hit, and structure each page.

    The caller supplies their own Gemini API key via the ``X-API-Key`` header.
    Links that fail to scrape are skipped, not fatal.
    """
    # Validate outside the try block so a missing key returns 400, not a
    # re-wrapped 500 from the generic handler below.
    if not x_api_key:
        raise HTTPException(status_code=400, detail="API key is missing from the header")
    try:
        logger.info(f"Received combined scrape and crawl request for College: {request.college_name}, Topic: {request.topic_title}")
        # Configure Google Generative AI API key from header
        # NOTE(review): genai.configure mutates global client state; concurrent
        # requests with different keys may race -- confirm acceptable.
        genai.configure(api_key=x_api_key)
        # Initialize lists to hold all crawled links and structured data
        all_links = []
        structured_data_list = []
        # Perform advanced search on the provided sites with custom result counts
        for site in request.sites:
            logger.info(f"Performing advanced search on {site.site_url} for {site.num_results} results")
            site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
            all_links.extend(site_links)
        # Scrape visible text from each fetched link and structure the data
        for link in all_links:
            logger.info(f"Scraping visible text from link: {link}")
            try:
                visible_text = await scrape_visible_text(link)  # Scrape the text
                structured_data = structure_data(visible_text, request.college_name)  # Structure it
                structured_data_list.append({"link": link, "structured_data": structured_data})
            except Exception as scrape_error:
                logger.error(f"Error scraping link {link}: {scrape_error}")
                continue  # If scraping fails, continue with the next link
        # Return the structured data for all successfully scraped links
        logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
        return {"structured_data": structured_data_list}
    except Exception as e:
        logger.error(f"Error occurred while processing combined request: {e}")
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    logger.info("Starting PreCollege Data Scraper Server...")
    uvicorn.run(app, host="0.0.0.0", port=7860)