adityaiiitr commited on
Commit
9c2f54a
·
verified ·
1 Parent(s): cf196e2

Added combined API

Browse files
Files changed (1) hide show
  1. main.py +56 -1
main.py CHANGED
@@ -2,8 +2,9 @@ import os
2
  import google.generativeai as genai
3
  from playwright.async_api import async_playwright
4
  from dotenv import load_dotenv
5
- from fastapi import FastAPI, HTTPException
6
  from pydantic import BaseModel
 
7
  import uvicorn
8
  import asyncio
9
  import json
@@ -153,6 +154,60 @@ async def crawl_web(request: CrawlerRequest):
153
  except Exception as e:
154
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
155
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  if __name__ == "__main__":
158
  logger.info("Starting PreCollege Data Scraper Server...")
 
2
  import google.generativeai as genai
3
  from playwright.async_api import async_playwright
4
  from dotenv import load_dotenv
5
+ from fastapi import FastAPI, HTTPException, Header
6
  from pydantic import BaseModel
7
+ from typing import Optional
8
  import uvicorn
9
  import asyncio
10
  import json
 
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
# Updated Pydantic models
class ScrapeAndCrawlRequest(BaseModel):
    """Request payload for the combined /scrape-and-crawl endpoint."""

    url: str  # page whose visible text will be scraped
    college_name: str  # college the scraped data is structured for
    topic_title: str  # topic searched on Google, Quora, and Reddit
    model_name: str = "gemini-1.5-pro"  # Default to 'gemini-1.5-pro'
    num_results: int = 5  # Default number of results to fetch from Google, Quora, Reddit
164
+
165
# Combined API endpoint
@app.post("/scrape-and-crawl")
async def scrape_and_crawl(
    request: ScrapeAndCrawlRequest,
    x_api_key: Optional[str] = Header(None)  # API key to be passed in the request header
):
    """Scrape a URL, structure its text, then crawl and filter related links.

    Combines the scraping and crawling flows in one call:
    1. Scrapes visible text from ``request.url`` and structures it for
       ``request.college_name``.
    2. Collects links for ``request.topic_title`` from Google, Quora, and
       Reddit (``request.num_results`` each).
    3. Asks the Gemini model named by ``request.model_name`` to keep only
       the links relevant to the topic.

    Returns a dict with ``structured_data``, ``all_links`` and
    ``filtered_links``. Raises 400 when the ``x-api-key`` header is
    missing and 500 on any processing failure.
    """
    try:
        if not x_api_key:
            raise HTTPException(status_code=400, detail="API key is missing from the header")

        logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")

        # Configure Google Generative AI API key from header
        genai.configure(api_key=x_api_key)

        # Scrape visible text from the provided URL asynchronously
        visible_text = await scrape_visible_text(request.url)

        # Structure the scraped data for the requested college
        structured_data = structure_data(visible_text, request.college_name)

        # Perform web crawling to get related links with customizable result count
        google_links = google_search(request.topic_title, num_results=request.num_results)
        quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
        reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)

        # Combine all links into one list
        all_links = google_links + quora_links + reddit_links

        # Use the specified model to filter and get the most relevant URLs
        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
        model = genai.GenerativeModel(request.model_name)
        response = model.generate_content(prompt)
        # Split on newlines and drop blank/whitespace-only lines the model may emit
        filtered_links = [line.strip() for line in response.text.strip().split('\n') if line.strip()]

        # Return the combined structured data and filtered links
        logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
        return {
            "structured_data": structured_data,
            "all_links": all_links,
            "filtered_links": filtered_links
        }

    except HTTPException:
        # Re-raise deliberate HTTP errors (e.g. the 400 above) unchanged;
        # without this they were swallowed by the handler below and
        # re-wrapped as 500s, losing the intended status code.
        raise
    except Exception as e:
        logger.error(f"Error occurred while processing combined request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
211
 
212
  if __name__ == "__main__":
213
  logger.info("Starting PreCollege Data Scraper Server...")