adityaiiitr committed on
Commit
f9a7a53
·
1 Parent(s): 8d78a0b

testing combined api

Browse files
Files changed (1) hide show
  1. main.py +4 -61
main.py CHANGED
@@ -154,69 +154,12 @@ async def crawl_web(request: CrawlerRequest):
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
-
158
- # # Updated Pydantic models
159
- # class ScrapeAndCrawlRequest(BaseModel):
160
- # url: str
161
- # college_name: str
162
- # topic_title: str
163
- # model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
164
- # num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
165
-
166
- # # Combined API endpoint
167
- # @app.post("/scrape-and-crawl")
168
- # async def scrape_and_crawl(
169
- # request: ScrapeAndCrawlRequest,
170
- # x_api_key: Optional[str] = Header(None) # API key to be passed in the request header
171
- # ):
172
- # try:
173
- # if not x_api_key:
174
- # raise HTTPException(status_code=400, detail="API key is missing from the header")
175
-
176
- # logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
177
-
178
- # # Configure Google Generative AI API key from header
179
- # genai.configure(api_key=x_api_key)
180
-
181
- # # Scrape visible text from the provided URL asynchronously
182
- # visible_text = await scrape_visible_text(request.url)
183
-
184
- # # Structure the scraped data using the specified model from the request
185
- # structured_data = structure_data(visible_text, request.college_name)
186
-
187
- # # Perform web crawling to get related links with customizable result count
188
- # google_links = google_search(request.topic_title, num_results=request.num_results)
189
- # quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
190
- # reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
191
-
192
- # # Combine all links into one list
193
- # all_links = google_links + quora_links + reddit_links
194
-
195
- # # Use the specified model to filter and get the most relevant URLs
196
- # prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
197
- # model = genai.GenerativeModel(request.model_name)
198
- # response = model.generate_content(prompt)
199
- # filtered_links = response.text.strip().split('\n')
200
-
201
- # # Return the combined structured data and filtered links
202
- # logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
203
- # return {
204
- # "structured_data": structured_data,
205
- # "all_links": all_links,
206
- # "filtered_links": filtered_links
207
- # }
208
-
209
- # except Exception as e:
210
- # logger.error(f"Error occurred while processing combined request: {e}")
211
- # raise HTTPException(status_code=500, detail=str(e))
212
-
213
  class SiteSearch(BaseModel):
214
  site_url: str # Website to perform advanced search on
215
  num_results: Optional[int] = 5 # Optional number of results to fetch, default is 5
216
 
217
  class ScrapeAndCrawlRequest(BaseModel):
218
- college_name: str
219
- topic_title: str
220
  model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
221
  sites: list[SiteSearch] # List of websites and the number of results for each site
222
 
@@ -229,7 +172,7 @@ async def scrape_and_crawl(
229
  if not x_api_key:
230
  raise HTTPException(status_code=400, detail="API key is missing from the header")
231
 
232
- logger.info(f"Received combined scrape and crawl request for College: {request.college_name}, Topic: {request.topic_title}")
233
 
234
  # Configure Google Generative AI API key from header
235
  genai.configure(api_key=x_api_key)
@@ -249,7 +192,7 @@ async def scrape_and_crawl(
249
  logger.info(f"Scraping visible text from link: {link}")
250
  try:
251
  visible_text = await scrape_visible_text(link) # Scrape the text
252
- structured_data = structure_data(visible_text, request.college_name) # Structure it
253
  structured_data_list.append({"link": link, "structured_data": structured_data})
254
  except Exception as scrape_error:
255
  logger.error(f"Error scraping link {link}: {scrape_error}")
@@ -267,4 +210,4 @@ async def scrape_and_crawl(
267
 
268
  if __name__ == "__main__":
269
  logger.info("Starting PreCollege Data Scraper Server...")
270
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  class SiteSearch(BaseModel):
158
  site_url: str # Website to perform advanced search on
159
  num_results: Optional[int] = 5 # Optional number of results to fetch, default is 5
160
 
161
  class ScrapeAndCrawlRequest(BaseModel):
162
+ topic_title: str # The topic (and college name) for crawling and structuring
 
163
  model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
164
  sites: list[SiteSearch] # List of websites and the number of results for each site
165
 
 
172
  if not x_api_key:
173
  raise HTTPException(status_code=400, detail="API key is missing from the header")
174
 
175
+ logger.info(f"Received combined scrape and crawl request for Topic: {request.topic_title}")
176
 
177
  # Configure Google Generative AI API key from header
178
  genai.configure(api_key=x_api_key)
 
192
  logger.info(f"Scraping visible text from link: {link}")
193
  try:
194
  visible_text = await scrape_visible_text(link) # Scrape the text
195
+ structured_data = structure_data(visible_text, request.topic_title) # Structure it
196
  structured_data_list.append({"link": link, "structured_data": structured_data})
197
  except Exception as scrape_error:
198
  logger.error(f"Error scraping link {link}: {scrape_error}")
 
210
 
211
  if __name__ == "__main__":
212
  logger.info("Starting PreCollege Data Scraper Server...")
213
+ uvicorn.run(app, host="0.0.0.0", port=7860, reload="true")