adityaiiitr commited on
Commit
8d78a0b
·
verified ·
1 Parent(s): 9c2f54a

updated v2 api combined

Browse files
Files changed (1) hide show
  1. main.py +85 -29
main.py CHANGED
@@ -154,15 +154,72 @@ async def crawl_web(request: CrawlerRequest):
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
- # Updated Pydantic models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  class ScrapeAndCrawlRequest(BaseModel):
159
- url: str
160
  college_name: str
161
  topic_title: str
162
  model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
163
- num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
164
 
165
- # Combined API endpoint
166
  @app.post("/scrape-and-crawl")
167
  async def scrape_and_crawl(
168
  request: ScrapeAndCrawlRequest,
@@ -172,39 +229,38 @@ async def scrape_and_crawl(
172
  if not x_api_key:
173
  raise HTTPException(status_code=400, detail="API key is missing from the header")
174
 
175
- logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
176
 
177
  # Configure Google Generative AI API key from header
178
  genai.configure(api_key=x_api_key)
179
-
180
- # Scrape visible text from the provided URL asynchronously
181
- visible_text = await scrape_visible_text(request.url)
182
-
183
- # Structure the scraped data using the specified model from the request
184
- structured_data = structure_data(visible_text, request.college_name)
185
 
186
- # Perform web crawling to get related links with customizable result count
187
- google_links = google_search(request.topic_title, num_results=request.num_results)
188
- quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
189
- reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
190
 
191
- # Combine all links into one list
192
- all_links = google_links + quora_links + reddit_links
193
-
194
- # Use the specified model to filter and get the most relevant URLs
195
- prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
196
- model = genai.GenerativeModel(request.model_name)
197
- response = model.generate_content(prompt)
198
- filtered_links = response.text.strip().split('\n')
 
 
 
 
 
 
 
 
199
 
200
- # Return the combined structured data and filtered links
201
- logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
202
  return {
203
- "structured_data": structured_data,
204
- "all_links": all_links,
205
- "filtered_links": filtered_links
206
  }
207
-
208
  except Exception as e:
209
  logger.error(f"Error occurred while processing combined request: {e}")
210
  raise HTTPException(status_code=500, detail=str(e))
 
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
+
158
+ # # Updated Pydantic models
159
+ # class ScrapeAndCrawlRequest(BaseModel):
160
+ # url: str
161
+ # college_name: str
162
+ # topic_title: str
163
+ # model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
164
+ # num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
165
+
166
+ # # Combined API endpoint
167
+ # @app.post("/scrape-and-crawl")
168
+ # async def scrape_and_crawl(
169
+ # request: ScrapeAndCrawlRequest,
170
+ # x_api_key: Optional[str] = Header(None) # API key to be passed in the request header
171
+ # ):
172
+ # try:
173
+ # if not x_api_key:
174
+ # raise HTTPException(status_code=400, detail="API key is missing from the header")
175
+
176
+ # logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
177
+
178
+ # # Configure Google Generative AI API key from header
179
+ # genai.configure(api_key=x_api_key)
180
+
181
+ # # Scrape visible text from the provided URL asynchronously
182
+ # visible_text = await scrape_visible_text(request.url)
183
+
184
+ # # Structure the scraped data using the specified model from the request
185
+ # structured_data = structure_data(visible_text, request.college_name)
186
+
187
+ # # Perform web crawling to get related links with customizable result count
188
+ # google_links = google_search(request.topic_title, num_results=request.num_results)
189
+ # quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
190
+ # reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
191
+
192
+ # # Combine all links into one list
193
+ # all_links = google_links + quora_links + reddit_links
194
+
195
+ # # Use the specified model to filter and get the most relevant URLs
196
+ # prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
197
+ # model = genai.GenerativeModel(request.model_name)
198
+ # response = model.generate_content(prompt)
199
+ # filtered_links = response.text.strip().split('\n')
200
+
201
+ # # Return the combined structured data and filtered links
202
+ # logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
203
+ # return {
204
+ # "structured_data": structured_data,
205
+ # "all_links": all_links,
206
+ # "filtered_links": filtered_links
207
+ # }
208
+
209
+ # except Exception as e:
210
+ # logger.error(f"Error occurred while processing combined request: {e}")
211
+ # raise HTTPException(status_code=500, detail=str(e))
212
+
213
class SiteSearch(BaseModel):
    """A single target site for the advanced search, with its own result budget."""

    # Website to run the advanced (site-restricted) search against
    site_url: str
    # Number of results to fetch from this site; defaults to 5 when omitted
    num_results: Optional[int] = 5
216
+
217
class ScrapeAndCrawlRequest(BaseModel):
    """Request payload for the combined /scrape-and-crawl endpoint."""

    college_name: str
    topic_title: str
    # Generative model used downstream; defaults to 'gemini-1.5-pro'
    model_name: str = "gemini-1.5-pro"
    # Sites to search, each carrying its own per-site result count
    sites: list[SiteSearch]
222
 
 
223
@app.post("/scrape-and-crawl")
async def scrape_and_crawl(
    request: ScrapeAndCrawlRequest,
    x_api_key: Optional[str] = Header(None)  # API key to be passed in the request header
):
    """Search each requested site for the topic, scrape every hit, and return structured data.

    For each entry in ``request.sites`` an advanced site-restricted search is run,
    then every collected link is scraped and structured (best-effort: a failing
    link is logged and skipped, not fatal).

    Raises:
        HTTPException: 400 when the ``x-api-key`` header is missing,
            500 on any other unexpected failure.
    """
    try:
        if not x_api_key:
            raise HTTPException(status_code=400, detail="API key is missing from the header")

        logger.info(f"Received combined scrape and crawl request for College: {request.college_name}, Topic: {request.topic_title}")

        # Configure Google Generative AI API key from header
        genai.configure(api_key=x_api_key)

        # Accumulators for all crawled links and the structured data built from them
        all_links = []
        structured_data_list = []

        # Perform advanced search on the provided sites with custom result counts
        for site in request.sites:
            logger.info(f"Performing advanced search on {site.site_url} for {site.num_results} results")
            site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
            all_links.extend(site_links)

        # Scrape visible text from each fetched link and structure the data.
        # Deliberately best-effort: one bad link must not abort the whole request.
        for link in all_links:
            logger.info(f"Scraping visible text from link: {link}")
            try:
                visible_text = await scrape_visible_text(link)  # Scrape the text
                structured_data = structure_data(visible_text, request.college_name)  # Structure it
                structured_data_list.append({"link": link, "structured_data": structured_data})
            except Exception as scrape_error:
                logger.error(f"Error scraping link {link}: {scrape_error}")
                continue  # If scraping fails, continue with the next link

        # Return the structured data for all successfully scraped links
        logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
        return {
            "structured_data": structured_data_list
        }

    except HTTPException:
        # Bug fix: re-raise deliberate HTTP errors (e.g. the 400 above) untouched;
        # previously the generic handler below rewrapped them as a 500.
        raise
    except Exception as e:
        logger.error(f"Error occurred while processing combined request: {e}")
        raise HTTPException(status_code=500, detail=str(e))