Spaces:

adityaiiitr
/

precollege_scraper

Sleeping

App Files Files Community

adityaiiitr committed on Oct 22, 2024

Commit

0f2ff7e

1 Parent(s): 53a960c

links grouping added

Browse files

Files changed (1) hide show

main.py +24 -4

main.py CHANGED Viewed

@@ -162,6 +162,7 @@ class ScrapeAndCrawlRequest(BaseModel):
     topic_title: str  # The topic (and college name) for crawling and structuring
     model_name: str = "gemini-1.5-pro"  # Default to 'gemini-1.5-pro'
     sites: list[SiteSearch]  # List of websites and the number of results for each site
 @app.post("/scrape-and-crawl")
 async def scrape_and_crawl(
@@ -179,6 +180,7 @@ async def scrape_and_crawl(
         # Initialize lists to hold all crawled links and structured data
         all_links = []
         structured_data_list = []
         # Perform advanced search on the provided sites with custom result counts
@@ -187,18 +189,35 @@ async def scrape_and_crawl(
             site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
             all_links.extend(site_links)
-        # Scrape visible text from each fetched link and structure the data
         for link in all_links:
             logger.info(f"Scraping visible text from link: {link}")
             try:
                 visible_text = await scrape_visible_text(link)  # Scrape the text
-                structured_data = structure_data(visible_text, request.topic_title)  # Structure it
-                structured_data_list.append({"link": link, "structured_data": structured_data})
             except Exception as scrape_error:
                 logger.error(f"Error scraping link {link}: {scrape_error}")
                 continue  # If scraping fails, continue with the next link
-        # Return the structured data for all successfully scraped links
         logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
         return {
             "structured_data": structured_data_list
@@ -208,6 +227,7 @@ async def scrape_and_crawl(
         logger.error(f"Error occurred while processing combined request: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
     logger.info("Starting PreCollege Data Scraper Server...")
     uvicorn.run(app, host="0.0.0.0", port=7860)

     topic_title: str  # The topic (and college name) for crawling and structuring
     model_name: str = "gemini-1.5-pro"  # Default to 'gemini-1.5-pro'
     sites: list[SiteSearch]  # List of websites and the number of results for each site
+    group_size: Optional[int] = 3  # Number of links to group together for each GenAI call
 @app.post("/scrape-and-crawl")
 async def scrape_and_crawl(
         # Initialize lists to hold all crawled links and structured data
         all_links = []
+        all_scraped_texts = []
         structured_data_list = []
         # Perform advanced search on the provided sites with custom result counts
             site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
             all_links.extend(site_links)
+        # Scrape visible text from each fetched link and gather all the texts
         for link in all_links:
             logger.info(f"Scraping visible text from link: {link}")
             try:
                 visible_text = await scrape_visible_text(link)  # Scrape the text
+                all_scraped_texts.append(visible_text)
             except Exception as scrape_error:
                 logger.error(f"Error scraping link {link}: {scrape_error}")
                 continue  # If scraping fails, continue with the next link
+        # Process the scraped text in groups to minimize GenAI API calls
+        group_size = request.group_size or 3  # Use default group size if not provided
+        for i in range(0, len(all_scraped_texts), group_size):
+            text_group = all_scraped_texts[i:i + group_size]  # Get the text for the current group
+            combined_text = "\n".join(text_group)  # Combine all the texts in this group
+            logger.info(f"Structuring data for group {i // group_size + 1} with {len(text_group)} links.")
+            prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content. --- {combined_text} ---"
+            # Generate structured content using Google Generative AI
+            try:
+                model = genai.GenerativeModel(request.model_name)
+                response = model.generate_content(prompt)
+                structured_data_list.append(response.text.strip())
+            except Exception as e:
+                logger.error(f"Error generating structured data for group {i // group_size + 1}: {e}")
+                continue
+        # Return the structured data for all successfully processed groups
         logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
         return {
             "structured_data": structured_data_list
         logger.error(f"Error occurred while processing combined request: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
     logger.info("Starting PreCollege Data Scraper Server...")
     uvicorn.run(app, host="0.0.0.0", port=7860)