adityaiiitr committed on
Commit
0f2ff7e
·
1 Parent(s): 53a960c

links grouping added

Browse files
Files changed (1) hide show
  1. main.py +24 -4
main.py CHANGED
@@ -162,6 +162,7 @@ class ScrapeAndCrawlRequest(BaseModel):
162
  topic_title: str # The topic (and college name) for crawling and structuring
163
  model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
164
  sites: list[SiteSearch] # List of websites and the number of results for each site
 
165
 
166
  @app.post("/scrape-and-crawl")
167
  async def scrape_and_crawl(
@@ -179,6 +180,7 @@ async def scrape_and_crawl(
179
 
180
  # Initialize lists to hold all crawled links and structured data
181
  all_links = []
 
182
  structured_data_list = []
183
 
184
  # Perform advanced search on the provided sites with custom result counts
@@ -187,18 +189,35 @@ async def scrape_and_crawl(
187
  site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
188
  all_links.extend(site_links)
189
 
190
- # Scrape visible text from each fetched link and structure the data
191
  for link in all_links:
192
  logger.info(f"Scraping visible text from link: {link}")
193
  try:
194
  visible_text = await scrape_visible_text(link) # Scrape the text
195
- structured_data = structure_data(visible_text, request.topic_title) # Structure it
196
- structured_data_list.append({"link": link, "structured_data": structured_data})
197
  except Exception as scrape_error:
198
  logger.error(f"Error scraping link {link}: {scrape_error}")
199
  continue # If scraping fails, continue with the next link
200
 
201
- # Return the structured data for all successfully scraped links
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
203
  return {
204
  "structured_data": structured_data_list
@@ -208,6 +227,7 @@ async def scrape_and_crawl(
208
  logger.error(f"Error occurred while processing combined request: {e}")
209
  raise HTTPException(status_code=500, detail=str(e))
210
 
 
211
  if __name__ == "__main__":
212
  logger.info("Starting PreCollege Data Scraper Server...")
213
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
162
  topic_title: str # The topic (and college name) for crawling and structuring
163
  model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
164
  sites: list[SiteSearch] # List of websites and the number of results for each site
165
+ group_size: Optional[int] = 3 # Number of links to group together for each GenAI call
166
 
167
  @app.post("/scrape-and-crawl")
168
  async def scrape_and_crawl(
 
180
 
181
  # Initialize lists to hold all crawled links and structured data
182
  all_links = []
183
+ all_scraped_texts = []
184
  structured_data_list = []
185
 
186
  # Perform advanced search on the provided sites with custom result counts
 
189
  site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
190
  all_links.extend(site_links)
191
 
192
+ # Scrape visible text from each fetched link and gather all the texts
193
  for link in all_links:
194
  logger.info(f"Scraping visible text from link: {link}")
195
  try:
196
  visible_text = await scrape_visible_text(link) # Scrape the text
197
+ all_scraped_texts.append(visible_text)
 
198
  except Exception as scrape_error:
199
  logger.error(f"Error scraping link {link}: {scrape_error}")
200
  continue # If scraping fails, continue with the next link
201
 
202
+ # Process the scraped text in groups to minimize GenAI API calls
203
+ group_size = request.group_size or 3 # Use default group size if not provided
204
+ for i in range(0, len(all_scraped_texts), group_size):
205
+ text_group = all_scraped_texts[i:i + group_size] # Get the text for the current group
206
+ combined_text = "\n".join(text_group) # Combine all the texts in this group
207
+
208
+ logger.info(f"Structuring data for group {i // group_size + 1} with {len(text_group)} links.")
209
+ prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content. --- {combined_text} ---"
210
+
211
+ # Generate structured content using Google Generative AI
212
+ try:
213
+ model = genai.GenerativeModel(request.model_name)
214
+ response = model.generate_content(prompt)
215
+ structured_data_list.append(response.text.strip())
216
+ except Exception as e:
217
+ logger.error(f"Error generating structured data for group {i // group_size + 1}: {e}")
218
+ continue
219
+
220
+ # Return the structured data for all successfully processed groups
221
  logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
222
  return {
223
  "structured_data": structured_data_list
 
227
  logger.error(f"Error occurred while processing combined request: {e}")
228
  raise HTTPException(status_code=500, detail=str(e))
229
 
230
+
231
  if __name__ == "__main__":
232
  logger.info("Starting PreCollege Data Scraper Server...")
233
  uvicorn.run(app, host="0.0.0.0", port=7860)