Spaces:
Sleeping
Sleeping
Commit
·
0f2ff7e
1
Parent(s):
53a960c
links grouping added
Browse files
main.py
CHANGED
@@ -162,6 +162,7 @@ class ScrapeAndCrawlRequest(BaseModel):
|
|
162 |
topic_title: str # The topic (and college name) for crawling and structuring
|
163 |
model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
|
164 |
sites: list[SiteSearch] # List of websites and the number of results for each site
|
|
|
165 |
|
166 |
@app.post("/scrape-and-crawl")
|
167 |
async def scrape_and_crawl(
|
@@ -179,6 +180,7 @@ async def scrape_and_crawl(
|
|
179 |
|
180 |
# Initialize lists to hold all crawled links and structured data
|
181 |
all_links = []
|
|
|
182 |
structured_data_list = []
|
183 |
|
184 |
# Perform advanced search on the provided sites with custom result counts
|
@@ -187,18 +189,35 @@ async def scrape_and_crawl(
|
|
187 |
site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
|
188 |
all_links.extend(site_links)
|
189 |
|
190 |
-
# Scrape visible text from each fetched link and generate structured data
|
191 |
for link in all_links:
|
192 |
logger.info(f"Scraping visible text from link: {link}")
|
193 |
try:
|
194 |
visible_text = await scrape_visible_text(link) # Scrape the text
|
195 |
-
|
196 |
-
structured_data_list.append({"link": link, "structured_data": structured_data})
|
197 |
except Exception as scrape_error:
|
198 |
logger.error(f"Error scraping link {link}: {scrape_error}")
|
199 |
continue # If scraping fails, continue with the next link
|
200 |
|
201 |
-
# Return the structured data for all successfully processed links
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
|
203 |
return {
|
204 |
"structured_data": structured_data_list
|
@@ -208,6 +227,7 @@ async def scrape_and_crawl(
|
|
208 |
logger.error(f"Error occurred while processing combined request: {e}")
|
209 |
raise HTTPException(status_code=500, detail=str(e))
|
210 |
|
|
|
211 |
if __name__ == "__main__":
|
212 |
logger.info("Starting PreCollege Data Scraper Server...")
|
213 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
162 |
topic_title: str # The topic (and college name) for crawling and structuring
|
163 |
model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
|
164 |
sites: list[SiteSearch] # List of websites and the number of results for each site
|
165 |
+
group_size: Optional[int] = 3 # Number of links to group together for each GenAI call
|
166 |
|
167 |
@app.post("/scrape-and-crawl")
|
168 |
async def scrape_and_crawl(
|
|
|
180 |
|
181 |
# Initialize lists to hold all crawled links and structured data
|
182 |
all_links = []
|
183 |
+
all_scraped_texts = []
|
184 |
structured_data_list = []
|
185 |
|
186 |
# Perform advanced search on the provided sites with custom result counts
|
|
|
189 |
site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
|
190 |
all_links.extend(site_links)
|
191 |
|
192 |
+
# Scrape visible text from each fetched link and gather all the texts
|
193 |
for link in all_links:
|
194 |
logger.info(f"Scraping visible text from link: {link}")
|
195 |
try:
|
196 |
visible_text = await scrape_visible_text(link) # Scrape the text
|
197 |
+
all_scraped_texts.append(visible_text)
|
|
|
198 |
except Exception as scrape_error:
|
199 |
logger.error(f"Error scraping link {link}: {scrape_error}")
|
200 |
continue # If scraping fails, continue with the next link
|
201 |
|
202 |
+
# Process the scraped text in groups to minimize GenAI API calls
|
203 |
+
group_size = request.group_size or 3 # Use default group size if not provided
|
204 |
+
for i in range(0, len(all_scraped_texts), group_size):
|
205 |
+
text_group = all_scraped_texts[i:i + group_size] # Get the text for the current group
|
206 |
+
combined_text = "\n".join(text_group) # Combine all the texts in this group
|
207 |
+
|
208 |
+
logger.info(f"Structuring data for group {i // group_size + 1} with {len(text_group)} links.")
|
209 |
+
prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content. --- {combined_text} ---"
|
210 |
+
|
211 |
+
# Generate structured content using Google Generative AI
|
212 |
+
try:
|
213 |
+
model = genai.GenerativeModel(request.model_name)
|
214 |
+
response = model.generate_content(prompt)
|
215 |
+
structured_data_list.append(response.text.strip())
|
216 |
+
except Exception as e:
|
217 |
+
logger.error(f"Error generating structured data for group {i // group_size + 1}: {e}")
|
218 |
+
continue
|
219 |
+
|
220 |
+
# Return the structured data for all successfully processed groups
|
221 |
logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
|
222 |
return {
|
223 |
"structured_data": structured_data_list
|
|
|
227 |
logger.error(f"Error occurred while processing combined request: {e}")
|
228 |
raise HTTPException(status_code=500, detail=str(e))
|
229 |
|
230 |
+
|
231 |
if __name__ == "__main__":
|
232 |
logger.info("Starting PreCollege Data Scraper Server...")
|
233 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|