adityaiiitr commited on
Commit
8d78a0b
·
verified ·
1 Parent(s): 9c2f54a

updated v2 api combined

Browse files
Files changed (1) hide show
  1. main.py +85 -29
main.py CHANGED
@@ -154,15 +154,72 @@ async def crawl_web(request: CrawlerRequest):
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
- # Updated Pydantic models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  class ScrapeAndCrawlRequest(BaseModel):
159
- url: str
160
  college_name: str
161
  topic_title: str
162
  model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
163
- num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
164
 
165
- # Combined API endpoint
166
  @app.post("/scrape-and-crawl")
167
  async def scrape_and_crawl(
168
  request: ScrapeAndCrawlRequest,
@@ -172,39 +229,38 @@ async def scrape_and_crawl(
172
  if not x_api_key:
173
  raise HTTPException(status_code=400, detail="API key is missing from the header")
174
 
175
- logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
176
 
177
  # Configure Google Generative AI API key from header
178
  genai.configure(api_key=x_api_key)
179
-
180
- # Scrape visible text from the provided URL asynchronously
181
- visible_text = await scrape_visible_text(request.url)
182
-
183
- # Structure the scraped data using the specified model from the request
184
- structured_data = structure_data(visible_text, request.college_name)
185
 
186
- # Perform web crawling to get related links with customizable result count
187
- google_links = google_search(request.topic_title, num_results=request.num_results)
188
- quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
189
- reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
190
 
191
- # Combine all links into one list
192
- all_links = google_links + quora_links + reddit_links
193
-
194
- # Use the specified model to filter and get the most relevant URLs
195
- prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
196
- model = genai.GenerativeModel(request.model_name)
197
- response = model.generate_content(prompt)
198
- filtered_links = response.text.strip().split('\n')
 
 
 
 
 
 
 
 
199
 
200
- # Return the combined structured data and filtered links
201
- logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
202
  return {
203
- "structured_data": structured_data,
204
- "all_links": all_links,
205
- "filtered_links": filtered_links
206
  }
207
-
208
  except Exception as e:
209
  logger.error(f"Error occurred while processing combined request: {e}")
210
  raise HTTPException(status_code=500, detail=str(e))
 
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
+
158
+ # # Updated Pydantic models
159
+ # class ScrapeAndCrawlRequest(BaseModel):
160
+ # url: str
161
+ # college_name: str
162
+ # topic_title: str
163
+ # model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
164
+ # num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
165
+
166
+ # # Combined API endpoint
167
+ # @app.post("/scrape-and-crawl")
168
+ # async def scrape_and_crawl(
169
+ # request: ScrapeAndCrawlRequest,
170
+ # x_api_key: Optional[str] = Header(None) # API key to be passed in the request header
171
+ # ):
172
+ # try:
173
+ # if not x_api_key:
174
+ # raise HTTPException(status_code=400, detail="API key is missing from the header")
175
+
176
+ # logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
177
+
178
+ # # Configure Google Generative AI API key from header
179
+ # genai.configure(api_key=x_api_key)
180
+
181
+ # # Scrape visible text from the provided URL asynchronously
182
+ # visible_text = await scrape_visible_text(request.url)
183
+
184
+ # # Structure the scraped data using the specified model from the request
185
+ # structured_data = structure_data(visible_text, request.college_name)
186
+
187
+ # # Perform web crawling to get related links with customizable result count
188
+ # google_links = google_search(request.topic_title, num_results=request.num_results)
189
+ # quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
190
+ # reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
191
+
192
+ # # Combine all links into one list
193
+ # all_links = google_links + quora_links + reddit_links
194
+
195
+ # # Use the specified model to filter and get the most relevant URLs
196
+ # prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
197
+ # model = genai.GenerativeModel(request.model_name)
198
+ # response = model.generate_content(prompt)
199
+ # filtered_links = response.text.strip().split('\n')
200
+
201
+ # # Return the combined structured data and filtered links
202
+ # logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
203
+ # return {
204
+ # "structured_data": structured_data,
205
+ # "all_links": all_links,
206
+ # "filtered_links": filtered_links
207
+ # }
208
+
209
+ # except Exception as e:
210
+ # logger.error(f"Error occurred while processing combined request: {e}")
211
+ # raise HTTPException(status_code=500, detail=str(e))
212
+
213
class SiteSearch(BaseModel):
    """A single target site for the advanced search, with its own result budget."""

    # Website to run the advanced (site-restricted) search against
    site_url: str
    # Number of results to fetch from this site; defaults to 5 when omitted
    num_results: Optional[int] = 5
216
+
217
class ScrapeAndCrawlRequest(BaseModel):
    """Request payload for the combined /scrape-and-crawl endpoint."""

    college_name: str
    topic_title: str
    # Generative model used downstream; defaults to 'gemini-1.5-pro'
    model_name: str = "gemini-1.5-pro"
    # Sites to search, each carrying its own per-site result count
    sites: list[SiteSearch]
222
 
 
223
@app.post("/scrape-and-crawl")
async def scrape_and_crawl(
    request: ScrapeAndCrawlRequest,
    x_api_key: Optional[str] = Header(None)  # API key to be passed in the request header
):
    """Search each requested site for the topic, scrape every hit, and return structured data.

    For each entry in ``request.sites`` an advanced site-restricted search is run,
    then every collected link is scraped and structured (best-effort: a failing
    link is logged and skipped, not fatal).

    Raises:
        HTTPException: 400 when the ``x-api-key`` header is missing,
            500 on any other unexpected failure.
    """
    try:
        if not x_api_key:
            raise HTTPException(status_code=400, detail="API key is missing from the header")

        logger.info(f"Received combined scrape and crawl request for College: {request.college_name}, Topic: {request.topic_title}")

        # Configure Google Generative AI API key from header
        genai.configure(api_key=x_api_key)

        # Accumulators for all crawled links and the structured data built from them
        all_links = []
        structured_data_list = []

        # Perform advanced search on the provided sites with custom result counts
        for site in request.sites:
            logger.info(f"Performing advanced search on {site.site_url} for {site.num_results} results")
            site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
            all_links.extend(site_links)

        # Scrape visible text from each fetched link and structure the data.
        # Deliberately best-effort: one bad link must not abort the whole request.
        for link in all_links:
            logger.info(f"Scraping visible text from link: {link}")
            try:
                visible_text = await scrape_visible_text(link)  # Scrape the text
                structured_data = structure_data(visible_text, request.college_name)  # Structure it
                structured_data_list.append({"link": link, "structured_data": structured_data})
            except Exception as scrape_error:
                logger.error(f"Error scraping link {link}: {scrape_error}")
                continue  # If scraping fails, continue with the next link

        # Return the structured data for all successfully scraped links
        logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
        return {
            "structured_data": structured_data_list
        }

    except HTTPException:
        # Bug fix: re-raise deliberate HTTP errors (e.g. the 400 above) untouched;
        # previously the generic handler below rewrapped them as a 500.
        raise
    except Exception as e:
        logger.error(f"Error occurred while processing combined request: {e}")
        raise HTTPException(status_code=500, detail=str(e))