adityaiiitr committed on
Commit
f9a7a53
·
1 Parent(s): 8d78a0b

testing combined api

Browse files
Files changed (1) hide show
  1. main.py +4 -61
main.py CHANGED
@@ -154,69 +154,12 @@ async def crawl_web(request: CrawlerRequest):
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
-
158
- # # Updated Pydantic models
159
- # class ScrapeAndCrawlRequest(BaseModel):
160
- # url: str
161
- # college_name: str
162
- # topic_title: str
163
- # model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
164
- # num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
165
-
166
- # # Combined API endpoint
167
- # @app.post("/scrape-and-crawl")
168
- # async def scrape_and_crawl(
169
- # request: ScrapeAndCrawlRequest,
170
- # x_api_key: Optional[str] = Header(None) # API key to be passed in the request header
171
- # ):
172
- # try:
173
- # if not x_api_key:
174
- # raise HTTPException(status_code=400, detail="API key is missing from the header")
175
-
176
- # logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
177
-
178
- # # Configure Google Generative AI API key from header
179
- # genai.configure(api_key=x_api_key)
180
-
181
- # # Scrape visible text from the provided URL asynchronously
182
- # visible_text = await scrape_visible_text(request.url)
183
-
184
- # # Structure the scraped data using the specified model from the request
185
- # structured_data = structure_data(visible_text, request.college_name)
186
-
187
- # # Perform web crawling to get related links with customizable result count
188
- # google_links = google_search(request.topic_title, num_results=request.num_results)
189
- # quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
190
- # reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
191
-
192
- # # Combine all links into one list
193
- # all_links = google_links + quora_links + reddit_links
194
-
195
- # # Use the specified model to filter and get the most relevant URLs
196
- # prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
197
- # model = genai.GenerativeModel(request.model_name)
198
- # response = model.generate_content(prompt)
199
- # filtered_links = response.text.strip().split('\n')
200
-
201
- # # Return the combined structured data and filtered links
202
- # logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
203
- # return {
204
- # "structured_data": structured_data,
205
- # "all_links": all_links,
206
- # "filtered_links": filtered_links
207
- # }
208
-
209
- # except Exception as e:
210
- # logger.error(f"Error occurred while processing combined request: {e}")
211
- # raise HTTPException(status_code=500, detail=str(e))
212
-
213
  class SiteSearch(BaseModel):
214
  site_url: str # Website to perform advanced search on
215
  num_results: Optional[int] = 5 # Optional number of results to fetch, default is 5
216
 
217
  class ScrapeAndCrawlRequest(BaseModel):
218
- college_name: str
219
- topic_title: str
220
  model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
221
  sites: list[SiteSearch] # List of websites and the number of results for each site
222
 
@@ -229,7 +172,7 @@ async def scrape_and_crawl(
229
  if not x_api_key:
230
  raise HTTPException(status_code=400, detail="API key is missing from the header")
231
 
232
- logger.info(f"Received combined scrape and crawl request for College: {request.college_name}, Topic: {request.topic_title}")
233
 
234
  # Configure Google Generative AI API key from header
235
  genai.configure(api_key=x_api_key)
@@ -249,7 +192,7 @@ async def scrape_and_crawl(
249
  logger.info(f"Scraping visible text from link: {link}")
250
  try:
251
  visible_text = await scrape_visible_text(link) # Scrape the text
252
- structured_data = structure_data(visible_text, request.college_name) # Structure it
253
  structured_data_list.append({"link": link, "structured_data": structured_data})
254
  except Exception as scrape_error:
255
  logger.error(f"Error scraping link {link}: {scrape_error}")
@@ -267,4 +210,4 @@ async def scrape_and_crawl(
267
 
268
  if __name__ == "__main__":
269
  logger.info("Starting PreCollege Data Scraper Server...")
270
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  class SiteSearch(BaseModel):
158
  site_url: str # Website to perform advanced search on
159
  num_results: Optional[int] = 5 # Optional number of results to fetch, default is 5
160
 
161
  class ScrapeAndCrawlRequest(BaseModel):
162
+ topic_title: str # The topic (and college name) for crawling and structuring
 
163
  model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
164
  sites: list[SiteSearch] # List of websites and the number of results for each site
165
 
 
172
  if not x_api_key:
173
  raise HTTPException(status_code=400, detail="API key is missing from the header")
174
 
175
+ logger.info(f"Received combined scrape and crawl request for Topic: {request.topic_title}")
176
 
177
  # Configure Google Generative AI API key from header
178
  genai.configure(api_key=x_api_key)
 
192
  logger.info(f"Scraping visible text from link: {link}")
193
  try:
194
  visible_text = await scrape_visible_text(link) # Scrape the text
195
+ structured_data = structure_data(visible_text, request.topic_title) # Structure it
196
  structured_data_list.append({"link": link, "structured_data": structured_data})
197
  except Exception as scrape_error:
198
  logger.error(f"Error scraping link {link}: {scrape_error}")
 
210
 
211
  if __name__ == "__main__":
212
  logger.info("Starting PreCollege Data Scraper Server...")
213
+ uvicorn.run(app, host="0.0.0.0", port=7860, reload="true")