Spaces:
Sleeping
Sleeping
updated v2 api combined
Browse files
main.py
CHANGED
@@ -154,15 +154,72 @@ async def crawl_web(request: CrawlerRequest):
|
|
154 |
except Exception as e:
|
155 |
logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
|
156 |
raise HTTPException(status_code=500, detail=str(e))
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
class ScrapeAndCrawlRequest(BaseModel):
|
159 |
-
url: str
|
160 |
college_name: str
|
161 |
topic_title: str
|
162 |
model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
|
163 |
-
|
164 |
|
165 |
-
# Combined API endpoint
|
166 |
@app.post("/scrape-and-crawl")
|
167 |
async def scrape_and_crawl(
|
168 |
request: ScrapeAndCrawlRequest,
|
@@ -172,39 +229,38 @@ async def scrape_and_crawl(
|
|
172 |
if not x_api_key:
|
173 |
raise HTTPException(status_code=400, detail="API key is missing from the header")
|
174 |
|
175 |
-
logger.info(f"Received combined scrape and crawl request for
|
176 |
|
177 |
# Configure Google Generative AI API key from header
|
178 |
genai.configure(api_key=x_api_key)
|
179 |
-
|
180 |
-
# Scrape visible text from the provided URL asynchronously
|
181 |
-
visible_text = await scrape_visible_text(request.url)
|
182 |
-
|
183 |
-
# Structure the scraped data using the specified model from the request
|
184 |
-
structured_data = structure_data(visible_text, request.college_name)
|
185 |
|
186 |
-
#
|
187 |
-
|
188 |
-
|
189 |
-
reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
|
190 |
|
191 |
-
#
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
-
# Return the
|
201 |
-
logger.info(f"Successfully processed combined request for
|
202 |
return {
|
203 |
-
"structured_data":
|
204 |
-
"all_links": all_links,
|
205 |
-
"filtered_links": filtered_links
|
206 |
}
|
207 |
-
|
208 |
except Exception as e:
|
209 |
logger.error(f"Error occurred while processing combined request: {e}")
|
210 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
154 |
except Exception as e:
|
155 |
logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
|
156 |
raise HTTPException(status_code=500, detail=str(e))
|
157 |
+
|
158 |
+
# # Updated Pydantic models
|
159 |
+
# class ScrapeAndCrawlRequest(BaseModel):
|
160 |
+
# url: str
|
161 |
+
# college_name: str
|
162 |
+
# topic_title: str
|
163 |
+
# model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
|
164 |
+
# num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
|
165 |
+
|
166 |
+
# # Combined API endpoint
|
167 |
+
# @app.post("/scrape-and-crawl")
|
168 |
+
# async def scrape_and_crawl(
|
169 |
+
# request: ScrapeAndCrawlRequest,
|
170 |
+
# x_api_key: Optional[str] = Header(None) # API key to be passed in the request header
|
171 |
+
# ):
|
172 |
+
# try:
|
173 |
+
# if not x_api_key:
|
174 |
+
# raise HTTPException(status_code=400, detail="API key is missing from the header")
|
175 |
+
|
176 |
+
# logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
|
177 |
+
|
178 |
+
# # Configure Google Generative AI API key from header
|
179 |
+
# genai.configure(api_key=x_api_key)
|
180 |
+
|
181 |
+
# # Scrape visible text from the provided URL asynchronously
|
182 |
+
# visible_text = await scrape_visible_text(request.url)
|
183 |
+
|
184 |
+
# # Structure the scraped data using the specified model from the request
|
185 |
+
# structured_data = structure_data(visible_text, request.college_name)
|
186 |
+
|
187 |
+
# # Perform web crawling to get related links with customizable result count
|
188 |
+
# google_links = google_search(request.topic_title, num_results=request.num_results)
|
189 |
+
# quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
|
190 |
+
# reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
|
191 |
+
|
192 |
+
# # Combine all links into one list
|
193 |
+
# all_links = google_links + quora_links + reddit_links
|
194 |
+
|
195 |
+
# # Use the specified model to filter and get the most relevant URLs
|
196 |
+
# prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
|
197 |
+
# model = genai.GenerativeModel(request.model_name)
|
198 |
+
# response = model.generate_content(prompt)
|
199 |
+
# filtered_links = response.text.strip().split('\n')
|
200 |
+
|
201 |
+
# # Return the combined structured data and filtered links
|
202 |
+
# logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
|
203 |
+
# return {
|
204 |
+
# "structured_data": structured_data,
|
205 |
+
# "all_links": all_links,
|
206 |
+
# "filtered_links": filtered_links
|
207 |
+
# }
|
208 |
+
|
209 |
+
# except Exception as e:
|
210 |
+
# logger.error(f"Error occurred while processing combined request: {e}")
|
211 |
+
# raise HTTPException(status_code=500, detail=str(e))
|
212 |
+
|
213 |
+
class SiteSearch(BaseModel):
    """One website to run an advanced search on, plus a result count for it.

    Instances are supplied by the client inside
    ``ScrapeAndCrawlRequest.sites``.
    """

    # Website to perform the advanced search on.
    site_url: str

    # Number of results to fetch for this site; 5 when the field is omitted.
    num_results: Optional[int] = 5
|
216 |
+
|
217 |
class ScrapeAndCrawlRequest(BaseModel):
    """Request body accepted by the ``/scrape-and-crawl`` endpoint."""

    # College name handed to ``structure_data`` for every scraped page.
    college_name: str

    # Topic used as the search query on each requested site.
    topic_title: str

    # Generative model name; falls back to 'gemini-1.5-pro' when omitted.
    model_name: str = "gemini-1.5-pro"

    # Websites to search, each carrying its own result count.
    sites: list[SiteSearch]
|
222 |
|
|
|
223 |
@app.post("/scrape-and-crawl")
async def scrape_and_crawl(
    request: ScrapeAndCrawlRequest,
    x_api_key: Optional[str] = Header(None)  # API key to be passed in the request header
):
    """Search the requested sites for a topic, then scrape and structure each hit.

    For every entry in ``request.sites`` an advanced search is run for
    ``request.topic_title``. Every returned link is then scraped and the
    visible text is passed to ``structure_data`` together with
    ``request.college_name``.

    Args:
        request: Topic, college name and the list of sites to search.
        x_api_key: Google Generative AI API key taken from the request header.

    Returns:
        ``{"structured_data": [...]}`` where each item is
        ``{"link": <url>, "structured_data": <result of structure_data>}``.
        Links whose scraping or structuring failed are logged and left out.

    Raises:
        HTTPException: 400 when the API key header is missing; 500 when any
            other unexpected error occurs outside the per-link scraping loop.
    """
    # Validate the header before entering the try block: otherwise the
    # blanket ``except Exception`` below would catch this deliberate 400
    # and report it to the client as a 500.
    if not x_api_key:
        raise HTTPException(status_code=400, detail="API key is missing from the header")

    try:
        logger.info(f"Received combined scrape and crawl request for College: {request.college_name}, Topic: {request.topic_title}")

        # Configure Google Generative AI API key from header
        genai.configure(api_key=x_api_key)

        # Initialize lists to hold all crawled links and structured data
        all_links = []
        structured_data_list = []

        # Perform advanced search on the provided sites with custom result counts
        for site in request.sites:
            logger.info(f"Performing advanced search on {site.site_url} for {site.num_results} results")
            site_links = advanced_search_on_site(site.site_url, request.topic_title, num_results=site.num_results)
            all_links.extend(site_links)

        # Scrape visible text from each fetched link and structure the data
        for link in all_links:
            logger.info(f"Scraping visible text from link: {link}")
            try:
                visible_text = await scrape_visible_text(link)  # Scrape the text
                structured_data = structure_data(visible_text, request.college_name)  # Structure it
                structured_data_list.append({"link": link, "structured_data": structured_data})
            except Exception as scrape_error:
                # Best effort: one bad link must not fail the whole request.
                logger.error(f"Error scraping link {link}: {scrape_error}")
                continue  # If scraping fails, continue with the next link

        # Return the structured data for all successfully scraped links
        logger.info(f"Successfully processed combined request for Topic: {request.topic_title}")
        return {
            "structured_data": structured_data_list
        }

    except HTTPException:
        # Keep the status code of any HTTP error raised deliberately inside
        # the try block instead of rewrapping it as a 500.
        raise
    except Exception as e:
        logger.error(f"Error occurred while processing combined request: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|