Spaces:
Sleeping
Sleeping
Commit
·
f9a7a53
1
Parent(s):
8d78a0b
testing combined api
Browse files
main.py
CHANGED
@@ -154,69 +154,12 @@ async def crawl_web(request: CrawlerRequest):
|
|
154 |
except Exception as e:
|
155 |
logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
|
156 |
raise HTTPException(status_code=500, detail=str(e))
|
157 |
-
|
158 |
-
# # Updated Pydantic models
|
159 |
-
# class ScrapeAndCrawlRequest(BaseModel):
|
160 |
-
# url: str
|
161 |
-
# college_name: str
|
162 |
-
# topic_title: str
|
163 |
-
# model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
|
164 |
-
# num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
|
165 |
-
|
166 |
-
# # Combined API endpoint
|
167 |
-
# @app.post("/scrape-and-crawl")
|
168 |
-
# async def scrape_and_crawl(
|
169 |
-
# request: ScrapeAndCrawlRequest,
|
170 |
-
# x_api_key: Optional[str] = Header(None) # API key to be passed in the request header
|
171 |
-
# ):
|
172 |
-
# try:
|
173 |
-
# if not x_api_key:
|
174 |
-
# raise HTTPException(status_code=400, detail="API key is missing from the header")
|
175 |
-
|
176 |
-
# logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
|
177 |
-
|
178 |
-
# # Configure Google Generative AI API key from header
|
179 |
-
# genai.configure(api_key=x_api_key)
|
180 |
-
|
181 |
-
# # Scrape visible text from the provided URL asynchronously
|
182 |
-
# visible_text = await scrape_visible_text(request.url)
|
183 |
-
|
184 |
-
# # Structure the scraped data using the specified model from the request
|
185 |
-
# structured_data = structure_data(visible_text, request.college_name)
|
186 |
-
|
187 |
-
# # Perform web crawling to get related links with customizable result count
|
188 |
-
# google_links = google_search(request.topic_title, num_results=request.num_results)
|
189 |
-
# quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
|
190 |
-
# reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
|
191 |
-
|
192 |
-
# # Combine all links into one list
|
193 |
-
# all_links = google_links + quora_links + reddit_links
|
194 |
-
|
195 |
-
# # Use the specified model to filter and get the most relevant URLs
|
196 |
-
# prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
|
197 |
-
# model = genai.GenerativeModel(request.model_name)
|
198 |
-
# response = model.generate_content(prompt)
|
199 |
-
# filtered_links = response.text.strip().split('\n')
|
200 |
-
|
201 |
-
# # Return the combined structured data and filtered links
|
202 |
-
# logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
|
203 |
-
# return {
|
204 |
-
# "structured_data": structured_data,
|
205 |
-
# "all_links": all_links,
|
206 |
-
# "filtered_links": filtered_links
|
207 |
-
# }
|
208 |
-
|
209 |
-
# except Exception as e:
|
210 |
-
# logger.error(f"Error occurred while processing combined request: {e}")
|
211 |
-
# raise HTTPException(status_code=500, detail=str(e))
|
212 |
-
|
213 |
class SiteSearch(BaseModel):
|
214 |
site_url: str # Website to perform advanced search on
|
215 |
num_results: Optional[int] = 5 # Optional number of results to fetch, default is 5
|
216 |
|
217 |
class ScrapeAndCrawlRequest(BaseModel):
|
218 |
-
|
219 |
-
topic_title: str
|
220 |
model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
|
221 |
sites: list[SiteSearch] # List of websites and the number of results for each site
|
222 |
|
@@ -229,7 +172,7 @@ async def scrape_and_crawl(
|
|
229 |
if not x_api_key:
|
230 |
raise HTTPException(status_code=400, detail="API key is missing from the header")
|
231 |
|
232 |
-
logger.info(f"Received combined scrape and crawl request for
|
233 |
|
234 |
# Configure Google Generative AI API key from header
|
235 |
genai.configure(api_key=x_api_key)
|
@@ -249,7 +192,7 @@ async def scrape_and_crawl(
|
|
249 |
logger.info(f"Scraping visible text from link: {link}")
|
250 |
try:
|
251 |
visible_text = await scrape_visible_text(link) # Scrape the text
|
252 |
-
structured_data = structure_data(visible_text, request.
|
253 |
structured_data_list.append({"link": link, "structured_data": structured_data})
|
254 |
except Exception as scrape_error:
|
255 |
logger.error(f"Error scraping link {link}: {scrape_error}")
|
@@ -267,4 +210,4 @@ async def scrape_and_crawl(
|
|
267 |
|
268 |
if __name__ == "__main__":
|
269 |
logger.info("Starting PreCollege Data Scraper Server...")
|
270 |
-
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
154 |
except Exception as e:
|
155 |
logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
|
156 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
class SiteSearch(BaseModel):
    """One website to run an advanced search on, plus how many results to fetch."""

    # Website to perform the advanced search on.
    site_url: str
    # Number of results to fetch; optional, defaults to 5.
    num_results: Optional[int] = 5
|
160 |
|
161 |
class ScrapeAndCrawlRequest(BaseModel):
    """Request model for the combined scrape-and-crawl flow."""

    # The topic (and college name) used for crawling and for structuring
    # the scraped text.
    topic_title: str
    # Generative model to use; defaults to 'gemini-1.5-pro'.
    # NOTE(review): on pydantic v2 a field starting with "model_" may trigger a
    # protected-namespace warning -- confirm against the installed version.
    model_name: str = "gemini-1.5-pro"
    # List of websites and the number of results to fetch for each site.
    sites: list[SiteSearch]
|
165 |
|
|
|
172 |
if not x_api_key:
|
173 |
raise HTTPException(status_code=400, detail="API key is missing from the header")
|
174 |
|
175 |
+
logger.info(f"Received combined scrape and crawl request for Topic: {request.topic_title}")
|
176 |
|
177 |
# Configure Google Generative AI API key from header
|
178 |
genai.configure(api_key=x_api_key)
|
|
|
192 |
logger.info(f"Scraping visible text from link: {link}")
|
193 |
try:
|
194 |
visible_text = await scrape_visible_text(link) # Scrape the text
|
195 |
+
structured_data = structure_data(visible_text, request.topic_title) # Structure it
|
196 |
structured_data_list.append({"link": link, "structured_data": structured_data})
|
197 |
except Exception as scrape_error:
|
198 |
logger.error(f"Error scraping link {link}: {scrape_error}")
|
|
|
210 |
|
211 |
if __name__ == "__main__":
    # Script entry point: start the API server on all interfaces, port 7860.
    logger.info("Starting PreCollege Data Scraper Server...")
    # `reload` is a boolean option; the string "true" only worked by being
    # truthy. Auto-reload also requires the application to be given as an
    # import string ("module:attribute") so the reloader can re-import it in
    # a fresh process -- when handed the app object with reload enabled,
    # uvicorn logs a warning and exits instead of serving. This file is
    # main.py, hence "main:app".
    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
|