Spaces:
Sleeping
Sleeping
Added combined API
Browse files
main.py
CHANGED
@@ -2,8 +2,9 @@ import os
|
|
2 |
import google.generativeai as genai
|
3 |
from playwright.async_api import async_playwright
|
4 |
from dotenv import load_dotenv
|
5 |
-
from fastapi import FastAPI, HTTPException
|
6 |
from pydantic import BaseModel
|
|
|
7 |
import uvicorn
|
8 |
import asyncio
|
9 |
import json
|
@@ -153,6 +154,60 @@ async def crawl_web(request: CrawlerRequest):
|
|
153 |
except Exception as e:
|
154 |
logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
|
155 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
if __name__ == "__main__":
|
158 |
logger.info("Starting PreCollege Data Scraper Server...")
|
|
|
2 |
import google.generativeai as genai
|
3 |
from playwright.async_api import async_playwright
|
4 |
from dotenv import load_dotenv
|
5 |
+
from fastapi import FastAPI, HTTPException, Header
|
6 |
from pydantic import BaseModel
|
7 |
+
from typing import Optional
|
8 |
import uvicorn
|
9 |
import asyncio
|
10 |
import json
|
|
|
154 |
except Exception as e:
|
155 |
logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
|
156 |
raise HTTPException(status_code=500, detail=str(e))
|
157 |
+
# Updated Pydantic models
|
158 |
+
class ScrapeAndCrawlRequest(BaseModel):
|
159 |
+
url: str
|
160 |
+
college_name: str
|
161 |
+
topic_title: str
|
162 |
+
model_name: str = "gemini-1.5-pro" # Default to 'gemini-1.5-pro'
|
163 |
+
num_results: int = 5 # Default number of results to fetch from Google, Quora, Reddit
|
164 |
+
|
165 |
+
# Combined API endpoint
|
166 |
+
@app.post("/scrape-and-crawl")
|
167 |
+
async def scrape_and_crawl(
|
168 |
+
request: ScrapeAndCrawlRequest,
|
169 |
+
x_api_key: Optional[str] = Header(None) # API key to be passed in the request header
|
170 |
+
):
|
171 |
+
try:
|
172 |
+
if not x_api_key:
|
173 |
+
raise HTTPException(status_code=400, detail="API key is missing from the header")
|
174 |
+
|
175 |
+
logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")
|
176 |
+
|
177 |
+
# Configure Google Generative AI API key from header
|
178 |
+
genai.configure(api_key=x_api_key)
|
179 |
+
|
180 |
+
# Scrape visible text from the provided URL asynchronously
|
181 |
+
visible_text = await scrape_visible_text(request.url)
|
182 |
+
|
183 |
+
# Structure the scraped data using the specified model from the request
|
184 |
+
structured_data = structure_data(visible_text, request.college_name)
|
185 |
+
|
186 |
+
# Perform web crawling to get related links with customizable result count
|
187 |
+
google_links = google_search(request.topic_title, num_results=request.num_results)
|
188 |
+
quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
|
189 |
+
reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)
|
190 |
+
|
191 |
+
# Combine all links into one list
|
192 |
+
all_links = google_links + quora_links + reddit_links
|
193 |
+
|
194 |
+
# Use the specified model to filter and get the most relevant URLs
|
195 |
+
prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
|
196 |
+
model = genai.GenerativeModel(request.model_name)
|
197 |
+
response = model.generate_content(prompt)
|
198 |
+
filtered_links = response.text.strip().split('\n')
|
199 |
+
|
200 |
+
# Return the combined structured data and filtered links
|
201 |
+
logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
|
202 |
+
return {
|
203 |
+
"structured_data": structured_data,
|
204 |
+
"all_links": all_links,
|
205 |
+
"filtered_links": filtered_links
|
206 |
+
}
|
207 |
+
|
208 |
+
except Exception as e:
|
209 |
+
logger.error(f"Error occurred while processing combined request: {e}")
|
210 |
+
raise HTTPException(status_code=500, detail=str(e))
|
211 |
|
212 |
if __name__ == "__main__":
|
213 |
logger.info("Starting PreCollege Data Scraper Server...")
|