adityaiiitr commited on
Commit
9c2f54a
·
verified ·
1 Parent(s): cf196e2

Added combined API

Browse files
Files changed (1) hide show
  1. main.py +56 -1
main.py CHANGED
@@ -2,8 +2,9 @@ import os
2
  import google.generativeai as genai
3
  from playwright.async_api import async_playwright
4
  from dotenv import load_dotenv
5
- from fastapi import FastAPI, HTTPException
6
  from pydantic import BaseModel
 
7
  import uvicorn
8
  import asyncio
9
  import json
@@ -153,6 +154,60 @@ async def crawl_web(request: CrawlerRequest):
153
  except Exception as e:
154
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
155
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  if __name__ == "__main__":
158
  logger.info("Starting PreCollege Data Scraper Server...")
 
2
  import google.generativeai as genai
3
  from playwright.async_api import async_playwright
4
  from dotenv import load_dotenv
5
+ from fastapi import FastAPI, HTTPException, Header
6
  from pydantic import BaseModel
7
+ from typing import Optional
8
  import uvicorn
9
  import asyncio
10
  import json
 
154
  except Exception as e:
155
  logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
156
  raise HTTPException(status_code=500, detail=str(e))
157
# Updated Pydantic models
class ScrapeAndCrawlRequest(BaseModel):
    """Request payload for the combined /scrape-and-crawl endpoint."""

    url: str  # page whose visible text will be scraped
    college_name: str  # college the scraped data is structured for
    topic_title: str  # topic searched on Google, Quora, and Reddit
    model_name: str = "gemini-1.5-pro"  # Default to 'gemini-1.5-pro'
    num_results: int = 5  # Default number of results to fetch from Google, Quora, Reddit
164
+
165
# Combined API endpoint
@app.post("/scrape-and-crawl")
async def scrape_and_crawl(
    request: ScrapeAndCrawlRequest,
    x_api_key: Optional[str] = Header(None)  # API key to be passed in the request header
):
    """Scrape a URL, structure its text, then crawl and filter related links.

    Combines the scraping and crawling flows in one call:
    1. Scrapes visible text from ``request.url`` and structures it for
       ``request.college_name``.
    2. Collects links for ``request.topic_title`` from Google, Quora, and
       Reddit (``request.num_results`` each).
    3. Asks the Gemini model named by ``request.model_name`` to keep only
       the links relevant to the topic.

    Returns a dict with ``structured_data``, ``all_links`` and
    ``filtered_links``. Raises 400 when the ``x-api-key`` header is
    missing and 500 on any processing failure.
    """
    try:
        if not x_api_key:
            raise HTTPException(status_code=400, detail="API key is missing from the header")

        logger.info(f"Received combined scrape and crawl request for URL: {request.url}, College Name: {request.college_name}, Topic: {request.topic_title}")

        # Configure Google Generative AI API key from header
        genai.configure(api_key=x_api_key)

        # Scrape visible text from the provided URL asynchronously
        visible_text = await scrape_visible_text(request.url)

        # Structure the scraped data for the requested college
        structured_data = structure_data(visible_text, request.college_name)

        # Perform web crawling to get related links with customizable result count
        google_links = google_search(request.topic_title, num_results=request.num_results)
        quora_links = advanced_search_on_site("quora.com", request.topic_title, num_results=request.num_results)
        reddit_links = advanced_search_on_site("reddit.com", request.topic_title, num_results=request.num_results)

        # Combine all links into one list
        all_links = google_links + quora_links + reddit_links

        # Use the specified model to filter and get the most relevant URLs
        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{request.topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
        model = genai.GenerativeModel(request.model_name)
        response = model.generate_content(prompt)
        # Split on newlines and drop blank/whitespace-only lines the model may emit
        filtered_links = [line.strip() for line in response.text.strip().split('\n') if line.strip()]

        # Return the combined structured data and filtered links
        logger.info(f"Successfully processed combined request for URL: {request.url} and Topic: {request.topic_title}")
        return {
            "structured_data": structured_data,
            "all_links": all_links,
            "filtered_links": filtered_links
        }

    except HTTPException:
        # Re-raise deliberate HTTP errors (e.g. the 400 above) unchanged;
        # without this they were swallowed by the handler below and
        # re-wrapped as 500s, losing the intended status code.
        raise
    except Exception as e:
        logger.error(f"Error occurred while processing combined request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
211
 
212
  if __name__ == "__main__":
213
  logger.info("Starting PreCollege Data Scraper Server...")