Spaces:
Sleeping
Sleeping
Commit
·
a88b526
1
Parent(s):
6db47e9
api v1.2
Browse files
main.py
CHANGED
@@ -7,6 +7,8 @@ from pydantic import BaseModel
|
|
7 |
import uvicorn
|
8 |
import asyncio
|
9 |
import json
|
|
|
|
|
10 |
|
11 |
# Load environment variables
|
12 |
load_dotenv()
|
@@ -56,6 +58,29 @@ def structure_data(text, college_name):
|
|
56 |
class URLRequest(BaseModel):
    """Request body for the /scrape endpoint."""

    # URL of the page to scrape.
    url: str
    # College name forwarded to the data-structuring step.
    college_name: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
# FastAPI endpoint to scrape and structure data
|
61 |
@app.post("/scrape")
|
@@ -72,6 +97,36 @@ async def scrape_and_structure_data(request: URLRequest):
|
|
72 |
except Exception as e:
|
73 |
print(f"Error occurred while processing the request: {e}")
|
74 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
if __name__ == "__main__":
    # Bind on all interfaces so the containerized app is reachable from
    # outside; port 7860 — presumably the hosting platform's expected port.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
7 |
import uvicorn
|
8 |
import asyncio
|
9 |
import json
|
10 |
+
import requests
|
11 |
+
from bs4 import BeautifulSoup
|
12 |
|
13 |
# Load environment variables
|
14 |
load_dotenv()
|
|
|
58 |
class URLRequest(BaseModel):
|
59 |
url: str
|
60 |
college_name: str
|
61 |
+
# Pydantic model for Crawler request
class CrawlerRequest(BaseModel):
    """Request body for the /crawl endpoint."""

    # Topic to search the web for.
    topic_title: str
|
64 |
+
|
65 |
+
# Function to perform Google search and return top N links
def google_search(query, num_results=5):
    """Scrape Google's results page and return up to *num_results* links.

    Args:
        query: Free-text search query (URL-encoded automatically).
        num_results: Maximum number of links to return (default 5).

    Returns:
        list[str]: https:// result links, at most *num_results* of them.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP response.

    NOTE(review): scraping Google's HTML is brittle and may be blocked;
    an official search API would be more reliable for production use.
    """
    headers = {
        # Desktop UA so Google serves the full HTML results page.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    # Pass the query via `params` so requests URL-encodes it (raw f-string
    # interpolation breaks on spaces/special characters), and bound the
    # request with a timeout so a stalled connection cannot hang the API.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": query, "num": num_results},
        headers=headers,
        timeout=10,
    )
    # Fail loudly instead of silently parsing an error/captcha page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    # Organic result anchors on Google's page carry a `jsname` attribute.
    for a in soup.find_all('a', href=True, attrs={'jsname': True}):
        link = a['href']
        if link.startswith("https://"):
            links.append(link)
    return links[:num_results]
|
79 |
+
|
80 |
+
# Function to perform advanced search on specific sites
def advanced_search_on_site(site, topic, num_results=10):
    """Run a Google search restricted to a single domain.

    Args:
        site: Domain to scope the search to (e.g. "quora.com").
        topic: Free-text topic to search for.
        num_results: Maximum number of links to return (default 10).

    Returns:
        list[str]: Links returned by :func:`google_search`.
    """
    # Google's `site:` operator scopes results to the given domain.
    return google_search(f"site:{site} {topic}", num_results)
|
84 |
|
85 |
# FastAPI endpoint to scrape and structure data
|
86 |
@app.post("/scrape")
|
|
|
97 |
except Exception as e:
|
98 |
print(f"Error occurred while processing the request: {e}")
|
99 |
raise HTTPException(status_code=500, detail=str(e))
|
100 |
+
|
101 |
+
# FastAPI endpoint to perform web crawling
@app.post("/crawl")
async def crawl_web(request: CrawlerRequest):
    """Search the web for ``request.topic_title`` and return relevant URLs.

    Collects links from a general Google search plus site-scoped searches
    (Quora, Reddit), then asks Gemini to filter the combined list down to
    the URLs most relevant to the topic.

    Returns:
        dict: ``{"filtered_links": [...]}`` — one URL (or model output
        line) per list entry, blank lines removed.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        topic_title = request.topic_title

        # Get top 5 links from Google search
        google_links = google_search(topic_title, num_results=5)

        # Get links from Quora
        quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)

        # Additional sites can be added similarly
        other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)

        # Combine all links, dropping duplicates (the three searches can
        # overlap) while preserving first-seen order.
        all_links = list(dict.fromkeys(google_links + quora_links + other_links))

        # Use Gemini to filter and list relevant URLs
        prompt = (
            "Filter the following URLs and list only those that are most "
            f"relevant to the topic '{topic_title}':\n{all_links}"
        )
        model = genai.GenerativeModel("gemini-1.5-pro")
        response = model.generate_content(prompt)
        # Split the model output into lines and drop blank separator lines
        # it may emit between URLs.
        filtered_links = [
            line.strip() for line in response.text.splitlines() if line.strip()
        ]

        # Return the filtered links
        return {"filtered_links": filtered_links}
    except Exception as e:
        # Boundary handler: log and surface the failure as a 500.
        print(f"Error occurred while processing the request: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
130 |
+
|
131 |
if __name__ == "__main__":
|
132 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|