adityasproutsai committed
Commit a88b526 · 1 Parent(s): 6db47e9
Files changed (1)
  1. main.py +55 -0
main.py CHANGED
@@ -7,6 +7,8 @@ from pydantic import BaseModel
 import uvicorn
 import asyncio
 import json
+import requests
+from bs4 import BeautifulSoup
 
 # Load environment variables
 load_dotenv()
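
The two imports added here mean the Space now needs `requests` and `beautifulsoup4` at runtime; since this commit touches only main.py, they will fail at import time if they aren't already in the requirements. An optional fail-fast guard (a reviewer sketch, not part of the commit) that gives a clearer startup error:

# Optional fail-fast guard (reviewer sketch, not in this commit): give a
# clearer error if the new scraping dependencies are missing.
try:
    import requests
    from bs4 import BeautifulSoup  # the bs4 module ships in the beautifulsoup4 package
except ImportError as exc:
    raise SystemExit(
        f"Missing dependency for the crawler endpoints: {exc}. "
        "Add 'requests' and 'beautifulsoup4' to requirements.txt."
    ) from exc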
 
@@ -56,6 +58,29 @@ def structure_data(text, college_name):
 class URLRequest(BaseModel):
     url: str
     college_name: str
+# Pydantic model for Crawler request
+class CrawlerRequest(BaseModel):
+    topic_title: str
+
+# Function to perform Google search and return top N links
+def google_search(query, num_results=5):
+    search_url = f"https://www.google.com/search?q={query}&num={num_results}"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
+    }
+    response = requests.get(search_url, headers=headers)
+    soup = BeautifulSoup(response.text, "html.parser")
+    links = []
+    for a in soup.find_all('a', href=True, attrs={'jsname': True}):
+        link = a['href']
+        if link.startswith("https://"):
+            links.append(link)
+    return links[:num_results]
+
+# Function to perform advanced search on specific sites
+def advanced_search_on_site(site, topic, num_results=10):
+    query = f"site:{site} {topic}"
+    return google_search(query, num_results)
 
 # FastAPI endpoint to scrape and structure data
 @app.post("/scrape")
 
@@ -72,6 +97,36 @@ async def scrape_and_structure_data(request: URLRequest):
     except Exception as e:
         print(f"Error occurred while processing the request: {e}")
         raise HTTPException(status_code=500, detail=str(e))
+
+# FastAPI endpoint to perform web crawling
+@app.post("/crawl")
+async def crawl_web(request: CrawlerRequest):
+    try:
+        topic_title = request.topic_title
+
+        # Get top 5 links from Google search
+        google_links = google_search(topic_title, num_results=5)
+
+        # Get links from Quora
+        quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)
+
+        # Additional sites can be added similarly
+        other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)
 
+        # Combine all links
+        all_links = google_links + quora_links + other_links
+
+        # Use Gemini to filter and list relevant URLs
+        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}"
+        model = genai.GenerativeModel("gemini-1.5-pro")
+        response = model.generate_content(prompt)
+        filtered_links = response.text.strip().split('\n')
+
+        # Return the filtered links
+        return {"filtered_links": filtered_links}
+    except Exception as e:
+        print(f"Error occurred while processing the request: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
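
Once the server is up (the file binds uvicorn to 0.0.0.0:7860), the new endpoint can be exercised with a plain POST. A hedged client sketch: the topic is an example value, and the response key matches the `{"filtered_links": ...}` dict returned above. Because the handler splits Gemini's free-text reply on newlines, `filtered_links` may contain prose lines as well as URLs:

# Client sketch for the new /crawl endpoint (assumes the app is running locally).
import requests

resp = requests.post(
    "http://localhost:7860/crawl",
    json={"topic_title": "best data science bootcamps"},  # example topic
    timeout=120,  # Google scraping plus a Gemini call can take a while
)
resp.raise_for_status()
for line in resp.json()["filtered_links"]:
    print(line)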