adityasproutsai committed
Commit a88b526 · 1 Parent(s): 6db47e9
Files changed (1)
  1. main.py +55 -0
main.py CHANGED
@@ -7,6 +7,8 @@ from pydantic import BaseModel
 import uvicorn
 import asyncio
 import json
+import requests
+from bs4 import BeautifulSoup
 
 # Load environment variables
 load_dotenv()
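
The two imports added here mean the Space now needs `requests` and `beautifulsoup4` at runtime; since this commit touches only main.py, they will fail at import time if they aren't already in the requirements. An optional fail-fast guard (a reviewer sketch, not part of the commit) that gives a clearer startup error:

# Optional fail-fast guard (reviewer sketch, not in this commit): give a
# clearer error if the new scraping dependencies are missing.
try:
    import requests
    from bs4 import BeautifulSoup  # the bs4 module ships in the beautifulsoup4 package
except ImportError as exc:
    raise SystemExit(
        f"Missing dependency for the crawler endpoints: {exc}. "
        "Add 'requests' and 'beautifulsoup4' to requirements.txt."
    ) from exc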
 
@@ -56,6 +58,29 @@ def structure_data(text, college_name):
 class URLRequest(BaseModel):
     url: str
     college_name: str
+# Pydantic model for Crawler request
+class CrawlerRequest(BaseModel):
+    topic_title: str
+
+# Function to perform Google search and return top N links
+def google_search(query, num_results=5):
+    search_url = f"https://www.google.com/search?q={query}&num={num_results}"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
+    }
+    response = requests.get(search_url, headers=headers)
+    soup = BeautifulSoup(response.text, "html.parser")
+    links = []
+    for a in soup.find_all('a', href=True, attrs={'jsname': True}):
+        link = a['href']
+        if link.startswith("https://"):
+            links.append(link)
+    return links[:num_results]
+
+# Function to perform advanced search on specific sites
+def advanced_search_on_site(site, topic, num_results=10):
+    query = f"site:{site} {topic}"
+    return google_search(query, num_results)
 
 # FastAPI endpoint to scrape and structure data
 @app.post("/scrape")
 
@@ -72,6 +97,36 @@ async def scrape_and_structure_data(request: URLRequest):
     except Exception as e:
         print(f"Error occurred while processing the request: {e}")
         raise HTTPException(status_code=500, detail=str(e))
+
+# FastAPI endpoint to perform web crawling
+@app.post("/crawl")
+async def crawl_web(request: CrawlerRequest):
+    try:
+        topic_title = request.topic_title
+
+        # Get top 5 links from Google search
+        google_links = google_search(topic_title, num_results=5)
+
+        # Get links from Quora
+        quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)
+
+        # Additional sites can be added similarly
+        other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)
 
+        # Combine all links
+        all_links = google_links + quora_links + other_links
+
+        # Use Gemini to filter and list relevant URLs
+        prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}"
+        model = genai.GenerativeModel("gemini-1.5-pro")
+        response = model.generate_content(prompt)
+        filtered_links = response.text.strip().split('\n')
+
+        # Return the filtered links
+        return {"filtered_links": filtered_links}
+    except Exception as e:
+        print(f"Error occurred while processing the request: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
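
Once the server is up (the file binds uvicorn to 0.0.0.0:7860), the new endpoint can be exercised with a plain POST. A hedged client sketch: the topic is an example value, and the response key matches the `{"filtered_links": ...}` dict returned above. Because the handler splits Gemini's free-text reply on newlines, `filtered_links` may contain prose lines as well as URLs:

# Client sketch for the new /crawl endpoint (assumes the app is running locally).
import requests

resp = requests.post(
    "http://localhost:7860/crawl",
    json={"topic_title": "best data science bootcamps"},  # example topic
    timeout=120,  # Google scraping plus a Gemini call can take a while
)
resp.raise_for_status()
for line in resp.json()["filtered_links"]:
    print(line)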