Spaces:

adityaiiitr
/

precollege_scraper

Sleeping

App Files Files Community

precollege_scraper / main.py

adityaiiitr

Update main.py

db5c3f6 verified 11 months ago

raw

history blame

2.75 kB

	import os
	import google.generativeai as genai
	from playwright.async_api import async_playwright
	from dotenv import load_dotenv
	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel
	import uvicorn
	import asyncio
	import json

	# Load environment variables
	load_dotenv()

	# Configure Google Generative AI API key
	genai.configure(api_key=os.environ["API_KEY"])

	# FastAPI app initialization
	app = FastAPI()

	# Function to scrape webpage and extract visible text
	async def scrape_visible_text(url):
	async with async_playwright() as p:
	browser = await p.chromium.launch(headless=True) # Launch browser in headless mode
	context = await browser.new_context(
	user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
	viewport={"width": 1280, "height": 800}
	)
	page = await context.new_page()
	await page.goto(url, wait_until="networkidle")
	visible_text = await page.evaluate("document.body.innerText")
	await browser.close()
	return visible_text

	# Function to structure data using Google's Gemini model
	def structure_data(text, college_name):
	prompt = f"Convert the following unstructured text into a structured form with the titles and content containing the data. Don't incldue any kind of text formatting like bold, newline, etc. Properly structure tables and general text. The structured data should contain details only about the college named '{college_name}':\n{text}. Make sure the content include everything such as connectivity, placement, nearby, colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, qna or any other college related parameters only if it is availabale in the above text."
	model = genai.GenerativeModel("gemini-1.5-pro")
	response = model.generate_content(prompt)
	return response.text.strip()

	# Pydantic model for request body
	class URLRequest(BaseModel):
	url: str
	college_name: str

	# FastAPI endpoint to scrape and structure data
	@app.post("/scrape")
	async def scrape_and_structure_data(request: URLRequest):
	try:
	# Scrape visible text from the webpage
	visible_text = await scrape_visible_text(request.url)

	# Structure the data using Google's Gemini model
	structured_data = structure_data(visible_text, request.college_name)

	# Return the structured data
	return {"structured_data": structured_data}
	except Exception as e:
	print(f"Error occurred while processing the request: {e}")
	raise HTTPException(status_code=500, detail=str(e))

	if __name__ == "__main__":
	uvicorn.run(app, host="0.0.0.0", port=7860)