Spaces:

adityaiiitr
/

precollege_scraper

Sleeping

App Files Community

adityaiiitr committed on Oct 10, 2024

Commit

60b9061

verified ·

1 Parent(s): 2c38cdf

Update main.py

Browse files

Files changed (1) hide show

main.py +63 -63

main.py CHANGED Viewed

@@ -1,63 +1,63 @@
-import os
-import google.generativeai as genai
-from playwright.async_api import async_playwright
-from dotenv import load_dotenv
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-import uvicorn
-import asyncio
-import json
-# Load environment variables
-load_dotenv()
-# Configure Google Generative AI API key
-genai.configure(api_key=os.environ["API_KEY"])
-# FastAPI app initialization
-app = FastAPI()
-# Function to scrape webpage and extract visible text
-async def scrape_visible_text(url):
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
-        context = await browser.new_context(
-            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
-            viewport={"width": 1280, "height": 800}
-        )
-        page = await context.new_page()
-        await page.goto(url, wait_until="networkidle")
-        visible_text = await page.evaluate("document.body.innerText")
-        await browser.close()
-        return visible_text
-# Function to structure data using Google's Gemini model
-def structure_data(text, college_name):
-    prompt = f"Convert the following unstructured text into a structured format with the titles and content containing the data. Properly structure tables and general text. The structured data should contain details only about the college named '{college_name}':\n{text}"
-    model = genai.GenerativeModel("gemini-1.5-flash")
-    response = model.generate_content(prompt)
-    return response.text.strip()
-# Pydantic model for request body
-class URLRequest(BaseModel):
-    url: str
-    college_name: str
-# FastAPI endpoint to scrape and structure data
-@app.post("/scrape")
-async def scrape_and_structure_data(request: URLRequest):
-    try:
-        # Scrape visible text from the webpage
-        visible_text = await scrape_visible_text(request.url)
-        # Structure the data using Google's Gemini model
-        structured_data = structure_data(visible_text, request.college_name)
-        # Return the structured data
-        return {"structured_data": structured_data}
-    except Exception as e:
-        print(f"Error occurred while processing the request: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+import os
+import google.generativeai as genai
+from playwright.async_api import async_playwright
+from dotenv import load_dotenv
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import uvicorn
+import asyncio
+import json
+# Load environment variables
+load_dotenv()
+# Configure Google Generative AI API key
+genai.configure(api_key=os.environ["API_KEY"])
+# FastAPI app initialization
+app = FastAPI()
+# Function to scrape webpage and extract visible text
+async def scrape_visible_text(url):
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
+        context = await browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
+            viewport={"width": 1280, "height": 800}
+        )
+        page = await context.new_page()
+        await page.goto(url, wait_until="networkidle")
+        visible_text = await page.evaluate("document.body.innerText")
+        await browser.close()
+        return visible_text
+# Function to structure data using Google's Gemini model
+def structure_data(text, college_name):
+    prompt = f"Convert the following unstructured text into a structured format with the titles and content containing the data. Properly structure tables and general text. The structured data should contain details only about the college named '{college_name}':\n{text}"
+    model = genai.GenerativeModel("gemini-1.5-flash")
+    response = model.generate_content(prompt)
+    return response.text.strip()
+# Pydantic model for request body
+class URLRequest(BaseModel):
+    url: str
+    college_name: str
+# FastAPI endpoint to scrape and structure data
+@app.post("/scrape")
+async def scrape_and_structure_data(request: URLRequest):
+    try:
+        # Scrape visible text from the webpage
+        visible_text = await scrape_visible_text(request.url)
+        # Structure the data using Google's Gemini model
+        structured_data = structure_data(visible_text, request.college_name)
+        # Return the structured data
+        return {"structured_data": structured_data}
+    except Exception as e:
+        print(f"Error occurred while processing the request: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)