adityaiiitr committed on
Commit
60b9061
·
verified ·
1 Parent(s): 2c38cdf

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +63 -63
main.py CHANGED
@@ -1,63 +1,63 @@
1
- import os
2
- import google.generativeai as genai
3
- from playwright.async_api import async_playwright
4
- from dotenv import load_dotenv
5
- from fastapi import FastAPI, HTTPException
6
- from pydantic import BaseModel
7
- import uvicorn
8
- import asyncio
9
- import json
10
-
11
- # Load environment variables
12
- load_dotenv()
13
-
14
- # Configure Google Generative AI API key
15
- genai.configure(api_key=os.environ["API_KEY"])
16
-
17
- # FastAPI app initialization
18
- app = FastAPI()
19
-
20
- # Function to scrape webpage and extract visible text
21
- async def scrape_visible_text(url):
22
- async with async_playwright() as p:
23
- browser = await p.chromium.launch(headless=True) # Launch browser in headless mode
24
- context = await browser.new_context(
25
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
26
- viewport={"width": 1280, "height": 800}
27
- )
28
- page = await context.new_page()
29
- await page.goto(url, wait_until="networkidle")
30
- visible_text = await page.evaluate("document.body.innerText")
31
- await browser.close()
32
- return visible_text
33
-
34
- # Function to structure data using Google's Gemini model
35
- def structure_data(text, college_name):
36
- prompt = f"Convert the following unstructured text into a structured format with the titles and content containing the data. Properly structure tables and general text. The structured data should contain details only about the college named '{college_name}':\n{text}"
37
- model = genai.GenerativeModel("gemini-1.5-flash")
38
- response = model.generate_content(prompt)
39
- return response.text.strip()
40
-
41
- # Pydantic model for request body
42
- class URLRequest(BaseModel):
43
- url: str
44
- college_name: str
45
-
46
- # FastAPI endpoint to scrape and structure data
47
- @app.post("/scrape")
48
- async def scrape_and_structure_data(request: URLRequest):
49
- try:
50
- # Scrape visible text from the webpage
51
- visible_text = await scrape_visible_text(request.url)
52
-
53
- # Structure the data using Google's Gemini model
54
- structured_data = structure_data(visible_text, request.college_name)
55
-
56
- # Return the structured data
57
- return {"structured_data": structured_data}
58
- except Exception as e:
59
- print(f"Error occurred while processing the request: {e}")
60
- raise HTTPException(status_code=500, detail=str(e))
61
-
62
- if __name__ == "__main__":
63
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
+ import os
2
+ import google.generativeai as genai
3
+ from playwright.async_api import async_playwright
4
+ from dotenv import load_dotenv
5
+ from fastapi import FastAPI, HTTPException
6
+ from pydantic import BaseModel
7
+ import uvicorn
8
+ import asyncio
9
+ import json
10
+
11
# Load environment variables via python-dotenv before anything reads os.environ.
load_dotenv()

# Configure Google Generative AI API key.
# NOTE: os.environ["API_KEY"] raises KeyError at import time if API_KEY is unset,
# so the module cannot be imported without it.
genai.configure(api_key=os.environ["API_KEY"])

# FastAPI app initialization; route decorators below register against this object.
app = FastAPI()
19
+
20
+ # Function to scrape webpage and extract visible text
21
async def scrape_visible_text(url):
    """Render ``url`` in headless Chromium and return the page's visible text.

    Args:
        url: Address of the page to load.

    Returns:
        The value of ``document.body.innerText`` after navigation finishes.

    Raises:
        Any Playwright error raised while launching the browser, navigating,
        or evaluating the script propagates to the caller.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
        try:
            # A desktop user agent and viewport are set explicitly on the context.
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
                viewport={"width": 1280, "height": 800},
            )
            page = await context.new_page()
            # Wait for network activity to settle so late-loading content is present.
            await page.goto(url, wait_until="networkidle")
            return await page.evaluate("document.body.innerText")
        finally:
            # Close the browser even when navigation or evaluation fails.
            await browser.close()
33
+
34
+ # Function to structure data using Google's Gemini model
35
def structure_data(text, college_name):
    """Ask the Gemini model to restructure scraped text about one college.

    Args:
        text: Unstructured text to be organised.
        college_name: Name of the college the output should be restricted to.

    Returns:
        The model's response text with surrounding whitespace stripped.
    """
    # Instruction header; the scraped text is appended after a newline.
    instructions = (
        "Convert the following unstructured text into a structured format "
        "with the titles and content containing the data. "
        "Properly structure tables and general text. "
        "The structured data should contain details only about the college "
        f"named '{college_name}':"
    )
    full_prompt = f"{instructions}\n{text}"

    gemini = genai.GenerativeModel("gemini-1.5-flash")
    reply = gemini.generate_content(full_prompt)
    return reply.text.strip()
40
+
41
+ # Pydantic model for request body
42
# Request body accepted by the POST /scrape endpoint.
class URLRequest(BaseModel):
    # Address of the page to scrape; passed as-is to scrape_visible_text().
    url: str
    # College whose details the structured output should be limited to.
    college_name: str
45
+
46
+ # FastAPI endpoint to scrape and structure data
47
@app.post("/scrape")
async def scrape_and_structure_data(request: URLRequest):
    """Scrape the requested page and return its text structured by Gemini.

    Args:
        request: Body carrying the target ``url`` and the ``college_name``
            to focus the structured output on.

    Returns:
        A dict of the form ``{"structured_data": <str>}``.

    Raises:
        HTTPException: With status 500 and the error message as detail when
            scraping or structuring fails.
    """
    try:
        # Scrape visible text from the webpage
        visible_text = await scrape_visible_text(request.url)

        # structure_data() makes a synchronous model call; run it in the
        # default executor so the event loop stays free for other requests.
        loop = asyncio.get_running_loop()
        structured_data = await loop.run_in_executor(
            None, structure_data, visible_text, request.college_name
        )

        # Return the structured data
        return {"structured_data": structured_data}
    except Exception as e:
        print(f"Error occurred while processing the request: {e}")
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
61
+
62
# Start the ASGI server only when this file is executed directly.
if __name__ == "__main__":
    # Listens on all interfaces (0.0.0.0).
    # NOTE(review): port 7860 is presumably what the hosting platform expects — confirm.
    uvicorn.run(app, host="0.0.0.0", port=7860)