adityaiiitr committed on
Commit
cf196e2
·
1 Parent(s): 0ed421b

logging added

Browse files
Files changed (1) hide show
  1. main.py +82 -56
main.py CHANGED
@@ -9,6 +9,7 @@ import asyncio
9
  import json
10
  import requests
11
  from bs4 import BeautifulSoup
 
12
 
13
  # Load environment variables
14
  load_dotenv()
@@ -16,66 +17,95 @@ load_dotenv()
16
  # Configure Google Generative AI API key
17
  genai.configure(api_key=os.environ["API_KEY"])
18
 
 
 
 
 
 
 
 
 
 
 
19
  # FastAPI app initialization
20
  app = FastAPI()
21
 
22
  # Function to scrape webpage and extract visible text
23
  async def scrape_visible_text(url):
24
- async with async_playwright() as p:
25
- browser = await p.chromium.launch(headless=True) # Launch browser in headless mode
26
- context = await browser.new_context(
27
- user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
28
- viewport={"width": 1280, "height": 800},
29
- extra_http_headers={
30
- "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
31
- "accept-encoding": "gzip, deflate, br, zstd",
32
- "accept-language": "en-US,en;q=0.9,hi;q=0.8",
33
- "cache-control": "max-age=0",
34
- "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
35
- "sec-ch-ua-mobile": "?0",
36
- "sec-ch-ua-platform": '"Windows"',
37
- "sec-fetch-dest": "document",
38
- "sec-fetch-mode": "navigate",
39
- "sec-fetch-site": "none",
40
- "sec-fetch-user": "?1",
41
- "upgrade-insecure-requests": "1"
42
- }
43
- )
44
- page = await context.new_page()
45
- await page.goto(url, wait_until="domcontentloaded")
46
- visible_text = await page.evaluate("document.body.innerText")
47
- await browser.close()
48
- return visible_text
 
 
 
 
 
 
49
 
50
  # Function to structure data using Google's Gemini model
51
  def structure_data(text, college_name):
52
- prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no fromatting! --- \n{text} ---. Use only the text between the '---' markers as input source text. If information is not available about any specific thing dont mention it."
53
- model = genai.GenerativeModel("gemini-1.5-pro")
54
- response = model.generate_content(prompt)
55
- return response.text.strip()
 
 
 
 
 
 
56
 
57
  # Pydantic model for request body
58
  class URLRequest(BaseModel):
59
  url: str
60
  college_name: str
61
- # Pydantic model for Crawler request
 
62
  class CrawlerRequest(BaseModel):
63
  topic_title: str
64
 
65
  # Function to perform Google search and return top N links
66
  def google_search(query, num_results=5):
67
- search_url = f"https://www.google.com/search?q={query}&num={num_results}"
68
- headers = {
69
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
70
- }
71
- response = requests.get(search_url, headers=headers)
72
- soup = BeautifulSoup(response.text, "html.parser")
73
- links = []
74
- for a in soup.find_all('a', href=True, attrs={'jsname': True}):
75
- link = a['href']
76
- if link.startswith("https://") and not link.includes("google.com"):
77
- links.append(link)
78
- return links[:num_results]
 
 
 
 
 
 
79
 
80
  # Function to perform advanced search on specific sites
81
  def advanced_search_on_site(site, topic, num_results=10):
@@ -86,48 +116,44 @@ def advanced_search_on_site(site, topic, num_results=10):
86
  @app.post("/scrape")
87
  async def scrape_and_structure_data(request: URLRequest):
88
  try:
 
89
  # Scrape visible text from the webpage
90
  visible_text = await scrape_visible_text(request.url)
91
-
92
  # Structure the data using Google's Gemini model
93
  structured_data = structure_data(visible_text, request.college_name)
94
-
95
  # Return the structured data
96
  return {"structured_data": structured_data}
97
  except Exception as e:
98
- print(f"Error occurred while processing the request: {e}")
99
  raise HTTPException(status_code=500, detail=str(e))
100
-
101
  # FastAPI endpoint to perform web crawling
102
  @app.post("/crawl")
103
  async def crawl_web(request: CrawlerRequest):
104
  try:
105
  topic_title = request.topic_title
106
-
107
  # Get top 5 links from Google search
108
  google_links = google_search(topic_title, num_results=10)
109
-
110
  # Get links from Quora
111
  quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)
112
-
113
  # Additional sites can be added similarly
114
  other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)
115
-
116
  # Combine all links
117
  all_links = google_links + quora_links + other_links
118
-
119
  # Use Gemini to filter and list relevant URLs
120
  prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
121
  model = genai.GenerativeModel("gemini-1.5-pro")
122
  response = model.generate_content(prompt)
123
  filtered_links = response.text.strip().split('\n')
124
-
125
  # Return the filtered links
126
- return {"links": all_links,
127
- "filtered_links": filtered_links}
128
  except Exception as e:
129
- print(f"Error occurred while processing the request: {e}")
130
  raise HTTPException(status_code=500, detail=str(e))
131
-
132
  if __name__ == "__main__":
 
133
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
9
  import json
10
  import requests
11
  from bs4 import BeautifulSoup
12
+ import logging
13
 
14
  # Load environment variables
15
  load_dotenv()
 
17
  # Configure Google Generative AI API key
18
  genai.configure(api_key=os.environ["API_KEY"])
19
 
20
+ # Set up logging
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
24
+ handlers=[
25
+ logging.StreamHandler()
26
+ ]
27
+ )
28
+ logger = logging.getLogger("ScrapeStructureApp")
29
+
30
  # FastAPI app initialization
31
  app = FastAPI()
32
 
33
  # Function to scrape webpage and extract visible text
34
  async def scrape_visible_text(url):
35
+ try:
36
+ logger.info(f"Starting to scrape visible text from URL: {url}")
37
+ async with async_playwright() as p:
38
+ browser = await p.chromium.launch(headless=True) # Launch browser in headless mode
39
+ context = await browser.new_context(
40
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
41
+ viewport={"width": 1280, "height": 800},
42
+ extra_http_headers={
43
+ "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
44
+ "accept-encoding": "gzip, deflate, br, zstd",
45
+ "accept-language": "en-US,en;q=0.9,hi;q=0.8",
46
+ "cache-control": "max-age=0",
47
+ "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
48
+ "sec-ch-ua-mobile": "?0",
49
+ "sec-ch-ua-platform": '"Windows"',
50
+ "sec-fetch-dest": "document",
51
+ "sec-fetch-mode": "navigate",
52
+ "sec-fetch-site": "none",
53
+ "sec-fetch-user": "?1",
54
+ "upgrade-insecure-requests": "1"
55
+ }
56
+ )
57
+ page = await context.new_page()
58
+ await page.goto(url, wait_until="domcontentloaded")
59
+ visible_text = await page.evaluate("document.body.innerText")
60
+ await browser.close()
61
+ logger.info(f"Successfully scraped visible text from URL: {url}")
62
+ return visible_text
63
+ except Exception as e:
64
+ logger.error(f"Error while scraping visible text from URL {url}: {e}")
65
+ raise
66
 
67
  # Function to structure data using Google's Gemini model
68
  def structure_data(text, college_name):
69
+ try:
70
+ logger.info(f"Starting to structure data for college: {college_name}")
71
+ prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no formatting! --- \n{text} ---. Use only the text between the '---' markers as input source text. If information is not available about any specific thing dont mention it."
72
+ model = genai.GenerativeModel("gemini-1.5-pro")
73
+ response = model.generate_content(prompt)
74
+ logger.info(f"Successfully structured data for college: {college_name}")
75
+ return response.text.strip()
76
+ except Exception as e:
77
+ logger.error(f"Error while structuring data for college {college_name}: {e}")
78
+ raise
79
 
80
  # Pydantic model for request body
81
  class URLRequest(BaseModel):
82
  url: str
83
  college_name: str
84
+
85
+ # Pydantic model for Crawler request
86
  class CrawlerRequest(BaseModel):
87
  topic_title: str
88
 
89
  # Function to perform Google search and return top N links
90
  def google_search(query, num_results=5):
91
+ try:
92
+ logger.info(f"Performing Google search for query: {query}")
93
+ search_url = f"https://www.google.com/search?q={query}&num={num_results}"
94
+ headers = {
95
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
96
+ }
97
+ response = requests.get(search_url, headers=headers)
98
+ soup = BeautifulSoup(response.text, "html.parser")
99
+ links = []
100
+ for a in soup.find_all('a', href=True, attrs={'jsname': True}):
101
+ link = a['href']
102
+ if link.startswith("https://") and not link.__contains__("google.com"):
103
+ links.append(link)
104
+ logger.info(f"Successfully retrieved {len(links)} links for query: {query}")
105
+ return links[:num_results]
106
+ except Exception as e:
107
+ logger.error(f"Error while performing Google search for query {query}: {e}")
108
+ raise
109
 
110
  # Function to perform advanced search on specific sites
111
  def advanced_search_on_site(site, topic, num_results=10):
 
116
  @app.post("/scrape")
117
  async def scrape_and_structure_data(request: URLRequest):
118
  try:
119
+ logger.info(f"Received scrape request for URL: {request.url}, College Name: {request.college_name}")
120
  # Scrape visible text from the webpage
121
  visible_text = await scrape_visible_text(request.url)
 
122
  # Structure the data using Google's Gemini model
123
  structured_data = structure_data(visible_text, request.college_name)
124
+ logger.info(f"Successfully processed scrape request for URL: {request.url}")
125
  # Return the structured data
126
  return {"structured_data": structured_data}
127
  except Exception as e:
128
+ logger.error(f"Error occurred while processing scrape request for URL {request.url}: {e}")
129
  raise HTTPException(status_code=500, detail=str(e))
130
+
131
  # FastAPI endpoint to perform web crawling
132
  @app.post("/crawl")
133
  async def crawl_web(request: CrawlerRequest):
134
  try:
135
  topic_title = request.topic_title
136
+ logger.info(f"Received crawl request for topic: {topic_title}")
137
  # Get top 5 links from Google search
138
  google_links = google_search(topic_title, num_results=10)
 
139
  # Get links from Quora
140
  quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)
 
141
  # Additional sites can be added similarly
142
  other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)
 
143
  # Combine all links
144
  all_links = google_links + quora_links + other_links
 
145
  # Use Gemini to filter and list relevant URLs
146
  prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
147
  model = genai.GenerativeModel("gemini-1.5-pro")
148
  response = model.generate_content(prompt)
149
  filtered_links = response.text.strip().split('\n')
150
+ logger.info(f"Successfully processed crawl request for topic: {topic_title}")
151
  # Return the filtered links
152
+ return {"links": all_links, "filtered_links": filtered_links}
 
153
  except Exception as e:
154
+ logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
155
  raise HTTPException(status_code=500, detail=str(e))
156
+
157
  if __name__ == "__main__":
158
+ logger.info("Starting PreCollege Data Scraper Server...")
159
  uvicorn.run(app, host="0.0.0.0", port=7860)