Spaces:

adityaiiitr
/

precollege_scraper

Sleeping

App Files Files Community

adityaiiitr committed on Oct 13, 2024

Commit

cf196e2

1 Parent(s): 0ed421b

logging added

Browse files

Files changed (1) hide show

main.py +82 -56

main.py CHANGED Viewed

@@ -9,6 +9,7 @@ import asyncio
 import json
 import requests
 from bs4 import BeautifulSoup
 # Load environment variables
 load_dotenv()
@@ -16,66 +17,95 @@ load_dotenv()
 # Configure Google Generative AI API key
 genai.configure(api_key=os.environ["API_KEY"])
 # FastAPI app initialization
 app = FastAPI()
 # Function to scrape webpage and extract visible text
 async def scrape_visible_text(url):
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=True)  # Launch browser in headless mode
-        context = await browser.new_context(
-            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
-            viewport={"width": 1280, "height": 800},
-            extra_http_headers={
-                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-                "accept-encoding": "gzip, deflate, br, zstd",
-                "accept-language": "en-US,en;q=0.9,hi;q=0.8",
-                "cache-control": "max-age=0",
-                "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
-                "sec-ch-ua-mobile": "?0",
-                "sec-ch-ua-platform": '"Windows"',
-                "sec-fetch-dest": "document",
-                "sec-fetch-mode": "navigate",
-                "sec-fetch-site": "none",
-                "sec-fetch-user": "?1",
-                "upgrade-insecure-requests": "1"
-            }
-        )
-        page = await context.new_page()
-        await page.goto(url, wait_until="domcontentloaded")
-        visible_text = await page.evaluate("document.body.innerText")
-        await browser.close()
-        return visible_text
 # Function to structure data using Google's Gemini model
 def structure_data(text, college_name):
-    prompt = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no fromatting! --- \n{text} ---. Use only the text between the '---' markers as input source text. If information is not available about any specific thing dont mention it."
-    model = genai.GenerativeModel("gemini-1.5-pro")
-    response = model.generate_content(prompt)
-    return response.text.strip()
 # Pydantic model for request body
 class URLRequest(BaseModel):
     url: str
     college_name: str
-    # Pydantic model for Crawler request
 class CrawlerRequest(BaseModel):
     topic_title: str
 # Function to perform Google search and return top N links
 def google_search(query, num_results=5):
-    search_url = f"https://www.google.com/search?q={query}&num={num_results}"
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
-    }
-    response = requests.get(search_url, headers=headers)
-    soup = BeautifulSoup(response.text, "html.parser")
-    links = []
-    for a in soup.find_all('a', href=True, attrs={'jsname': True}):
-        link = a['href']
-        if link.startswith("https://") and not link.includes("google.com"):
-            links.append(link)
-    return links[:num_results]
 # Function to perform advanced search on specific sites
 def advanced_search_on_site(site, topic, num_results=10):
@@ -86,48 +116,44 @@ def advanced_search_on_site(site, topic, num_results=10):
 @app.post("/scrape")
 async def scrape_and_structure_data(request: URLRequest):
     try:
         # Scrape visible text from the webpage
         visible_text = await scrape_visible_text(request.url)
         # Structure the data using Google's Gemini model
         structured_data = structure_data(visible_text, request.college_name)
         # Return the structured data
         return {"structured_data": structured_data}
     except Exception as e:
-        print(f"Error occurred while processing the request: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 # FastAPI endpoint to perform web crawling
 @app.post("/crawl")
 async def crawl_web(request: CrawlerRequest):
     try:
         topic_title = request.topic_title
         # Get top 10 links from Google search
         google_links = google_search(topic_title, num_results=10)
         # Get links from Quora
         quora_links = advanced_search_on_site("quora.com", topic_title, num_results=10)
         # Additional sites can be added similarly
         other_links = advanced_search_on_site("reddit.com", topic_title, num_results=10)
         # Combine all links
         all_links = google_links + quora_links + other_links
         # Use Gemini to filter and list relevant URLs
         prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
         model = genai.GenerativeModel("gemini-1.5-pro")
         response = model.generate_content(prompt)
         filtered_links = response.text.strip().split('\n')
         # Return the filtered links
-        return {"links": all_links,
-                "filtered_links": filtered_links}
     except Exception as e:
-        print(f"Error occurred while processing the request: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

 import json
 import requests
 from bs4 import BeautifulSoup
+import logging
 # Load environment variables before anything reads os.environ
 load_dotenv()
 # Configure the Google Generative AI client; a missing API_KEY raises KeyError here
 genai.configure(api_key=os.environ["API_KEY"])
 # Logging: INFO and above, timestamp/name/level/message, through one stream handler
 logging.basicConfig(
     handlers=[logging.StreamHandler()],
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
     level=logging.INFO,
 )
 # Shared application logger used by every function in this module
 logger = logging.getLogger("ScrapeStructureApp")
 # FastAPI application instance the endpoints below attach to
 app = FastAPI()
 # Function to scrape webpage and extract visible text
 async def scrape_visible_text(url):
     """Return the visible text of the page at ``url``.

     The page is opened in headless Chromium through Playwright using a
     desktop-Chrome user agent, a 1280x800 viewport and a fixed set of request
     headers. Navigation waits for ``domcontentloaded`` and the text is read
     from ``document.body.innerText``.

     Any exception is logged and then re-raised unchanged.
     """
     desktop_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
     # Extra headers sent with every request made by the browser context.
     request_headers = {
         "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
         "accept-encoding": "gzip, deflate, br, zstd",
         "accept-language": "en-US,en;q=0.9,hi;q=0.8",
         "cache-control": "max-age=0",
         "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
         "sec-ch-ua-mobile": "?0",
         "sec-ch-ua-platform": '"Windows"',
         "sec-fetch-dest": "document",
         "sec-fetch-mode": "navigate",
         "sec-fetch-site": "none",
         "sec-fetch-user": "?1",
         "upgrade-insecure-requests": "1",
     }
     try:
         logger.info(f"Starting to scrape visible text from URL: {url}")
         async with async_playwright() as playwright:
             # Headless launch: no window is shown.
             chromium = await playwright.chromium.launch(headless=True)
             browser_context = await chromium.new_context(
                 user_agent=desktop_user_agent,
                 viewport={"width": 1280, "height": 800},
                 extra_http_headers=request_headers,
             )
             tab = await browser_context.new_page()
             await tab.goto(url, wait_until="domcontentloaded")
             page_text = await tab.evaluate("document.body.innerText")
             await chromium.close()
             logger.info(f"Successfully scraped visible text from URL: {url}")
             return page_text
     except Exception as exc:
         logger.error(f"Error while scraping visible text from URL {url}: {exc}")
         raise
 # Function to structure data using Google's Gemini model
 def structure_data(text, college_name):
     """Ask Gemini to rewrite scraped ``text`` as a structured write-up.

     Args:
         text: Raw text scraped from a page; embedded between ``---`` markers
             in the prompt.
         college_name: College the write-up should be about.

     Returns:
         The model's response text with surrounding whitespace stripped.

     Any exception is logged and then re-raised unchanged.
     """
     try:
         logger.info(f"Starting to structure data for college: {college_name}")
         instructions = f"Convert the following unstructured text into a well-written and comprehensive structured form with titles and content containing all relevant data. The response should be a detailed paragraph mentioning everything about the college named '{college_name}', ensuring no important information is missed. Include details such as connectivity, placement, nearby colleges, infrastructure, courses, branches, students, festivals, clubs, reviews, Q&A, and any other college-related parameters available in the text. Provide the response text with no formatting! --- \n{text} ---. Use only the text between the '---' markers as input source text. If information is not available about any specific thing dont mention it."
         gemini = genai.GenerativeModel("gemini-1.5-pro")
         reply = gemini.generate_content(instructions)
         logger.info(f"Successfully structured data for college: {college_name}")
         structured_text = reply.text.strip()
         return structured_text
     except Exception as exc:
         logger.error(f"Error while structuring data for college {college_name}: {exc}")
         raise
 # Pydantic model for request body
 class URLRequest(BaseModel):
     # Request body for POST /scrape.
     # (Kept as comments: a class docstring would alter the generated schema description.)
     # url: page address handed to scrape_visible_text.
     url: str
     # college_name: college named in the Gemini structuring prompt.
     college_name: str
+# Pydantic model for Crawler request
 class CrawlerRequest(BaseModel):
     # Request body for POST /crawl.
     # topic_title: topic used as the search query and in the link-filtering prompt.
     topic_title: str
 # Function to perform Google search and return top N links
 def google_search(query, num_results=5):
     """Scrape a Google results page and return up to ``num_results`` links.

     Args:
         query: Free-text search query. It is sent through ``params`` so that
             ``requests`` URL-encodes it.
         num_results: Sent as Google's ``num`` parameter and also used as the
             maximum number of links returned.

     Returns:
         A list of at most ``num_results`` hrefs that start with ``https://``
         and do not contain ``google.com``, taken from anchors that carry a
         ``jsname`` attribute.

     Raises:
         Exception: Whatever the HTTP request or HTML parsing raised; the
             error is logged first and then re-raised unchanged.
     """
     try:
         logger.info(f"Performing Google search for query: {query}")
         headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
         }
         # Passing the query via ``params`` percent-encodes spaces, '&', '#',
         # etc.; interpolating it raw into the URL would corrupt the request.
         # The timeout keeps a stalled connection from hanging the caller forever.
         response = requests.get(
             "https://www.google.com/search",
             params={"q": query, "num": num_results},
             headers=headers,
             timeout=10,
         )
         soup = BeautifulSoup(response.text, "html.parser")
         links = [
             a["href"]
             for a in soup.find_all('a', href=True, attrs={'jsname': True})
             if a["href"].startswith("https://") and "google.com" not in a["href"]
         ]
         # The logged count is the number found before truncation.
         logger.info(f"Successfully retrieved {len(links)} links for query: {query}")
         return links[:num_results]
     except Exception as e:
         logger.error(f"Error while performing Google search for query {query}: {e}")
         raise
 # Function to perform advanced search on specific sites
 def advanced_search_on_site(site, topic, num_results=10):
 @app.post("/scrape")
 async def scrape_and_structure_data(request: URLRequest):
     """Scrape ``request.url`` and return a Gemini-structured write-up.

     Returns:
         ``{"structured_data": <str>}`` on success.

     Raises:
         HTTPException: Status 500 with the error text as ``detail`` when
             scraping or structuring fails.
     """
     try:
         logger.info(f"Received scrape request for URL: {request.url}, College Name: {request.college_name}")
         # Scrape visible text from the webpage
         visible_text = await scrape_visible_text(request.url)
         # structure_data is synchronous and waits on a remote model call.
         # Running it in the default executor keeps the event loop free to
         # serve other requests in the meantime.
         loop = asyncio.get_running_loop()
         structured_data = await loop.run_in_executor(
             None, structure_data, visible_text, request.college_name
         )
         logger.info(f"Successfully processed scrape request for URL: {request.url}")
         # Return the structured data
         return {"structured_data": structured_data}
     except Exception as e:
         logger.error(f"Error occurred while processing scrape request for URL {request.url}: {e}")
         # Chain the cause so the original traceback is preserved.
         raise HTTPException(status_code=500, detail=str(e)) from e
 # FastAPI endpoint to perform web crawling
 @app.post("/crawl")
 async def crawl_web(request: CrawlerRequest):
     """Collect candidate links for a topic and let Gemini pick the relevant ones.

     Returns:
         ``{"links": <all collected links>, "filtered_links": <lines of the
         model's reply>}``.

     Raises:
         HTTPException: Status 500 with the error text as ``detail`` when any
             search or the model call fails.
     """
     # Bound before the try so the except handler can always reference it.
     topic_title = request.topic_title
     try:
         logger.info(f"Received crawl request for topic: {topic_title}")
         # The search helpers and the Gemini client are synchronous; run them
         # in the default executor so the event loop is not blocked.
         loop = asyncio.get_running_loop()
         # Get up to 10 links from Google search
         google_links = await loop.run_in_executor(None, google_search, topic_title, 10)
         # Get up to 10 links from Quora
         quora_links = await loop.run_in_executor(
             None, advanced_search_on_site, "quora.com", topic_title, 10
         )
         # Additional sites can be added similarly
         other_links = await loop.run_in_executor(
             None, advanced_search_on_site, "reddit.com", topic_title, 10
         )
         # Combine all links
         all_links = google_links + quora_links + other_links
         # Use Gemini to filter and list relevant URLs
         prompt = f"Filter the following URLs and list only those that are most relevant to the topic '{topic_title}':\n{all_links}. Response should only contain the array of links with no formatting."
         model = genai.GenerativeModel("gemini-1.5-pro")
         response = await loop.run_in_executor(None, model.generate_content, prompt)
         # One entry per line of the model's reply.
         filtered_links = response.text.strip().split('\n')
         logger.info(f"Successfully processed crawl request for topic: {topic_title}")
         # Return both the raw and the filtered links
         return {"links": all_links, "filtered_links": filtered_links}
     except Exception as e:
         logger.error(f"Error occurred while processing crawl request for topic {topic_title}: {e}")
         # Chain the cause so the original traceback is preserved.
         raise HTTPException(status_code=500, detail=str(e)) from e
 if __name__ == "__main__":
     # Script entry point: log the startup, then serve the app on all
     # interfaces at port 7860.
     logger.info("Starting PreCollege Data Scraper Server...")
     uvicorn.run(app, port=7860, host="0.0.0.0")