Spaces:

SaiKumar1627
/

Project2

Sleeping

App Files Files Community

Project2 / deliverable2.py

SaiKumar1627

Update deliverable2.py

11e62a5 verified 6 months ago

raw

history blame

7.27 kB

	import requests
	from bs4 import BeautifulSoup
	import pandas as pd
	from sentence_transformers import SentenceTransformer, util
	from transformers import pipeline

	class URLValidator:
	    """
	    Rate how credible and relevant a webpage is for a given user query.

	    Four signals are combined into one 0-100 validity score: domain trust,
	    semantic relevance to the query, a fact-check rating and a
	    sentiment-based bias score. Domain trust and fact-checking are mock
	    heuristics that produce 1-5 ratings; `rate_url_validity` rescales them
	    to 0-100 before weighting so every signal is on the same scale.
	    """

	    # Every failure message produced by fetch_page_content starts with this.
	    _ERROR_PREFIX = "Error:"

	    # Factor that maps the 1-5 mock ratings onto the 0-100 scale used by the
	    # similarity and bias scores (and expected by generate_explanation).
	    _RATING_TO_PERCENT = 20

	    # Sentiment label -> bias score. The cardiffnlp twitter-roberta-base
	    # sentiment model reports LABEL_0/LABEL_1/LABEL_2 (negative/neutral/
	    # positive); other sentiment models use the spelled-out names. Labels
	    # are upper-cased before lookup; anything unlisted gets the default.
	    _SENTIMENT_SCORES = {
	        "POSITIVE": 100,
	        "LABEL_2": 100,
	        "NEUTRAL": 50,
	        "LABEL_1": 50,
	    }
	    _DEFAULT_SENTIMENT_SCORE = 30

	    def __init__(self):
	        # Load models once so they are reused across every validation call.
	        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
	        # NOTE: no method of this class calls the fake-news classifier; the
	        # attribute is kept so code that accesses it keeps working.
	        self.fake_news_classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news-detection")
	        self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

	    @classmethod
	    def _is_error(cls, content: str) -> bool:
	        """ Return True when `content` is a failure message from fetch_page_content. """
	        # A prefix test, not a substring test: ordinary pages that merely
	        # mention the word "Error" must not be treated as failed fetches.
	        return content.startswith(cls._ERROR_PREFIX)

	    def fetch_page_content(self, url: str) -> str:
	        """
	        Fetch `url` and return the text of its <p> elements joined by spaces.

	        Network and HTTP failures are not raised; they are returned as a
	        string starting with "Error:", as is the case where the page has no
	        paragraph text.
	        """
	        try:
	            headers = {"User-Agent": "Mozilla/5.0"}
	            response = requests.get(url, timeout=10, headers=headers)
	            response.raise_for_status()
	            soup = BeautifulSoup(response.text, "html.parser")

	            # Strip so a page whose paragraphs are all empty or whitespace
	            # is reported as unreadable instead of passing " " downstream.
	            content = " ".join(p.text for p in soup.find_all("p")).strip()
	            return content if content else "Error: No readable content found on the page."
	        except requests.exceptions.Timeout:
	            return "Error: Request timed out."
	        except requests.exceptions.HTTPError as e:
	            return f"Error: HTTP {e.response.status_code} - Page may not exist."
	        except requests.exceptions.RequestException as e:
	            return f"Error: Unable to fetch URL ({str(e)})."

	    def get_domain_trust(self, url: str, content: str) -> int:
	        """ Return a mock domain trust rating (1-5), or 0 when the fetch failed. """
	        if self._is_error(content):
	            return 0  # If page fetch failed, trust is low
	        return len(url) % 5 + 1  # Mock trust rating (1-5), derived from URL length

	    def compute_similarity_score(self, user_query: str, content: str) -> int:
	        """ Return the query/content cosine similarity as an integer in 0-100 (0 on fetch failure). """
	        if self._is_error(content):
	            return 0
	        similarity = util.pytorch_cos_sim(
	            self.similarity_model.encode(user_query),
	            self.similarity_model.encode(content)
	        ).item()
	        # Cosine similarity can be negative; clamp so the result stays on
	        # the same 0-100 scale as the other signals.
	        return max(0, min(100, int(similarity * 100)))

	    def check_facts(self, content: str) -> int:
	        """ Return a simulated fact-check rating (1-5), or 0 when the fetch failed. """
	        if self._is_error(content):
	            return 0
	        return len(content) % 5 + 1  # Mock fact-check rating (1-5), derived from content length

	    def detect_bias(self, content: str) -> int:
	        """
	        Score potential bias from the sentiment of the first 512 characters.

	        Returns 100 for a positive label, 50 for neutral, 30 for anything
	        else, and 0 when the fetch failed.
	        """
	        if self._is_error(content):
	            return 0
	        sentiment_result = self.sentiment_analyzer(content[:512])[0]
	        label = str(sentiment_result["label"]).upper()
	        return self._SENTIMENT_SCORES.get(label, self._DEFAULT_SENTIMENT_SCORE)

	    def get_star_rating(self, score: float) -> tuple:
	        """ Convert a 0-100 score into `(stars, icons)` with stars clamped to 1-5. """
	        stars = max(1, min(5, round(score / 20)))  # Normalize 100-scale to 5-star scale
	        return stars, "⭐" * stars

	    def generate_explanation(self, domain_trust, similarity_score, fact_check_score, bias_score, final_score) -> str:
	        """
	        Build a human-readable explanation for the score.

	        Each component is expected on a 0-100 scale; one sentence is added
	        for every component below 50. `final_score` is accepted for
	        interface compatibility and is not used.
	        """
	        reasons = []
	        if domain_trust < 50:
	            reasons.append("The source has low domain authority.")
	        if similarity_score < 50:
	            reasons.append("The content is not highly relevant to your query.")
	        if fact_check_score < 50:
	            reasons.append("Limited fact-checking verification found.")
	        if bias_score < 50:
	            reasons.append("Potential bias detected in the content.")

	        return " ".join(reasons) if reasons else "This source is highly credible and relevant."

	    def rate_url_validity(self, user_query: str, url: str):
	        """
	        Evaluate `url` against `user_query`.

	        Returns `{"Validation Error": message}` when the page could not be
	        fetched. Otherwise returns a dict with `raw_score` (all components
	        and the weighted final score, each on a 0-100 scale), `stars`
	        (numeric star count and icon string) and `explanation`.
	        """
	        content = self.fetch_page_content(url)

	        # Handle errors
	        if self._is_error(content):
	            return {"Validation Error": content}

	        # The two mock ratings are 1-5; rescale them so the weights below
	        # and the < 50 thresholds in generate_explanation apply to
	        # comparable numbers.
	        domain_trust = self.get_domain_trust(url, content) * self._RATING_TO_PERCENT
	        similarity_score = self.compute_similarity_score(user_query, content)
	        fact_check_score = self.check_facts(content) * self._RATING_TO_PERCENT
	        bias_score = self.detect_bias(content)

	        final_score = (
	            (0.3 * domain_trust) +
	            (0.3 * similarity_score) +
	            (0.2 * fact_check_score) +
	            (0.2 * bias_score)
	        )

	        stars, icon = self.get_star_rating(final_score)
	        explanation = self.generate_explanation(domain_trust, similarity_score, fact_check_score, bias_score, final_score)

	        return {
	            "raw_score": {
	                "Domain Trust": domain_trust,
	                "Content Relevance": similarity_score,
	                "Fact-Check Score": fact_check_score,
	                "Bias Score": bias_score,
	                "Final Validity Score": final_score
	            },
	            "stars": {
	                "score": stars,
	                "icon": icon
	            },
	            "explanation": explanation
	        }


	# ✅ Sample Queries and URLs (10 Each)
	# The two lists are paired by position: sample_queries[i] is checked
	# against sample_urls[i].
	sample_queries = [
	    "How does climate change impact global weather?",
	    "What are the latest advancements in AI?",
	    "How does diet influence mental health?",
	    "What are the effects of space travel on astronauts?",
	    "Is cryptocurrency a safe investment?",
	    "What are the advantages of renewable energy?",
	    "How does deep learning work?",
	    "What are the health risks of 5G technology?",
	    "Is intermittent fasting effective for weight loss?",
	    "How do electric vehicles compare to gas cars?"
	]

	sample_urls = [
	    "https://www.nationalgeographic.com/environment/article/climate-change",
	    "https://www.technologyreview.com/2023/05/01/latest-ai-advancements/",
	    "https://www.health.harvard.edu/mind-and-mood/foods-linked-to-better-brainpower",
	    "https://www.nasa.gov/hrp/long-term-health-risks-of-space-travel",
	    "https://www.investopedia.com/terms/c/cryptocurrency.asp",
	    "https://www.energy.gov/eere/renewable-energy",
	    "https://www.ibm.com/cloud/deep-learning",
	    "https://www.who.int/news-room/questions-and-answers/item/radiation-5g-mobile-networks-and-health",
	    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6167940/",
	    "https://www.tesla.com/blog/benefits-of-electric-vehicles"
	]

	# ✅ Running the Validator and Saving to CSV
	validator = URLValidator()

	data_rows = []
	for query, url in zip(sample_queries, sample_urls):
	    result = validator.rate_url_validity(query, url)

	    # A failed fetch yields {"Validation Error": ...} with no "raw_score";
	    # record the row with empty ratings instead of raising KeyError.
	    if "raw_score" not in result:
	        print(f"Warning: could not rate {url}: {result.get('Validation Error', 'unknown error')}")
	        data_rows.append([query, url, None, None])
	        continue

	    # Convert 100-scale to 1-5 with the validator's own clamp, so a very
	    # low score cannot produce a rating of 0.
	    func_rating = validator.get_star_rating(result["raw_score"]["Final Validity Score"])[0]
	    custom_rating = func_rating + 1 if func_rating < 5 else func_rating  # User-adjusted rating

	    data_rows.append([query, url, func_rating, custom_rating])

	# Save to CSV
	csv_filename = "url_validation_results.csv"
	df = pd.DataFrame(data_rows, columns=["user_prompt", "url_to_check", "func_rating", "custom_rating"])
	# Nullable integers keep ratings written as "3" rather than "3.0" when some
	# rows have no rating; missing ratings are written as empty cells.
	rating_columns = ["func_rating", "custom_rating"]
	df[rating_columns] = df[rating_columns].astype("Int64")
	df.to_csv(csv_filename, index=False)

	print(f"✅ CSV file '{csv_filename}' has been created successfully!")