Spaces:
Sleeping
Sleeping
| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| import warnings | |
| from typing import Optional | |
| import numpy as np | |
| from pandas import DataFrame | |
| from sentence_transformers import util | |
| from src.application.config import ( | |
| DEVICE, | |
| MAX_CHAR_SIZE, | |
| PARAPHRASE_MODEL, | |
| PARAPHRASE_THRESHOLD, | |
| PARAPHRASE_THRESHOLD_HUMAN, | |
| PARAPHRASE_THRESHOLD_MACHINE, | |
| TOP_URLS_PER_SEARCH, | |
| ) | |
| from src.application.text.helper import split_into_sentences | |
| from src.application.text.search import ( | |
| generate_search_phrases, | |
| search_by_google, | |
| ) | |
| from src.application.url_reader import URLReader | |
| warnings.simplefilter(action="ignore", category=FutureWarning) | |
def find_sentence_source(
    text: list,
    text_index: int,
    sentences_df: DataFrame,
) -> tuple[DataFrame, list]:
    """
    Finds the source URL for a given sentence by searching Google
    and checking for paraphrases.

    Args:
        text (list): A list of sentences.
        text_index (int): The index of the sentence to find the source for.
        sentences_df (pd.DataFrame): A DF to store sentence information.

    Returns:
        tuple: A tuple of the updated sentences_df and a list of image URLs.
            If a source is found, the DF is updated with source information.
            If no source is found, the DF is updated with the original input.
    """
    checked_urls = (
        set()
    )  # Keep track of visited URLs to avoid redundant checks
    # Build several candidate search phrases from the target sentence.
    searched_phrases = generate_search_phrases(text[text_index])

    for candidate in searched_phrases:
        # Search Google for the generated phrase
        search_results = search_by_google(candidate)

        # Extract URLs from search results
        urls = [item["link"] for item in search_results.get("items", [])]

        # Check the top URLs from the search results
        # (TOP_URLS_PER_SEARCH, configured in src.application.config)
        for url in urls[:TOP_URLS_PER_SEARCH]:
            if url in checked_urls:  # Skip already checked URLs
                continue
            if "bbc.com" not in url:  # TODO: remove when releasing
                continue

            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            # Download and extract the page content.
            content = URLReader(url)
            if content.is_extracted is True:
                if content.title is None or content.text is None:
                    print("\t\t\tβββ Title or text not found")
                    continue

                source_text = content.title + "\n" + content.text
                # Skip overly long pages to bound paraphrase-check cost.
                if len(source_text) > MAX_CHAR_SIZE:
                    print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
                    continue

                print(f"\t\t\tβββ Title: {content.title}")
                # Align the target sentence against this page.
                aligned_sentence = check_paraphrase(
                    text[text_index],
                    source_text,
                    url,
                )

                # Non-paraphrase result: record the input and stop early
                # with no images.
                if aligned_sentence["paraphrase"] is False:
                    sentences_df.loc[text_index, "input"] = aligned_sentence[
                        "input"
                    ]
                    sentences_df.loc[text_index, "paraphrase"] = (
                        aligned_sentence["paraphrase"]
                    )
                    return sentences_df, []

                # Above the threshold: also store the matched source text,
                # similarity and URL; otherwise only label-level info.
                if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
                    columns = [
                        "input",
                        "source",
                        "label",
                        "similarity",
                        "paraphrase",
                        "url",
                    ]
                else:
                    columns = [
                        "input",
                        "label",
                        "paraphrase",
                    ]
                for c in columns:
                    if c in sentences_df.columns:
                        sentences_df.loc[text_index, c] = aligned_sentence[c]

                # Check other sentences for better matches in the same source
                for idx, _ in sentences_df.iterrows():
                    similarity = sentences_df.loc[idx, "similarity"]
                    if similarity is not None:
                        # Already a confident machine-level match; skip.
                        if similarity > PARAPHRASE_THRESHOLD_MACHINE:
                            continue
                    aligned_sentence = check_paraphrase(
                        text[idx],
                        source_text,
                        url,
                    )
                    # Only overwrite when this source matches better than
                    # whatever was stored before (or nothing was stored).
                    if (
                        similarity is None
                        or aligned_sentence["similarity"] > similarity
                    ):
                        if (
                            aligned_sentence["similarity"]
                            > PARAPHRASE_THRESHOLD
                        ):
                            columns = [
                                "input",
                                "source",
                                "label",
                                "similarity",
                                "url",
                            ]
                        else:
                            columns = [
                                "input",
                                "label",
                            ]
                        for c in columns:
                            if c in sentences_df.columns:
                                sentences_df.loc[idx, c] = aligned_sentence[c]
                # Found a usable source: return its images alongside the DF.
                return sentences_df, content.images

    # If no source is found, update the DF with the original input
    sentences_df.loc[text_index, "input"] = text[text_index]
    return sentences_df, []
def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
    """
    Checks if the input text is a paraphrase of the source text
    by comparing sentence-level similarities.

    Args:
        input_text (str): The text to be checked for paraphrasing.
        source_text (str): The source text to compare against.
        url (str): The URL of the source text (for storing in the result).

    Returns:
        dict: A dictionary containing the alignment information, including:
            - "input": Concatenated input sentences.
            - "source": Concatenated best-matched source sentences.
            - "similarity": Average cosine similarity score.
            - "label": Label determined based on similarity.
            - "paraphrase": Boolean indicating if it's a paraphrase.
            - "url": The source URL.
        Returns an empty dict when either text yields no sentences.
    """
    # Extract sentences from input text and web page
    input_sentences = split_into_sentences(input_text)

    if not source_text:
        return {}

    source_sentences = split_into_sentences(source_text)
    if not input_sentences or not source_sentences:
        return {}

    # Handle external references in source sentences
    # This is specified for bbc news articles
    additional_sentences = []
    for sentence in source_sentences:
        if ", external" in sentence:
            additional_sentences.append(sentence.replace(", external", ""))
    source_sentences.extend(additional_sentences)

    # Encode sentences into embeddings using the PARAPHRASE_MODEL
    embeddings1 = PARAPHRASE_MODEL.encode(
        input_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        source_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute cosine similarity matrix
    similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()

    # Find sentence alignments: for each input sentence, pick the single
    # best-matching source sentence.
    inputs = ""
    sources = ""
    similarities = []
    for i, sentence in enumerate(input_sentences):
        max_sim_index = np.argmax(similarity_matrix[i])
        max_similarity = similarity_matrix[i][max_sim_index]
        best_matched_sentence = source_sentences[max_sim_index]

        inputs += sentence + " "
        sources += best_matched_sentence + " "
        similarities.append(max_similarity)

    # Calculate average similarity and determine paraphrase label.
    similarity = sum(similarities) / len(similarities)
    # BUG FIX: the label was previously computed from `max_similarity`,
    # i.e. the best match of only the LAST input sentence (a loop-leftover
    # variable). Use the average similarity so the label is consistent
    # with the reported "similarity" field and the docstring.
    label, is_paraphrased = determine_label(similarity)

    # Create the alignment dictionary
    alignment = {
        "input": inputs,
        "source": sources,
        "similarity": similarity,
        "label": label,
        "paraphrase": is_paraphrased,
        "url": url,
    }
    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')

    return alignment
def determine_label(similarity: float) -> tuple[Optional[str], bool]:
    """
    Determines a label and paraphrase status based on the similarity score.

    Args:
        similarity (float): The similarity score between two texts.

    Returns:
        tuple: A tuple containing the label (str or None)
            and a boolean indicating if it's a paraphrase.
    """
    # Thresholds checked in order: human-level match first, then machine.
    cutoffs = (
        (PARAPHRASE_THRESHOLD_HUMAN, "HUMAN"),  # Human paraphrase
        (PARAPHRASE_THRESHOLD_MACHINE, "MACHINE"),  # Machine paraphrase
    )
    for threshold, label in cutoffs:
        if similarity >= threshold:
            return label, True
    # Below both thresholds: not a paraphrase.
    return None, False
if __name__ == "__main__":
    # No standalone CLI behavior; this module is used as a library.
    pass