Spaces:
Sleeping
Sleeping
| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| import warnings | |
| from typing import Optional | |
| import numpy as np | |
| from pandas import DataFrame | |
| from sentence_transformers import util | |
| from src.application.config import ( | |
| DEVICE, | |
| MAX_CHAR_SIZE, | |
| PARAPHRASE_MODEL, | |
| PARAPHRASE_THRESHOLD, | |
| PARAPHRASE_THRESHOLD_HUMAN, | |
| PARAPHRASE_THRESHOLD_MACHINE, | |
| TOP_URLS_PER_SEARCH, | |
| ) | |
| from src.application.text.helper import split_into_sentences | |
| from src.application.text.search import ( | |
| generate_search_phrases, | |
| search_by_google, | |
| ) | |
| from src.application.url_reader import URLReader | |
| warnings.simplefilter(action="ignore", category=FutureWarning) | |
def find_sentence_source(
    text: list,
    text_index: int,
    sentences_df: DataFrame,
) -> tuple[DataFrame, list]:
    """
    Finds the source URL for a given sentence by searching Google
    and checking for paraphrases.

    Args:
        text (list): A list of sentences.
        text_index (int): The index of the sentence to find the source for.
        sentences_df (pd.DataFrame): A DF to store sentence information.

    Returns:
        tuple: A tuple of the updated sentences_df and a list of image URLs.
            If a source is found, the DF is updated with source information.
            If no source is found, the DF is updated with the original input.
    """
    checked_urls = (
        set()
    )  # Keep track of visited URLs to avoid redundant checks
    # Build several candidate search phrases from the target sentence.
    searched_phrases = generate_search_phrases(text[text_index])

    for candidate in searched_phrases:
        # Search Google for the generated phrase
        search_results = search_by_google(candidate)

        # Extract URLs from search results
        urls = [item["link"] for item in search_results.get("items", [])]

        # Check the top URLs from the search results
        # (TOP_URLS_PER_SEARCH, configured in src.application.config)
        for url in urls[:TOP_URLS_PER_SEARCH]:
            if url in checked_urls:  # Skip already checked URLs
                continue
            if "bbc.com" not in url:  # TODO: remove when releasing
                continue

            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            # Download and extract the page content.
            content = URLReader(url)
            if content.is_extracted is True:
                if content.title is None or content.text is None:
                    print("\t\t\tβββ Title or text not found")
                    continue

                source_text = content.title + "\n" + content.text
                # Skip overly long pages to bound paraphrase-check cost.
                if len(source_text) > MAX_CHAR_SIZE:
                    print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
                    continue

                print(f"\t\t\tβββ Title: {content.title}")
                # Align the target sentence against this page.
                aligned_sentence = check_paraphrase(
                    text[text_index],
                    source_text,
                    url,
                )

                # Non-paraphrase result: record the input and stop early
                # with no images.
                if aligned_sentence["paraphrase"] is False:
                    sentences_df.loc[text_index, "input"] = aligned_sentence[
                        "input"
                    ]
                    sentences_df.loc[text_index, "paraphrase"] = (
                        aligned_sentence["paraphrase"]
                    )
                    return sentences_df, []

                # Above the threshold: also store the matched source text,
                # similarity and URL; otherwise only label-level info.
                if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
                    columns = [
                        "input",
                        "source",
                        "label",
                        "similarity",
                        "paraphrase",
                        "url",
                    ]
                else:
                    columns = [
                        "input",
                        "label",
                        "paraphrase",
                    ]
                for c in columns:
                    if c in sentences_df.columns:
                        sentences_df.loc[text_index, c] = aligned_sentence[c]

                # Check other sentences for better matches in the same source
                for idx, _ in sentences_df.iterrows():
                    similarity = sentences_df.loc[idx, "similarity"]
                    if similarity is not None:
                        # Already a confident machine-level match; skip.
                        if similarity > PARAPHRASE_THRESHOLD_MACHINE:
                            continue
                    aligned_sentence = check_paraphrase(
                        text[idx],
                        source_text,
                        url,
                    )
                    # Only overwrite when this source matches better than
                    # whatever was stored before (or nothing was stored).
                    if (
                        similarity is None
                        or aligned_sentence["similarity"] > similarity
                    ):
                        if (
                            aligned_sentence["similarity"]
                            > PARAPHRASE_THRESHOLD
                        ):
                            columns = [
                                "input",
                                "source",
                                "label",
                                "similarity",
                                "url",
                            ]
                        else:
                            columns = [
                                "input",
                                "label",
                            ]
                        for c in columns:
                            if c in sentences_df.columns:
                                sentences_df.loc[idx, c] = aligned_sentence[c]
                # Found a usable source: return its images alongside the DF.
                return sentences_df, content.images

    # If no source is found, update the DF with the original input
    sentences_df.loc[text_index, "input"] = text[text_index]
    return sentences_df, []
def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
    """
    Checks if the input text is a paraphrase of the source text
    by comparing sentence-level similarities.

    Args:
        input_text (str): The text to be checked for paraphrasing.
        source_text (str): The source text to compare against.
        url (str): The URL of the source text (for storing in the result).

    Returns:
        dict: A dictionary containing the alignment information, including:
            - "input": Concatenated input sentences.
            - "source": Concatenated best-matched source sentences.
            - "similarity": Average cosine similarity score.
            - "label": Label determined based on similarity.
            - "paraphrase": Boolean indicating if it's a paraphrase.
            - "url": The source URL.
        Returns an empty dict when either text yields no sentences.
    """
    # Extract sentences from input text and web page
    input_sentences = split_into_sentences(input_text)

    if not source_text:
        return {}

    source_sentences = split_into_sentences(source_text)
    if not input_sentences or not source_sentences:
        return {}

    # Handle external references in source sentences
    # This is specified for bbc news articles
    additional_sentences = []
    for sentence in source_sentences:
        if ", external" in sentence:
            additional_sentences.append(sentence.replace(", external", ""))
    source_sentences.extend(additional_sentences)

    # Encode sentences into embeddings using the PARAPHRASE_MODEL
    embeddings1 = PARAPHRASE_MODEL.encode(
        input_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        source_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute cosine similarity matrix
    similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()

    # Find sentence alignments: for each input sentence, pick the single
    # best-matching source sentence.
    inputs = ""
    sources = ""
    similarities = []
    for i, sentence in enumerate(input_sentences):
        max_sim_index = np.argmax(similarity_matrix[i])
        max_similarity = similarity_matrix[i][max_sim_index]
        best_matched_sentence = source_sentences[max_sim_index]

        inputs += sentence + " "
        sources += best_matched_sentence + " "
        similarities.append(max_similarity)

    # Calculate average similarity and determine paraphrase label.
    similarity = sum(similarities) / len(similarities)
    # BUG FIX: the label was previously computed from `max_similarity`,
    # i.e. the best match of only the LAST input sentence (a loop-leftover
    # variable). Use the average similarity so the label is consistent
    # with the reported "similarity" field and the docstring.
    label, is_paraphrased = determine_label(similarity)

    # Create the alignment dictionary
    alignment = {
        "input": inputs,
        "source": sources,
        "similarity": similarity,
        "label": label,
        "paraphrase": is_paraphrased,
        "url": url,
    }
    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')

    return alignment
def determine_label(similarity: float) -> tuple[Optional[str], bool]:
    """
    Determines a label and paraphrase status based on the similarity score.

    Args:
        similarity (float): The similarity score between two texts.

    Returns:
        tuple: A tuple containing the label (str or None)
            and a boolean indicating if it's a paraphrase.
    """
    # Thresholds checked in order: human-level match first, then machine.
    cutoffs = (
        (PARAPHRASE_THRESHOLD_HUMAN, "HUMAN"),  # Human paraphrase
        (PARAPHRASE_THRESHOLD_MACHINE, "MACHINE"),  # Machine paraphrase
    )
    for threshold, label in cutoffs:
        if similarity >= threshold:
            return label, True
    # Below both thresholds: not a paraphrase.
    return None, False
if __name__ == "__main__":
    # No standalone CLI behavior; this module is used as a library.
    pass