| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| import pandas as pd | |
| from src.application.config import ( | |
| MIN_RATIO_PARAPHRASE_NUM, | |
| PARAPHRASE_THRESHOLD, | |
| PARAPHRASE_THRESHOLD_MACHINE, | |
| ) | |
| from src.application.formatting_fact_checker import create_fact_checker_table | |
| from src.application.formatting_governor import create_governor_table | |
| from src.application.formatting_ordinary_user import create_ordinary_user_table | |
| from src.application.image.image import ImageDetector | |
| from src.application.image.image_detection import ( | |
| detect_image_by_ai_model, | |
| detect_image_by_reverse_search, | |
| detect_image_from_news_image, | |
| ) | |
| from src.application.text.entity import highlight_entities | |
| from src.application.text.helper import ( | |
| postprocess_label, | |
| split_into_paragraphs, | |
| ) | |
| from src.application.text.model_detection import ( | |
| detect_text_by_ai_model, | |
| predict_generation_model, | |
| ) | |
| from src.application.text.search_detection import find_sentence_source | |
| from src.application.text.text import TextDetector | |


class NewsVerification:
    def __init__(self):
        """
        Initializes the NewsVerification object.
        """
        self.news_text: str = ""
        self.news_title: str = ""
        self.news_content: str = ""
        self.news_image: str = ""

        self.text = TextDetector()
        self.image = ImageDetector()

        self.news_prediction_label: str = ""
        self.news_prediction_score: float = -1

        # URLs collected from matched news sources,
        # used later to match the news image.
        self.found_img_url: list[str] = []

        # Analyzed results: input sentences aligned with their sources.
        # The "entities" column is included up front so the schema matches
        # what find_text_source and handle_entities write.
        self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
            columns=[
                "input",
                "source",
                "label",
                "similarity",
                "paraphrase",
                "url",
                "entities",
            ],
        )

    def load_news(self, news_title: str, news_content: str, news_image: str):
        """
        Loads news data into the object's attributes.

        Args:
            news_title (str): The title of the news article.
            news_content (str): The content of the news article.
            news_image (str): The URL of the image in the news article.
        """
        if not isinstance(news_title, str) or not isinstance(
            news_content,
            str,
        ):
            raise TypeError("News title and content must be strings.")
        if news_image is not None and not isinstance(news_image, str):
            warnings.warn("News image must be a string.")

        # Combine title and content for a full text representation.
        self.news_text = news_title + "\n\n" + news_content

        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

        self.text.input = self.news_text
        self.image.input = news_image

    def group_by_url(self):
        """
        Groups aligned sentences by URL,
        then concatenates the 'input' and 'source' text for each group.
        """

        def concat_text(series):
            """
            Concatenates the elements of a pd.Series into a single string.
            """
            return " ".join(
                series.astype(str).tolist(),
            )  # Handle mixed data types and NaNs

        # Group sentences by URL and concatenate 'input' and 'source' text.
        self.text.grouped_url_df = (
            self.aligned_sentences_df.groupby("url")
            .agg(
                {
                    "input": concat_text,
                    "source": concat_text,
                },
            )
            .reset_index()
        )  # Reset index to make 'url' a regular column

        # Add new columns for label and score.
        self.text.grouped_url_df["label"] = None
        self.text.grouped_url_df["score"] = None

        print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")

    def determine_text_origin_by_url(self):
        """
        Determines the text origin for each URL group.
        """
        for index, row in self.text.grouped_url_df.iterrows():
            # Verify text origin using URL-based verification.
            label, score = self.verify_text(row["url"])

            # If URL-based verification returns 'UNKNOWN', use AI detection.
            if label == "UNKNOWN":
                # "input" already holds the concatenated text for this
                # URL group (see group_by_url), so pass it through as-is.
                text = row["input"]

                # Detect text origin using an AI model.
                label, score = detect_text_by_ai_model(text)

            print(f"label = {label}")
            self.text.grouped_url_df.at[index, "label"] = label
            self.text.grouped_url_df.at[index, "score"] = score

    def determine_text_origin(self):
        """
        Determines the origin of the input text by analyzing
        its sources and applying AI detection models.

        This method groups sentences by their source URLs,
        applies verification and AI detection, and then determines
        an overall label and score for the input text.
        """
        # Find the source URLs associated with the input text.
        self.find_text_source()

        # Group sentences by URL and concatenate 'input' and 'source' text.
        self.group_by_url()

        # Determine the text origin for each URL group.
        self.determine_text_origin_by_url()

        # Determine the overall label and score for the entire input text.
        if not self.text.grouped_url_df.empty:
            # Check for 'gpt-4o' labels in the grouped URLs.
            machine_label = self.text.grouped_url_df[
                self.text.grouped_url_df["label"].str.contains(
                    "gpt-4o",
                    case=False,
                    na=False,
                )
            ]
            print(f" machine_label = {machine_label}")

            if not machine_label.empty:
                # If 'gpt-4o' labels are found, post-process and assign.
                labels = machine_label["label"].tolist()
                label = postprocess_label(labels)
                self.text.prediction_label[0] = label
                self.text.prediction_score[0] = machine_label["score"].mean()
            else:
                # If no 'gpt-4o' labels are found, label the text as
                # 'HUMAN' and average the scores across all URL groups.
                self.text.prediction_label[0] = "HUMAN"
                self.text.prediction_score[0] = self.text.grouped_url_df[
                    "score"
                ].mean()
        else:
            # If no source URLs were found, run AI detection on the
            # entire input text.
            print("No source found in the input text")
            text = " ".join(self.aligned_sentences_df["input"].tolist())

            # Detect text origin using an AI model.
            label, score = detect_text_by_ai_model(text)
            self.text.prediction_label[0] = label
            self.text.prediction_score[0] = score
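
    # Worked example of the aggregation above, with hypothetical labels:
    # if two URL groups are labeled "Partially generated by gpt-4o" and a
    # third is "HUMAN", the 'gpt-4o' filter keeps the two machine groups,
    # postprocess_label merges their labels into one overall label, and
    # the overall score is the mean of those two groups' scores.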

    def find_text_source(self):
        """
        Determines the origin of the given text based on paraphrasing
        detection and human authorship analysis.

        1. Splits the input news text into paragraphs,
        2. Searches for a source for each paragraph,
        3. Updates aligned_sentences_df with the found sources.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        input_paragraphs = split_into_paragraphs(self.news_text)

        # Initialize an empty DataFrame if it doesn't exist,
        # otherwise extend it.
        if (
            not hasattr(self, "aligned_sentences_df")
            or self.aligned_sentences_df is None
        ):
            self.aligned_sentences_df = pd.DataFrame(
                columns=[
                    "input",
                    "source",
                    "label",
                    "similarity",
                    "paraphrase",
                    "url",
                    "entities",
                ],
            )

        # Append one empty row per input paragraph in a single concat.
        empty_rows = pd.DataFrame(
            [
                {
                    "input": None,
                    "source": None,
                    "label": None,
                    "similarity": None,
                    "paraphrase": None,
                    "url": None,
                    "entities": None,
                },
            ]
            * len(input_paragraphs),
        )
        self.aligned_sentences_df = pd.concat(
            [self.aligned_sentences_df, empty_rows],
            ignore_index=True,
        )

        # Find a source for each paragraph.
        for index, _ in enumerate(input_paragraphs):
            # Skip paragraphs already matched with high similarity
            # by an earlier find_sentence_source call.
            similarity = self.aligned_sentences_df.loc[index, "similarity"]
            if (
                similarity is not None
                and similarity > PARAPHRASE_THRESHOLD_MACHINE
            ):
                continue

            print(f"\n-------index = {index}-------")
            print(f"current_text = {input_paragraphs[index]}\n")

            self.aligned_sentences_df, img_urls = find_sentence_source(
                input_paragraphs,
                index,
                self.aligned_sentences_df,
            )

            # Initialize found_img_url if it does not exist.
            if not hasattr(self, "found_img_url"):
                self.found_img_url = []
            self.found_img_url.extend(img_urls)

    def verify_text(self, url):
        """
        Verifies the text origin based on similarity scores and labels
        associated with a given URL.

        1. Filters sentences by URL and similarity score,
        2. Determines if the text is likely generated by a machine or
           a human,
        3. Calculates an average similarity score.

        Args:
            url (str): The URL to filter sentences by.

        Returns:
            tuple: A tuple containing:
                - Label: "Partially generated by <model>", "HUMAN",
                  or "UNKNOWN".
                - Score: The average similarity score.
        """
        label = "UNKNOWN"
        score = 0

        # Calculate the average similarity over the rows of
        # aligned_sentences_df whose similarity score exceeds
        # PARAPHRASE_THRESHOLD.

        # Filter sentences by URL.
        filtered_by_url = self.aligned_sentences_df[
            self.aligned_sentences_df["url"] == url
        ]

        # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
        filtered_by_similarity = filtered_by_url[
            filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
        ]

        # Check whether the ratio of high-similarity sentences exceeds
        # MIN_RATIO_PARAPHRASE_NUM.
        if (
            len(filtered_by_similarity) / len(filtered_by_url)
            > MIN_RATIO_PARAPHRASE_NUM
        ):
            # Check if "MACHINE" appears among the filtered labels.
            contains_machine = (
                filtered_by_similarity["label"]
                .str.contains(
                    "MACHINE",
                    case=False,
                    na=False,
                )
                .any()
            )
            print(f"contains_machine = \n{contains_machine}")

            # TODO: integrate with determine_text_origin
            if contains_machine:
                # If a "MACHINE" label is present, predict the generation
                # model and average the machine rows' similarity scores.
                machine_rows = filtered_by_similarity[
                    filtered_by_similarity["label"].str.contains(
                        "MACHINE",
                        case=False,
                        na=False,
                    )
                ]
                generated_model, _ = predict_generation_model(self.news_text)
                label = f"Partially generated by {generated_model}"
                score = machine_rows["similarity"].mean()
            else:
                # If no "MACHINE" label is present, assign "HUMAN" and
                # average the human rows' similarity scores.
                label = "HUMAN"
                human_rows = filtered_by_similarity[
                    filtered_by_similarity["label"].str.contains(
                        "HUMAN",
                        case=False,
                        na=False,
                    )
                ]
                score = human_rows["similarity"].mean()

        return label, score
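
    # Worked example with hypothetical numbers: suppose a URL matches 4
    # aligned sentences and 3 of them have similarity > PARAPHRASE_THRESHOLD.
    # The ratio 3/4 = 0.75 is compared against MIN_RATIO_PARAPHRASE_NUM; if
    # it passes and none of the 3 rows carries a "MACHINE" label, verify_text
    # returns ("HUMAN", mean similarity of those 3 rows).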

    def determine_image_origin(self):
        """
        Determines the origin of the news image using 3 detection methods:

        1. Matching against previously found image URLs.
        2. Reverse image search.
        3. AI-based image detection.

        If none of these methods succeed, the image origin is "UNKNOWN".
        """
        print("CHECK IMAGE:")

        # Handle the case where no image is provided.
        if self.news_image is None:
            self.image.prediction_label = "UNKNOWN"
            self.image.prediction_score = 0.0
            self.image.referent_url = None
            return

        # Attempt to match the image against previously found image URLs.
        print("\tFrom found image URLs...")
        matched_url, similarity = detect_image_from_news_image(
            self.news_image,
            self.found_img_url,
        )
        if matched_url is not None:
            print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
            self.image.prediction_label = "HUMAN"
            self.image.prediction_score = similarity
            self.image.referent_url = matched_url
            return

        # Attempt to find the image origin using reverse image search.
        print("\tFrom reverse image search...")
        matched_url, similarity = detect_image_by_reverse_search(
            self.news_image,
        )
        if matched_url is not None:
            print(f"matched image: {matched_url}\tScore: {similarity}%\n")
            self.image.prediction_label = "HUMAN"
            self.image.prediction_score = similarity
            self.image.referent_url = matched_url
            return

        # Attempt to detect the image origin using an AI model.
        print("\tFrom an AI model...")
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            print(f"detected_label: {detected_label} ({score})")
            self.image.prediction_label = detected_label
            self.image.prediction_score = score
            self.image.referent_url = None
            return

        # If all detection methods fail, mark the image origin as "UNKNOWN".
        self.image.prediction_label = "UNKNOWN"
        self.image.prediction_score = 50
        self.image.referent_url = None

    def determine_origin(self):
        """
        Determines origins by analyzing the news text and image.
        """
        if self.news_text != "":
            self.determine_text_origin()
        if self.news_image != "":
            self.determine_image_origin()

        # Handle entity recognition and processing.
        self.handle_entities()

    def generate_report(self) -> tuple[str, str, str]:
        """
        Generates reports tailored for different user roles
        (ordinary users, fact checkers, governors).

        Returns:
            tuple: A tuple containing three HTML-formatted reports:
                - ordinary_user_table: Report for ordinary users.
                - fact_checker_table: Report for fact checkers.
                - governor_table: Report for governors.
        """
        ordinary_user_table = create_ordinary_user_table(
            self.aligned_sentences_df,
            self.text,
            self.image,
        )
        fact_checker_table = create_fact_checker_table(
            self.aligned_sentences_df,
            self.text,
            self.image,
        )
        governor_table = create_governor_table(
            self.aligned_sentences_df,
            self.text,
            self.image,
        )
        return ordinary_user_table, fact_checker_table, governor_table

    def handle_entities(self):
        """
        Highlights and assigns entities with colors to aligned sentences
        based on grouped URLs.

        For each grouped URL:
        1. Highlights entities in the input and source text,
        2. Then assigns these highlighted entities to the corresponding
           sentences in the aligned sentences DataFrame.
        """
        for _, row in self.text.grouped_url_df.iterrows():
            # Get entity-word pairs with their assigned colors.
            entities_with_colors = highlight_entities(
                row["input"],
                row["source"],
            )

            # Assign the highlighted entities to the corresponding
            # sentences in aligned_sentences_df; a distinct loop variable
            # avoids shadowing the outer iterator.
            for idx, sentence in self.aligned_sentences_df.iterrows():
                if sentence["url"] == row["url"]:
                    # Use .at to modify the DataFrame efficiently.
                    self.aligned_sentences_df.at[idx, "entities"] = (
                        entities_with_colors
                    )

    def get_text_urls(self) -> set:
        """
        Returns a set of unique URLs referenced in the text analysis.

        Returns:
            set: A set containing the unique URLs referenced in the text.
        """
        # Derive the unique source URLs from the grouped analysis results.
        return set(self.text.grouped_url_df["url"])
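

# Minimal usage sketch, run only when this module is executed directly.
# The title, content, and image value below are illustrative placeholders,
# and this assumes the search and detection backends in src.application
# are configured; it is not part of the class's tested behavior.
if __name__ == "__main__":
    verifier = NewsVerification()
    verifier.load_news(
        news_title="Example headline",
        news_content="Example body text of the article.",
        news_image="",  # Empty string: image analysis is skipped.
    )
    verifier.determine_origin()

    # Generate the three role-specific HTML reports.
    ordinary_user_table, fact_checker_table, governor_table = (
        verifier.generate_report()
    )
    print(ordinary_user_table)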