""" | |
Author: Khanh Phan | |
Date: 2024-12-04 | |
""" | |
import pandas as pd | |
from src.application.config import MIN_RATIO_PARAPHRASE_NUM, PARAPHRASE_THRESHOLD, PARAPHRASE_THRESHOLD_MACHINE | |
from src.application.formatting import color_text, format_entity_count | |
from src.application.image.image_detection import ( | |
detect_image_by_ai_model, | |
detect_image_by_reverse_search, | |
detect_image_from_news_image, | |
) | |
from src.application.text.entity import ( | |
apply_highlight, | |
highlight_entities, | |
) | |
from src.application.text.helper import ( | |
extract_equal_text, | |
postprocess_label, | |
split_into_paragraphs, | |
) | |
from src.application.text.model_detection import ( | |
detect_text_by_ai_model, | |
predict_generation_model, | |
) | |
from src.application.text.search_detection import find_sentence_source | |


class NewsVerification:
    def __init__(self):
        """
        Initializes the NewsVerification object.
        """
        self.news_text: str = ""
        self.news_title: str = ""
        self.news_content: str = ""
        self.news_image: str = ""

        self.text_prediction_label: list[str] = ["UNKNOWN"]
        self.text_prediction_score: list[float] = [0.0]

        self.image_prediction_label: str = "UNKNOWN"
        self.image_prediction_score: float = 0.0
        self.image_referent_url: str | None = None

        self.news_prediction_label: str = ""
        self.news_prediction_score: float = -1.0

        # News' URLs used to search for the source image.
        self.found_img_url: list[str] = []

        # Analyzed results
        self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
            columns=[
                "input",
                "source",
                "label",
                "similarity",
                "paraphrase",
                "url",
                "entities",
            ],
        )
        self.grouped_url_df: pd.DataFrame = pd.DataFrame()

        # For formatting output tables
        self.ordinary_user_table: list = []
        self.fact_checker_table: list = []
        self.governor_table: list = []
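
    # Illustrative aligned_sentences_df row once analysis has run
    # (values below are hypothetical, not produced by the pipeline):
    #   input: "NASA launched a new probe."
    #   source: "NASA has launched a new space probe."
    #   label: "HUMAN", similarity: 0.91, paraphrase: True,
    #   url: "https://example.com/article", entities: [["NASA", "#eb8f8f"]]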

    def load_news(self, news_title: str, news_content: str, news_image: str):
        """
        Loads news data into the object's attributes.

        Args:
            news_title (str): The title of the news article.
            news_content (str): The content of the news article.
            news_image (str): The URL of the image in the news article.
        """
        if not isinstance(news_title, str) or not isinstance(
            news_content,
            str,
        ):
            raise TypeError("News title and content must be strings.")
        if news_image is not None and not isinstance(news_image, str):
            warnings.warn("News image must be a string.")

        # Combine title and content for a full text representation.
        # .strip() removes leading/trailing whitespace for cleaner text.
        self.news_text = (news_title + "\n\n" + news_content).strip()

        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def group_by_url(self):
        """
        Groups aligned sentences by URL,
        then concatenates the 'input' and 'source' text for each group.
        """

        def concat_text(series):
            """
            Concatenates the elements of a pd.Series into a single string.
            """
            # astype(str) handles mixed data types and NaNs.
            return " ".join(series.astype(str).tolist())

        # Group sentences by URL and concatenate 'input' and 'source' text.
        self.grouped_url_df = (
            self.aligned_sentences_df.groupby("url")
            .agg(
                {
                    "input": concat_text,
                    "source": concat_text,
                },
            )
            .reset_index()  # Make 'url' a regular column again.
        )

        # Add new columns for label and score
        self.grouped_url_df["label"] = None
        self.grouped_url_df["score"] = None

        print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")

    def determine_text_origin_by_url(self):
        """
        Determines the text origin for each URL group.
        """
        for index, row in self.grouped_url_df.iterrows():
            # Verify text origin using URL-based verification.
            label, score = self.verify_text(row["url"])

            # If URL-based verification returns 'UNKNOWN', use AI detection.
            if label == "UNKNOWN":
                # 'input' already holds the concatenated sentence text
                # for this URL group (see group_by_url).
                text = row["input"]

                # Detect text origin using an AI model.
                label, score = detect_text_by_ai_model(text)

            self.grouped_url_df.at[index, "label"] = label
            self.grouped_url_df.at[index, "score"] = score

    def determine_text_origin(self):
        """
        Determines the origin of the input text by analyzing
        its sources and applying AI detection models.

        This method groups sentences by their source URLs,
        applies verification and AI detection, and then determines
        an overall label and score for the input text.
        """
        # Find the source URLs associated with the input text.
        self.find_text_source()

        # Group sentences by URL and concatenate 'input' and 'source' text.
        self.group_by_url()

        # Determine the text origin for each URL group.
        self.determine_text_origin_by_url()

        # Determine the overall label and score for the entire input text.
        if not self.grouped_url_df.empty:
            # Check for 'gpt-4o' labels in the grouped URLs.
            machine_rows = self.grouped_url_df[
                self.grouped_url_df["label"].str.contains(
                    "gpt-4o",
                    case=False,
                    na=False,
                )
            ]
            if not machine_rows.empty:
                # If 'gpt-4o' labels are found, post-process and assign.
                labels = machine_rows["label"].tolist()
                label = postprocess_label(labels)
                self.text_prediction_label[0] = label
                self.text_prediction_score[0] = machine_rows["score"].mean()
            else:
                # Otherwise, average the per-sentence similarity
                # over 'HUMAN'-labeled sentences.
                human_rows = self.aligned_sentences_df[
                    self.aligned_sentences_df["label"] == "HUMAN"
                ]
                self.text_prediction_label[0] = "HUMAN"
                self.text_prediction_score[0] = human_rows["similarity"].mean()
        else:
            # If no URLs were found, use AI detection on the entire input.
            print("No source found in the input text")
            text = " ".join(
                self.aligned_sentences_df["input"].dropna().tolist(),
            )

            # Detect text origin using an AI model.
            label, score = detect_text_by_ai_model(text)
            self.text_prediction_label[0] = label
            self.text_prediction_score[0] = score

    def find_text_source(self):
        """
        Determines the origin of the given text based on paraphrasing
        detection and human authorship analysis.

        1. Splits the input news text into paragraphs.
        2. Searches for a source for each paragraph.
        3. Updates aligned_sentences_df with the found sources.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        input_paragraphs = split_into_paragraphs(self.news_text)

        # Initialize an empty DataFrame if it doesn't exist,
        # otherwise extend it.
        if (
            not hasattr(self, "aligned_sentences_df")
            or self.aligned_sentences_df is None
        ):
            self.aligned_sentences_df = pd.DataFrame(
                columns=[
                    "input",
                    "source",
                    "label",
                    "similarity",
                    "paraphrase",
                    "url",
                    "entities",
                ],
            )

        # Add one empty row per input paragraph in a single concat.
        if input_paragraphs:
            empty_rows = pd.DataFrame(
                [
                    {
                        "input": None,
                        "source": None,
                        "label": None,
                        "similarity": None,
                        "paraphrase": None,
                        "url": None,
                        "entities": None,
                    }
                    for _ in input_paragraphs
                ],
            )
            self.aligned_sentences_df = pd.concat(
                [self.aligned_sentences_df, empty_rows],
                ignore_index=True,
            )

        # Find a source for each paragraph.
        for index, _ in enumerate(input_paragraphs):
            similarity = self.aligned_sentences_df.loc[index, "similarity"]
            # Skip paragraphs that already have a high-similarity source.
            if (
                similarity is not None
                and similarity > PARAPHRASE_THRESHOLD_MACHINE
            ):
                continue

            print(f"\n-------index = {index}-------")
            print(f"current_text = {input_paragraphs[index]}\n")

            self.aligned_sentences_df, img_urls = find_sentence_source(
                input_paragraphs,
                index,
                self.aligned_sentences_df,
            )

            # Initialize found_img_url if it does not exist.
            if not hasattr(self, "found_img_url"):
                self.found_img_url = []
            self.found_img_url.extend(img_urls)

    def verify_text(self, url):
        """
        Verifies the text origin based on similarity scores and labels
        associated with a given URL.

        1. Filters sentences by URL and similarity score.
        2. Determines if the text is likely generated
           by a machine or a human.
        3. Calculates an average similarity score.

        Args:
            url (str): The URL to filter sentences by.

        Returns:
            tuple: A tuple containing:
                - Label ("MACHINE", "HUMAN", or "UNKNOWN")
                - Score
        """
        label = "UNKNOWN"
        score = 0.0

        # Filter sentences by URL.
        filtered_by_url = self.aligned_sentences_df[
            self.aligned_sentences_df["url"] == url
        ]

        # Keep only sentences whose similarity score
        # exceeds PARAPHRASE_THRESHOLD.
        filtered_by_similarity = filtered_by_url[
            filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
        ]

        # Check if the ratio of high-similarity sentences exceeds
        # MIN_RATIO_PARAPHRASE_NUM.
        if (
            len(self.aligned_sentences_df) > 0
            and len(filtered_by_similarity) / len(self.aligned_sentences_df)
            > MIN_RATIO_PARAPHRASE_NUM
        ):
            # Check if any of the filtered rows is labeled "MACHINE".
            contains_machine = (
                filtered_by_similarity["label"]
                .str.contains(
                    "MACHINE",
                    case=False,
                    na=False,
                )
                .any()
            )

            # TODO: integrate with determine_text_origin
            if contains_machine:
                # If a "MACHINE" label is present,
                # set the label and calculate the score.
                machine_rows = filtered_by_similarity[
                    filtered_by_similarity["label"].str.contains(
                        "MACHINE",
                        case=False,
                        na=False,
                    )
                ]
                generated_model, _ = predict_generation_model(self.news_text)
                label = f"Partially generated by {generated_model}"
                score = machine_rows["similarity"].mean()
            else:
                # If no "MACHINE" label, assign "HUMAN" and calculate score.
                label = "HUMAN"
                human_rows = filtered_by_similarity[
                    filtered_by_similarity["label"].str.contains(
                        "HUMAN",
                        case=False,
                        na=False,
                    )
                ]
                score = human_rows["similarity"].mean()

        return label, score
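
    # Worked example for verify_text (illustrative; assumes
    # PARAPHRASE_THRESHOLD = 0.8 and MIN_RATIO_PARAPHRASE_NUM = 0.5):
    # if 6 of the 10 aligned sentences match this URL with
    # similarity > 0.8, the ratio 6/10 = 0.6 passes the threshold;
    # the label then depends on whether any of those 6 rows
    # carries a "MACHINE" label.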

    def determine_image_origin(self):
        """
        Determines the origin of the news image
        using various detection methods.

        1. Matching against previously found image URLs.
        2. Reverse image search.
        3. AI-based image detection.

        If none of these methods succeed,
        the image origin is marked as "UNKNOWN".
        """
        print("CHECK IMAGE:")

        # Handle the case where no image is provided.
        if not self.news_image:
            self.image_prediction_label = "UNKNOWN"
            self.image_prediction_score = 0.0
            self.image_referent_url = None
            return

        # Attempt to match the image against previously found image URLs.
        print("\tFrom found image URLs...")
        matched_url, similarity = detect_image_from_news_image(
            self.news_image,
            self.found_img_url,
        )
        if matched_url is not None:
            print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # Attempt to find the image origin using reverse image search.
        print("\tFrom reverse image search...")
        matched_url, similarity = detect_image_by_reverse_search(
            self.news_image,
        )
        if matched_url is not None:
            print(f"matched image: {matched_url}\tScore: {similarity}%\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # Attempt to detect the image origin using an AI model.
        print("\tFrom an AI model...")
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            print(f"detected_label: {detected_label} ({score})")
            self.image_prediction_label = detected_label
            self.image_prediction_score = score
            self.image_referent_url = None
            return

        # If all detection methods fail, mark the image origin as "UNKNOWN".
        self.image_prediction_label = "UNKNOWN"
        self.image_prediction_score = 50
        self.image_referent_url = None

    def determine_origin(self):
        """
        Determine origins by analyzing the news text and image.
        """
        if self.news_text != "":
            self.determine_text_origin()
        if self.news_image != "":
            self.determine_image_origin()

        # Handle entity recognition and processing.
        self.handle_entities()

    def generate_report(self) -> tuple[str, str, str]:
        """
        Generates reports tailored for different user roles
        (ordinary users, fact checkers, governors).

        Returns:
            tuple: A tuple containing three HTML-formatted reports:
                - ordinary_user_table: Report for ordinary users.
                - fact_checker_table: Report for fact checkers.
                - governor_table: Report for governors.
        """
        ordinary_user_table = self.create_ordinary_user_table()
        fact_checker_table = self.create_fact_checker_table()
        governor_table = self.create_governor_table()
        return ordinary_user_table, fact_checker_table, governor_table

    def handle_entities(self):
        """
        Highlights and assigns entities with colors to aligned sentences
        based on grouped URLs.

        For each grouped URL:
        1. Highlights entities in the input and source text.
        2. Assigns these highlighted entities to the corresponding
           sentences in the aligned sentences DataFrame.
        """
        for _, row in self.grouped_url_df.iterrows():
            # Get entity-word pairs with their assigned colors.
            entities_with_colors = highlight_entities(
                row["input"],
                row["source"],
            )

            # Assign the highlighted entities to the corresponding sentences
            # in aligned_sentences_df.
            for sent_idx, sentence in self.aligned_sentences_df.iterrows():
                if sentence["url"] == row["url"]:
                    # Use .at to modify the DataFrame efficiently.
                    self.aligned_sentences_df.at[sent_idx, "entities"] = (
                        entities_with_colors
                    )

    def get_text_urls(self) -> set:
        """
        Returns a set of unique URLs referenced in the text analysis.

        Returns:
            set: A set containing the unique URLs referenced in the text.
        """
        return set(self.aligned_sentences_df["url"].dropna().tolist())

    def create_fact_checker_table(self):
        rows = []
        rows.append(self.format_image_fact_checker_row())

        for _, row in self.aligned_sentences_df.iterrows():
            if row["input"] is None:
                continue

            if row["source"] is None:
                equal_idx_1, equal_idx_2 = [], []
            else:
                # Get indices of equal phrases in input and source sentences.
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
                    row["source"],
                )

            self.fact_checker_table.append(
                [
                    row,
                    equal_idx_1,
                    equal_idx_2,
                    row["entities"],
                    row["url"],
                ],
            )

        previous_url = None
        span_row = 1
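
        # Rows that share a URL are merged into one table cell via rowspan.
        # span_row tracks how many rows remain in the current URL group:
        # e.g. for URLs [A, A, A, B], the first A-row computes span_row = 3
        # and emits rowspan="3"; the next two A-rows skip the merged cells
        # and only decrement the counter; B then starts a new group.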
        for index, row in enumerate(self.fact_checker_table):
            current_url = row[4]
            last_url_row = False

            # First row or URL change
            if index == 0 or current_url != previous_url:
                first_url_row = True
                previous_url = current_url
                # Increase span_row while the following rows
                # share the same URL.
                while (
                    index + span_row < len(self.fact_checker_table)
                    and self.fact_checker_table[index + span_row][4]
                    == current_url
                ):
                    span_row += 1
            else:
                first_url_row = False
                span_row -= 1

            if span_row == 1:
                last_url_row = True

            formatted_row = self.format_text_fact_checker_row(
                row,
                first_url_row,
                last_url_row,
                span_row,
            )
            rows.append(formatted_row)
table = "\n".join(rows) | |
return f""" | |
<h5>Comparison between input news and source news:</h5> | |
<table border="1" style="width:100%; text-align:left;"> | |
<col style="width: 170px;"> | |
<col style="width: 170px;"> | |
<col style="width: 30px;"> | |
<col style="width: 75px;"> | |
<thead> | |
<tr> | |
<th>Input news</th> | |
<th>Source (URL in Originality)</th> | |
<th>Forensic</th> | |
<th>Originality</th> | |
</tr> | |
</thead> | |
<tbody> | |
{table} | |
</tbody> | |
</table> | |
<style> | |
""" | |

    def format_text_fact_checker_row(
        self,
        row,
        first_url_row=True,
        last_url_row=True,
        span_row=1,
    ):
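        """
        Formats one fact-checker table row.

        Args:
            row (list): [aligned_sentence, equal_idx_1, equal_idx_2,
                entities, url] as appended in create_fact_checker_table.
            first_url_row (bool): Whether this is the first row
                of its URL group.
            last_url_row (bool): Whether this is the last row
                of its URL group.
            span_row (int): Number of remaining rows in the URL group,
                used for the rowspan attribute.
        """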
        entity_count = 0
        if row[0]["input"] is None:
            return ""

        if row[0]["source"] is not None:  # source is not empty
            if row[3] is not None:
                # Highlight entities.
                input_sentence, highlight_idx_input = apply_highlight(
                    row[0]["input"],
                    row[3],
                    "input",
                )
                source_sentence, highlight_idx_source = apply_highlight(
                    row[0]["source"],
                    row[3],
                    "source",
                )
                entity_count = len(row[3])
            else:
                input_sentence = row[0]["input"]
                source_sentence = row[0]["source"]
                highlight_idx_input = []
                highlight_idx_source = []

            # Color overlapping words.
            input_sentence = color_text(
                input_sentence,
                row[1],
                highlight_idx_input,
            )  # text, indices of words to highlight
            source_sentence = color_text(
                source_sentence,
                row[2],
                highlight_idx_source,
            )  # text, indices of words to highlight

            # Restore placeholders: the intermediate text uses "_" so that
            # word counting stays correct; swap it back for display.
            input_sentence = input_sentence.replace(
                "span_style",
                "span style",
            ).replace("1px_4px", "1px 4px")
            source_sentence = source_sentence.replace(
                "span_style",
                "span style",
            ).replace("1px_4px", "1px 4px")
        else:
            input_sentence = row[0]["input"]
            source_sentence = ""  # no source found for this sentence

        url = row[0]["url"]

        # Look up the label and score displayed for this URL.
        filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
        if len(filterby_url) > 0:
            label = filterby_url["label"].values[0]
            score = filterby_url["score"].values[0]
        else:
            label = self.text_prediction_label[0]
            score = self.text_prediction_score[0]

        # Format the displayed URL.
        source_text_url = f"""<a href="{url}">{url}</a>"""

        # Format the displayed entity count.
        entity_count_text = format_entity_count(entity_count)

        border_top = "border-top: 1px solid transparent;"
        border_bottom = "border-bottom: 1px solid transparent;"
        word_break = "word-break: break-all;"

        if first_url_row:
            if last_url_row:
                # First & last row of the group: no transparent borders.
                return f"""
                    <tr>
                        <td>{input_sentence}</td>
                        <td>{source_sentence}</td>
                        <td rowspan="{span_row}">{label}<br>
                            ({score * 100:.2f}%)<br><br>
                            {entity_count_text}</td>
                        <td rowspan="{span_row}" style="{word_break}">{source_text_url}</td>
                    </tr>
                """
            # First row of the group: transparent bottom border.
            return f"""
                <tr>
                    <td style="{border_bottom}">{input_sentence}</td>
                    <td style="{border_bottom}">{source_sentence}</td>
                    <td rowspan="{span_row}">{label}<br>
                        ({score * 100:.2f}%)<br><br>
                        {entity_count_text}</td>
                    <td rowspan="{span_row}" style="{word_break}">{source_text_url}</td>
                </tr>
            """
        if last_url_row:
            # Not the first row of the group, but the last:
            # transparent top border.
            return f"""
                <tr>
                    <td style="{border_top}">{input_sentence}</td>
                    <td style="{border_top}">{source_sentence}</td>
                </tr>
            """
        # Neither first nor last row: transparent top & bottom borders.
        return f"""
            <tr>
                <td style="{border_top} {border_bottom}">{input_sentence}</td>
                <td style="{border_top} {border_bottom}">{source_sentence}</td>
            </tr>
        """

    def format_image_fact_checker_row(self):
        if (
            self.image_referent_url is not None
            and self.image_referent_url != ""
        ):
            source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">"""  # noqa: E501
            source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>"""  # noqa: E501
        else:
            source_image = "Image not found"
            source_image_url = ""

        word_break = "word-break: break-all;"

        return f"""
        <tr>
            <td>input image</td>
            <td>{source_image}</td>
            <td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td>
            <td style="{word_break}">{source_image_url}</td></tr>"""

    def create_ordinary_user_table(self):
        rows = []
        rows.append(self.format_image_ordinary_user_row())
        rows.append(self.format_text_ordinary_user_row())
        table = "\n".join(rows)

        return f"""
        <h5>Comparison between input news and source news:</h5>
        <table border="1" style="width:100%; text-align:left;">
            <colgroup>
                <col style="width: 340px;">
                <col style="width: 30px;">
                <col style="width: 75px;">
            </colgroup>
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        """

    def format_text_ordinary_user_row(self):
        input_sentences = ""
        source_text_urls = ""
        urls = []
        for _, row in self.aligned_sentences_df.iterrows():
            if row["input"] is None:
                continue
            input_sentences += row["input"] + "<br><br>"
            url = row["url"]
            if url not in urls:
                urls.append(url)
                source_text_urls += f"""<a href="{url}">{url}</a><br>"""

        word_break = "word-break: break-all;"

        return f"""
        <tr>
            <td>{input_sentences}</td>
            <td>{self.text_prediction_label[0]}<br>
                ({self.text_prediction_score[0] * 100:.2f}%)</td>
            <td style="{word_break}">{source_text_urls}</td>
        </tr>
        """

    def format_image_ordinary_user_row(self):
        if (
            self.image_referent_url is not None
            and self.image_referent_url != ""
        ):
            source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>"""  # noqa: E501
        else:
            source_image_url = ""

        word_break = "word-break: break-all;"

        return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}">{source_image_url}</td></tr>"""  # noqa: E501

    def create_governor_table(self):
        rows = []
        rows.append(self.format_image_governor_row())

        for _, row in self.aligned_sentences_df.iterrows():
            if row["input"] is None:
                continue

            if row["source"] is None:
                equal_idx_1, equal_idx_2 = [], []
            else:
                # Get indices of equal phrases in input and source sentences.
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
                    row["source"],
                )

            self.governor_table.append(
                [
                    row,
                    equal_idx_1,
                    equal_idx_2,
                    row["entities"],
                ],
            )

        formatted_row = self.format_text_governor_row()
        rows.append(formatted_row)
        table = "\n".join(rows)

        return f"""
        <h5>Comparison between input news and source news:</h5>
        <table border="1" style="width:100%; text-align:left;">
            <colgroup>
                <col style="width: 170px;">
                <col style="width: 170px;">
                <col style="width: 30px;">
                <col style="width: 75px;">
            </colgroup>
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Source (URL in Originality)</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        """

    def format_text_governor_row(self):
        input_sentences = ""
        source_sentences = ""
        source_text_urls = ""
        urls = []
        sentence_count = 0
        # Seed with two zeros so entity_count[-2] is valid
        # on the first iteration.
        entity_count = [0, 0]
        for row in self.governor_table:
            if row[0]["input"] is None:
                continue

            if row[0]["source"] is not None:  # source is not empty
                # Highlight entities.
                input_sentence, highlight_idx_input = apply_highlight(
                    row[0]["input"],
                    row[3],  # entities_with_colors
                    "input",  # key
                    entity_count[-2],  # the last element holds the current count
                )
                source_sentence, highlight_idx_source = apply_highlight(
                    row[0]["source"],
                    row[3],  # entities_with_colors
                    "source",  # key
                    entity_count[-2],  # the last element holds the current count
                )

                # Color overlapping words.
                input_sentence = color_text(
                    input_sentence,
                    row[1],
                    highlight_idx_input,
                )  # text, indices of words to highlight
                source_sentence = color_text(
                    source_sentence,
                    row[2],
                    highlight_idx_source,
                )  # text, indices of words to highlight

                # Restore placeholders: the intermediate text uses "_" so
                # that word counting stays correct; swap it back for display.
                input_sentence = input_sentence.replace(
                    "span_style",
                    "span style",
                ).replace("1px_4px", "1px 4px")
                source_sentence = source_sentence.replace(
                    "span_style",
                    "span style",
                ).replace("1px_4px", "1px 4px")
            else:
                input_sentence = row[0]["input"]
                source_sentence = ""

            input_sentences += input_sentence + "<br><br>"
            source_sentences += source_sentence + "<br><br>"

            url = row[0]["url"]
            if url not in urls:
                urls.append(url)
                source_text_urls += f"""<a href="{url}">{url}</a><br><br>"""
            sentence_count += 1
            if row[3] is not None:
                entity_count.append(len(row[3]))

        entity_count_text = format_entity_count(sum(entity_count))

        word_break = "word-break: break-all;"

        return f"""
        <tr>
            <td>{input_sentences}</td>
            <td>{source_sentences}</td>
            <td>{self.text_prediction_label[0]}<br>
                ({self.text_prediction_score[0] * 100:.2f}%)<br><br>
                {entity_count_text}</td>
            <td style="{word_break}">{source_text_urls}</td>
        </tr>
        """

    def format_image_governor_row(self):
        if (
            self.image_referent_url is not None
            and self.image_referent_url != ""
        ):
            source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">"""  # noqa: E501
            source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>"""  # noqa: E501
        else:
            source_image = "Image not found"
            source_image_url = ""

        word_break = "word-break: break-all;"

        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}">{source_image_url}</td></tr>"""  # noqa: E501