Spaces:

pmkhanh7890
/

news_verification

Sleeping

App Files Files

news_verification / src /application /formatting_fact_checker.py

pmkhanh7890

fix bug of showing forensic

5842223 5 months ago

raw

history blame

8.06 kB

	from pandas import DataFrame

	from src.application.config import WORD_BREAK
	from src.application.formatting import (
	color_text,
	format_entity_count,
	)
	from src.application.image.helper import encode_image
	from src.application.image.image import ImageDetector
	from src.application.text.entity import apply_highlight
	from src.application.text.helper import (
	extract_equal_text,
	replace_leading_spaces,
	)
	from src.application.text.text import TextDetector


	def create_fact_checker_table(
	    aligned_sentences_df: DataFrame,
	    text: TextDetector,
	    image: ImageDetector,
	):
	    """Build the HTML table comparing the input news with its sources.

	    The image row (if ``image.input`` is set) comes first. When
	    ``text.input`` is set, every row of ``aligned_sentences_df`` whose
	    ``"input"`` is not None is appended to ``text.fact_checker_table`` as
	    ``[row, equal_idx_input, equal_idx_source, entities, url]``. Every
	    entry of ``text.fact_checker_table`` is then rendered; consecutive
	    entries sharing the same URL form one group.

	    Args:
	        aligned_sentences_df: Rows providing the ``"input"``, ``"source"``,
	            ``"entities"`` and ``"url"`` fields.
	        text: Text detector; its ``fact_checker_table`` list is extended
	            in place by this call.
	        image: Image detector; rendered only when ``image.input`` is set.

	    Returns:
	        An HTML string containing a heading and the comparison table.
	    """
	    rows = []
	    if image.input is not None:
	        rows.append(format_image_fact_checker_row(image))

	    if text.input is not None:
	        for _, row in aligned_sentences_df.iterrows():
	            if row["input"] is None:
	                continue

	            if row["source"] is None:
	                # Two distinct lists: an alias would let an in-place change
	                # to one silently affect the other.
	                equal_idx_1, equal_idx_2 = [], []
	            else:
	                # Get index of equal phrases in input and source sentences
	                equal_idx_1, equal_idx_2 = extract_equal_text(
	                    row["input"],
	                    row["source"],
	                )

	            text.fact_checker_table.append(
	                [
	                    row,  # aligned_sentences_df
	                    equal_idx_1,  # index of equal text in input
	                    equal_idx_2,  # index of equal text in source
	                    row["entities"],
	                    row["url"],
	                ],
	            )

	    table_rows = text.fact_checker_table
	    previous_url = None
	    span_row = 1
	    for index, row in enumerate(table_rows):
	        current_url = row[4]

	        if index == 0 or current_url != previous_url:
	            # First row of a URL group: count how many consecutive rows
	            # share this URL so the label/URL cells can span all of them.
	            first_url_row = True
	            previous_url = current_url
	            span_row = 1
	            while (
	                index + span_row < len(table_rows)
	                and table_rows[index + span_row][4] == current_url
	            ):
	                span_row += 1
	        else:
	            # Inside a group: span_row counts the rows left in the group.
	            first_url_row = False
	            span_row -= 1

	        last_url_row = span_row == 1

	        rows.append(
	            format_text_fact_checker_row(
	                text,
	                row,
	                first_url_row,
	                last_url_row,
	                span_row,
	            ),
	        )

	    table = "\n".join(rows)
	    # The fragment ends at </table>; no trailing unclosed <style> tag, which
	    # would turn any markup placed after this fragment into CSS text.
	    return f"""
	<h5>Comparison between input news and source news:</h5>
	<table border="1" style="width:100%; text-align:left;">
	<col style="width: 170px;">
	<col style="width: 170px;">
	<col style="width: 30px;">
	<col style="width: 75px;">
	<thead>
	<tr>
	<th>Input news</th>
	<th>Source (URL in Originality)</th>
	<th>Forensic</th>
	<th>Originality</th>
	</tr>
	</thead>
	<tbody>
	{table}
	</tbody>
	</table>
	"""


	def format_text_fact_checker_row(
	    text: TextDetector,
	    row: list,
	    first_url_row: bool = True,
	    last_url_row: bool = True,
	    span_row: int = 1,
	):
	    """Render one aligned sentence pair as a ``<tr>`` of the fact-checker table.

	    Args:
	        text: Text detector; ``grouped_url_df`` supplies the label and score
	            for the row's URL, with ``prediction_label[0]`` and
	            ``prediction_score[0]`` as fallback when the URL is not listed.
	        row: ``[sentence_pair, equal_idx_input, equal_idx_source, entities,
	            url]`` where ``sentence_pair`` is indexed by ``"input"``,
	            ``"source"`` and ``"url"``.
	        first_url_row: True when this is the first row of its URL group;
	            only such rows carry the label and URL cells.
	        last_url_row: True when this is the last row of its URL group.
	        span_row: ``rowspan`` applied to the label and URL cells.

	    Returns:
	        The HTML for the row, or ``""`` when the input sentence is None.
	    """
	    sentence_pair = row[0]
	    if sentence_pair["input"] is None:
	        return ""

	    entities = row[3]
	    entity_count = 0
	    input_sentence = sentence_pair["input"]
	    source_sentence = sentence_pair["source"]

	    if source_sentence is not None:  # source is not empty
	        highlight_idx_input = []
	        highlight_idx_source = []
	        if entities is not None:
	            entity_count = len(entities)
	            # highlight entities
	            input_sentence, highlight_idx_input = apply_highlight(
	                input_sentence,
	                entities,
	                "input",
	            )
	            source_sentence, highlight_idx_source = apply_highlight(
	                source_sentence,
	                entities,
	                "source",
	            )

	        # Color overlapping words (text, index of equal words, highlights)
	        input_sentence = color_text(
	            input_sentence,
	            row[1],
	            highlight_idx_input,
	        )
	        source_sentence = color_text(
	            source_sentence,
	            row[2],
	            highlight_idx_source,
	        )

	        # Replace _ to get correct formatting
	        # Original one having _ for correct word counting
	        for placeholder, markup in (
	            ("span_style", "span style"),
	            ("1px_4px", "1px 4px"),
	        ):
	            input_sentence = input_sentence.replace(placeholder, markup)
	            source_sentence = source_sentence.replace(placeholder, markup)

	    # NOTE(review): source_sentence may still be None here; presumably
	    # replace_leading_spaces tolerates that - confirm against the helper.
	    input_sentence = replace_leading_spaces(input_sentence)
	    source_sentence = replace_leading_spaces(source_sentence)

	    url = sentence_pair["url"]

	    # Displayed label and score by url
	    filterby_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
	    if len(filterby_url) > 0:
	        label = filterby_url["label"].values[0]
	        score = filterby_url["score"].values[0]
	    else:
	        label = text.prediction_label[0]
	        score = text.prediction_score[0]

	    # Format displayed url; a missing URL yields an empty cell rather than
	    # the literal text "None" (same convention as the image row).
	    if url is None:
	        source_text_url = ""
	    else:
	        source_text_url = f"""<a href="{url}">{url}</a>"""

	    # Format displayed entity count
	    entity_count_text = format_entity_count(entity_count)

	    is_first = first_url_row is True
	    is_last = last_url_row is True

	    # Borders between rows of the same URL group are made transparent so
	    # the group reads as one block next to its row-spanning cells.
	    borders = []
	    if not is_first:
	        borders.append("border-top: 1px solid transparent;")
	    if not is_last:
	        borders.append("border-bottom: 1px solid transparent;")
	    style_attr = ""
	    if borders:
	        joined_borders = " ".join(borders)
	        style_attr = f' style="{joined_borders}"'

	    lines = [
	        "",
	        "<tr>",
	        f"<td{style_attr}>{input_sentence}</td>",
	        f"<td{style_attr}>{source_sentence}</td>",
	    ]
	    if is_first:
	        # Only the first row of a group carries the row-spanning cells.
	        lines += [
	            f'<td rowspan="{span_row}">{label}<br>',
	            f"({score * 100:.2f}%)<br><br>",
	            f"{entity_count_text}</td>",
	            f'<td rowspan="{span_row}" style="{WORD_BREAK}">'
	            f"{source_text_url}</td>",
	        ]
	    lines += ["</tr>", ""]
	    return "\n\t".join(lines)


	def format_image_fact_checker_row(image: ImageDetector):
	    """Render the image comparison as a ``<tr>`` of the fact-checker table.

	    Args:
	        image: Image detector providing ``input``, ``referent_url``,
	            ``prediction_label`` and ``prediction_score``.

	    Returns:
	        The HTML for the row, or ``""`` when ``image.input`` is None. When
	        ``image.referent_url`` is None or empty, the source cell reads
	        "Image not found" and the URL cell is empty.
	    """
	    if image.input is None:
	        return ""

	    # The input cell does not depend on whether a referent was found, so it
	    # is built up front and is defined on both branches below.
	    if "http" in image.input:
	        input_image = f"""<a href="{image.input}">{image.input}</a>"""
	    else:
	        base64_image = encode_image(image.input)
	        input_image = f"""<img src="data:image/jpeg;base64,{base64_image}" width="100" height="150">"""  # noqa: E501

	    # Both conditions must hold: with "or" the test is always true and the
	    # "Image not found" branch can never run.
	    if image.referent_url is not None and image.referent_url != "":
	        source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
	        source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
	    else:
	        source_image = "Image not found"
	        source_image_url = ""

	    return f"""
	<tr>
	<td>{input_image}</td>
	<td>{source_image}</td>
	<td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
	<td style="{WORD_BREAK}">{source_image_url}</td>
	</tr>
	"""