Spaces:
Sleeping
Sleeping
File size: 8,091 Bytes
7e96e8d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
import re
def create_matching_pattern(word):
    """Build a regex pattern string that matches ``word`` as a whole term.

    The term is escaped with ``re.escape`` and a ``\\b`` word boundary is
    added on each side whose edge character is a word character
    (alphanumeric or underscore). Edges that are symbols or whitespace
    (e.g. ``$100``, ``(beta)``, ``25%``) get no boundary there, because
    ``\\b`` would fail next to a non-word character.

    This keeps phrases, symbol-bearing terms and alphanumeric codes from
    matching in the middle of a longer token ("in the" no longer matches
    inside "within the"; "S&P 500" no longer matches inside "S&P 5000").

    A term that ends in ``%`` also matches when whitespace separates the
    number from the percent sign ("25%" matches "25 %").

    Args:
        word (str): The word or phrase to match.

    Returns:
        str: A regex pattern string without capturing groups. For an empty
        or whitespace-only ``word`` the pattern ``(?!)`` is returned, which
        never matches.
    """
    # A blank term would otherwise match at every boundary or every space.
    if not word.strip():
        return r'(?!)'

    def is_word_char(char):
        # Same definition the `re` module uses for \w on str patterns.
        return char.isalnum() or char == '_'

    # \b only works next to a word character, so add it per edge.
    leading = r'\b' if is_word_char(word[0]) else ''
    trailing = r'\b' if is_word_char(word[-1]) else ''

    if word.endswith('%') and len(word) > 1:
        # Percentage values: tolerate optional whitespace before the sign.
        core = rf'{re.escape(word[:-1])}\s*%'
    else:
        core = re.escape(word)

    return f'{leading}{core}{trailing}'
def highlight_common_words(common_words, sentences, title):
    """
    Render numbered sentences as HTML with the common words highlighted.

    Every occurrence of a common word (matched case-insensitively with the
    pattern from ``create_matching_pattern``) is wrapped in a coloured
    ``<span>`` that starts with a small badge showing the word's index.

    All words are matched in a single pass over the raw sentence text, so a
    word can never match inside the markup generated for another word
    (e.g. "color", "bold" or "black" inside a ``style`` attribute) and the
    "N." sentence-number prefix is never highlighted. When terms overlap,
    the longer term takes precedence; highlights are not nested.

    Args:
        common_words (list of tuples): ``(index, word)`` pairs; the index is
            shown in the badge. Blank words are ignored, and for a word
            listed more than once the first index is used.
        sentences (list of str): Sentences to render, numbered from 1.
        title (str): The title for the HTML output.

    Returns:
        str: HTML string with the highlighted sentences.

    Note:
        ``title`` and the sentence text are inserted into the HTML as-is,
        without escaping.
    """
    pastel_colors = ['#E199C6', '#7FB3D5', '#E57373', '#B388EB', '#80D9AA', '#F0B66B',
                     "#73D9A0", "#9A89EB", "#E38AD4", "#7FAFDB", "#DDAF8C", "#DA7FC1",
                     "#65CFA5", "#B38FDE", "#E6C97A"]

    # One colour per distinct word, assigned in first-seen order.
    color_map = {}
    badge_index = {}
    for index, word in common_words:
        if word in color_map:
            continue
        color_map[word] = pastel_colors[len(color_map) % len(pastel_colors)]
        badge_index[word] = index

    # Longest terms first so a phrase wins over a shorter term it contains;
    # sorted() is stable, so equal-length terms keep their list order.
    terms = sorted((w for w in color_map if w.strip()), key=len, reverse=True)
    group_to_word = {f"w{n}": w for n, w in enumerate(terms)}

    # Build the combined pattern once instead of once per sentence and word.
    matcher = None
    if group_to_word:
        matcher = re.compile(
            "|".join(
                f"(?P<{name}>{create_matching_pattern(w)})"
                for name, w in group_to_word.items()
            ),
            flags=re.IGNORECASE,
        )

    def wrap_match(match):
        # lastgroup names the alternative that matched, i.e. which word.
        word = group_to_word[match.lastgroup]
        return (
            f'<span style="background-color: {color_map[word]}; font-weight: bold;'
            f' padding: 2px 4px; border-radius: 2px; position: relative;">'
            f'<span style="background-color: black; color: white; border-radius: 50%;'
            f' padding: 2px 5px; margin-right: 5px;">{badge_index[word]}</span>'
            f'{match.group(0)}'
            f'</span>'
        )

    highlighted_html = []
    for idx, sentence in enumerate(sentences, start=1):
        text = str(sentence)
        if matcher is not None:
            text = matcher.sub(wrap_match, text)
        # The number prefix is added after highlighting so it is never matched.
        highlighted_html.append(f"{idx}. {text}")

    # Format the HTML output with the title
    final_html = "<br><br>".join(highlighted_html)
    return (
        '\n'
        '<div style="border: solid 1px #FFFFFF; padding: 16px; background-color: #000000; color: #FFFFFF; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">\n'
        f'<h3 style="margin-top: 0; font-size: 1em; color: #FFFFFF;">{title}</h3>\n'
        f'<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px; color: #FFFFFF;">{final_html}</div>\n'
        '</div>\n'
    )
def highlight_common_words_dict(common_words, sentences, title):
    """
    Render scored sentences as HTML with the common words highlighted.

    Works on a dictionary that maps each sentence to its entailment score.
    Every occurrence of a common word (matched case-insensitively with the
    pattern from ``create_matching_pattern``) is wrapped in a coloured
    ``<span>`` that starts with a small badge showing the word's index, and
    each sentence is followed by an "Entailment Score" label.

    All words are matched in a single pass over the raw sentence text, so a
    word can never match inside the markup generated for another word
    (e.g. "color", "bold" or "black" inside a ``style`` attribute) and the
    "N." sentence-number prefix is never highlighted. When terms overlap,
    the longer term takes precedence; highlights are not nested.

    Args:
        common_words (list of tuples): ``(index, word)`` pairs; the index is
            shown in the badge. Blank words are ignored, and for a word
            listed more than once the first index is used.
        sentences (dict): A dictionary where the key is the sentence and the
            value is its entailment score.
        title (str): The title for the HTML output.

    Returns:
        str: HTML string with the highlighted sentences and their entailment scores.

    Note:
        ``title``, the sentence text and the scores are inserted into the
        HTML as-is, without escaping.
    """
    pastel_colors = ['#E199C6', '#7FB3D5', '#E57373', '#B388EB', '#80D9AA', '#F0B66B',
                     "#73D9A0", "#9A89EB", "#E38AD4", "#7FAFDB", "#DDAF8C", "#DA7FC1",
                     "#65CFA5", "#B38FDE", "#E6C97A"]

    # One colour per distinct word, assigned in first-seen order.
    color_map = {}
    badge_index = {}
    for index, word in common_words:
        if word in color_map:
            continue
        color_map[word] = pastel_colors[len(color_map) % len(pastel_colors)]
        badge_index[word] = index

    # Longest terms first so a phrase wins over a shorter term it contains;
    # sorted() is stable, so equal-length terms keep their list order.
    terms = sorted((w for w in color_map if w.strip()), key=len, reverse=True)
    group_to_word = {f"w{n}": w for n, w in enumerate(terms)}

    # Build the combined pattern once instead of once per sentence and word.
    matcher = None
    if group_to_word:
        matcher = re.compile(
            "|".join(
                f"(?P<{name}>{create_matching_pattern(w)})"
                for name, w in group_to_word.items()
            ),
            flags=re.IGNORECASE,
        )

    def wrap_match(match):
        # lastgroup names the alternative that matched, i.e. which word.
        word = group_to_word[match.lastgroup]
        return (
            f'<span style="background-color: {color_map[word]}; font-weight: bold;'
            f' padding: 1px 2px; border-radius: 2px; position: relative;">'
            f'<span style="background-color: black; color: white; border-radius: 50%;'
            f' padding: 1px 3px; margin-right: 3px; font-size: 0.8em;">{badge_index[word]}</span>'
            f'{match.group(0)}'
            f'</span>'
        )

    highlighted_html = []
    for idx, (sentence, score) in enumerate(sentences.items(), start=1):
        text = str(sentence)
        if matcher is not None:
            text = matcher.sub(wrap_match, text)
        # The number prefix and score label are added after highlighting so
        # they are never matched.
        highlighted_html.append(
            f'<div style="margin-bottom: 5px;">'
            f'{idx}. {text}'
            f'<div style="display: inline-block; margin-left: 5px; padding: 3px 5px; border-radius: 3px; '
            f'background-color: #333333; color: white; font-size: 0.9em;">'
            f'Entailment Score: {score}</div></div>'
        )

    # Format the HTML output with the title
    final_html = "<br>".join(highlighted_html)
    return (
        '\n'
        '<div style="background-color: #000000; color: #FFFFFF;border: solid 1px #FFFFFF; border-radius: 8px;">\n'
        f'<h3 style="margin-top: 0; font-size: 1em; color: #FFFFFF;">{title}</h3>\n'
        f'<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px; color: #FFFFFF;">{final_html}</div>\n'
        '</div>\n'
    )
def reparaphrased_sentences_html(sentences):
    """
    Build an HTML block listing the given sentences with 1-based numbering.

    Each sentence is prefixed with "N. " and the entries are separated by a
    double line break inside a dark-themed container.

    Args:
        sentences (list of str): Sentences to number and display.

    Returns:
        str: HTML string containing the numbered sentences.
    """
    # Number the sentences starting from 1 and join them with blank lines.
    numbered = [f"{position}. {text}" for position, text in enumerate(sentences, start=1)]
    body = "<br><br>".join(numbered)

    # Assemble the surrounding container markup.
    return (
        '\n'
        '<div style="border: solid 1px #FFFFFF; background-color: #000000; color: #FFFFFF;\n'
        'box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">\n'
        f'<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px;">{body}</div>\n'
        '</div>\n'
    )
if __name__ == "__main__":
    # Example usage: print the HTML produced by both highlighting helpers.
    common_words = [(1, "highlight"), (2, "numbering"), (3, "S&P 500")]
    sentences = [
        "This is a test to highlight words.",
        "Numbering is important for clarity.",
        "The S&P 500 index rose 2% today.",
    ]

    # Test highlight_common_words
    highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
    print(highlighted_html)

    # Test highlight_common_words_dict
    sentences_with_scores = {
        "Highlight words in this text.": 0.95,
        "Number sentences for clarity.": 0.8,
        "The S&P 500 index is a market benchmark.": 0.88,
    }
    highlighted_html_dict = highlight_common_words_dict(
        common_words, sentences_with_scores, "Test Dict Highlighting"
    )
    print(highlighted_html_dict)