Clean up code
Browse files- .idea/workspace.xml +2 -1
- app.py +3 -17
- custom_renderer.py +25 -26
.idea/workspace.xml
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
<project version="4">
|
| 3 |
<component name="ChangeListManager">
|
| 4 |
<list default="true" id="57f23431-346d-451d-8d77-db859508e831" name="Changes" comment="">
|
|
|
|
| 5 |
<change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
|
| 6 |
<change beforePath="$PROJECT_DIR$/custom_renderer.py" beforeDir="false" afterPath="$PROJECT_DIR$/custom_renderer.py" afterDir="false" />
|
| 7 |
</list>
|
|
@@ -43,7 +44,7 @@
|
|
| 43 |
<component name="PropertiesComponent"><![CDATA[{
|
| 44 |
"keyToString": {
|
| 45 |
"last_opened_file_path": "/home/matthias/Documents/Summarization-fact-checker/HugginfaceSpace/HFSummSpace",
|
| 46 |
-
"settings.editor.selected.configurable": "editor.preferences.
|
| 47 |
}
|
| 48 |
}]]></component>
|
| 49 |
<component name="RecentsManager">
|
|
|
|
| 2 |
<project version="4">
|
| 3 |
<component name="ChangeListManager">
|
| 4 |
<list default="true" id="57f23431-346d-451d-8d77-db859508e831" name="Changes" comment="">
|
| 5 |
+
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
| 6 |
<change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
|
| 7 |
<change beforePath="$PROJECT_DIR$/custom_renderer.py" beforeDir="false" afterPath="$PROJECT_DIR$/custom_renderer.py" afterDir="false" />
|
| 8 |
</list>
|
|
|
|
| 44 |
<component name="PropertiesComponent"><![CDATA[{
|
| 45 |
"keyToString": {
|
| 46 |
"last_opened_file_path": "/home/matthias/Documents/Summarization-fact-checker/HugginfaceSpace/HFSummSpace",
|
| 47 |
+
"settings.editor.selected.configurable": "editor.preferences.folding"
|
| 48 |
}
|
| 49 |
}]]></component>
|
| 50 |
<component name="RecentsManager">
|
app.py
CHANGED
|
@@ -19,7 +19,8 @@ from transformers import pipeline
|
|
| 19 |
import os
|
| 20 |
|
| 21 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
@st.experimental_singleton
|
| 25 |
def get_sentence_embedding_model():
|
|
@@ -108,7 +109,6 @@ def fetch_dependency_svg(filename: str) -> AnyStr:
|
|
| 108 |
def display_summary(summary_content: str):
|
| 109 |
st.session_state.summary_output = summary_content
|
| 110 |
soup = BeautifulSoup(summary_content, features="html.parser")
|
| 111 |
-
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
|
| 112 |
return HTML_WRAPPER.format(soup)
|
| 113 |
|
| 114 |
|
|
@@ -149,7 +149,6 @@ def get_all_entities(text):
|
|
| 149 |
return list(itertools.chain.from_iterable(all_entities_per_sentence))
|
| 150 |
|
| 151 |
|
| 152 |
-
# TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
|
| 153 |
def get_and_compare_entities():
|
| 154 |
# article_content = fetch_article_contents(article_name)
|
| 155 |
article_content = st.session_state.article_text
|
|
@@ -194,10 +193,6 @@ def highlight_entities():
|
|
| 194 |
for entity in unmatched_entities:
|
| 195 |
summary_content = summary_content.replace(entity, markdown_start_red + entity + markdown_end)
|
| 196 |
soup = BeautifulSoup(summary_content, features="html.parser")
|
| 197 |
-
|
| 198 |
-
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
| 199 |
-
margin-bottom: 2.5rem">{}</div> """
|
| 200 |
-
|
| 201 |
return HTML_WRAPPER.format(soup)
|
| 202 |
|
| 203 |
|
|
@@ -207,9 +202,7 @@ def render_dependency_parsing(text: Dict):
|
|
| 207 |
st.write(get_svg(html), unsafe_allow_html=True)
|
| 208 |
|
| 209 |
|
| 210 |
-
# If deps for article: True, otherwise deps for summary calc
|
| 211 |
def check_dependency(article: bool):
|
| 212 |
-
# nlp = spacy.load('en_core_web_lg')
|
| 213 |
if article:
|
| 214 |
text = st.session_state.article_text
|
| 215 |
all_entities = get_all_entities_per_sentence(text)
|
|
@@ -220,7 +213,6 @@ def check_dependency(article: bool):
|
|
| 220 |
# all_entities = st.session_state.entities_per_sentence_summary
|
| 221 |
doc = nlp(text)
|
| 222 |
tok_l = doc.to_json()['tokens']
|
| 223 |
-
# all_deps = ""
|
| 224 |
test_list_dict_output = []
|
| 225 |
|
| 226 |
sentences = list(doc.sents)
|
|
@@ -244,7 +236,6 @@ def check_dependency(article: bool):
|
|
| 244 |
"target_word_index": (t['head'] - sentence.start),
|
| 245 |
"identifier": identifier, "sentence": str(sentence)})
|
| 246 |
elif object_target in all_entities[i]:
|
| 247 |
-
# all_deps = all_deps.join(str(sentence))
|
| 248 |
identifier = object_here + t['dep'] + object_target
|
| 249 |
test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
|
| 250 |
"target_word_index": (t['head'] - sentence.start),
|
|
@@ -252,7 +243,6 @@ def check_dependency(article: bool):
|
|
| 252 |
else:
|
| 253 |
continue
|
| 254 |
return test_list_dict_output
|
| 255 |
-
# return all_deps
|
| 256 |
|
| 257 |
|
| 258 |
def render_svg(svg_file):
|
|
@@ -320,7 +310,7 @@ st.markdown("Let’s start by selecting an article text for which we want to gen
|
|
| 320 |
"generated from it might not be optimal, leading to suboptimal performance of the post-processing steps.")
|
| 321 |
|
| 322 |
selected_article = st.selectbox('Select an article or provide your own:',
|
| 323 |
-
list_all_article_names())
|
| 324 |
st.session_state.article_text = fetch_article_contents(selected_article)
|
| 325 |
article_text = st.text_area(
|
| 326 |
label='Full article text',
|
|
@@ -391,8 +381,6 @@ if summarize_button:
|
|
| 391 |
if st.session_state.unchanged_text:
|
| 392 |
entity_specific_text = fetch_entity_specific_contents(selected_article)
|
| 393 |
soup = BeautifulSoup(entity_specific_text, features="html.parser")
|
| 394 |
-
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
| 395 |
-
margin-bottom: 2.5rem">{}</div> """
|
| 396 |
st.write("π‘π **Specific example explanation** ππ‘", HTML_WRAPPER.format(soup), unsafe_allow_html=True)
|
| 397 |
|
| 398 |
# DEPENDENCY PARSING PART
|
|
@@ -429,8 +417,6 @@ if summarize_button:
|
|
| 429 |
st.write(cur_svg_image, unsafe_allow_html=True)
|
| 430 |
dep_specific_text = fetch_dependency_specific_contents(selected_article)
|
| 431 |
soup = BeautifulSoup(dep_specific_text, features="html.parser")
|
| 432 |
-
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
| 433 |
-
margin-bottom: 2.5rem">{}</div> """
|
| 434 |
st.write("π‘π **Specific example explanation** ππ‘", HTML_WRAPPER.format(soup), unsafe_allow_html=True)
|
| 435 |
else:
|
| 436 |
summary_deps = check_dependency(False)
|
|
|
|
| 19 |
import os
|
| 20 |
|
| 21 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 22 |
+
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
| 23 |
+
margin-bottom: 2.5rem">{}</div> """
|
| 24 |
|
| 25 |
@st.experimental_singleton
|
| 26 |
def get_sentence_embedding_model():
|
|
|
|
| 109 |
def display_summary(summary_content: str):
|
| 110 |
st.session_state.summary_output = summary_content
|
| 111 |
soup = BeautifulSoup(summary_content, features="html.parser")
|
|
|
|
| 112 |
return HTML_WRAPPER.format(soup)
|
| 113 |
|
| 114 |
|
|
|
|
| 149 |
return list(itertools.chain.from_iterable(all_entities_per_sentence))
|
| 150 |
|
| 151 |
|
|
|
|
| 152 |
def get_and_compare_entities():
|
| 153 |
# article_content = fetch_article_contents(article_name)
|
| 154 |
article_content = st.session_state.article_text
|
|
|
|
| 193 |
for entity in unmatched_entities:
|
| 194 |
summary_content = summary_content.replace(entity, markdown_start_red + entity + markdown_end)
|
| 195 |
soup = BeautifulSoup(summary_content, features="html.parser")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
return HTML_WRAPPER.format(soup)
|
| 197 |
|
| 198 |
|
|
|
|
| 202 |
st.write(get_svg(html), unsafe_allow_html=True)
|
| 203 |
|
| 204 |
|
|
|
|
| 205 |
def check_dependency(article: bool):
|
|
|
|
| 206 |
if article:
|
| 207 |
text = st.session_state.article_text
|
| 208 |
all_entities = get_all_entities_per_sentence(text)
|
|
|
|
| 213 |
# all_entities = st.session_state.entities_per_sentence_summary
|
| 214 |
doc = nlp(text)
|
| 215 |
tok_l = doc.to_json()['tokens']
|
|
|
|
| 216 |
test_list_dict_output = []
|
| 217 |
|
| 218 |
sentences = list(doc.sents)
|
|
|
|
| 236 |
"target_word_index": (t['head'] - sentence.start),
|
| 237 |
"identifier": identifier, "sentence": str(sentence)})
|
| 238 |
elif object_target in all_entities[i]:
|
|
|
|
| 239 |
identifier = object_here + t['dep'] + object_target
|
| 240 |
test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
|
| 241 |
"target_word_index": (t['head'] - sentence.start),
|
|
|
|
| 243 |
else:
|
| 244 |
continue
|
| 245 |
return test_list_dict_output
|
|
|
|
| 246 |
|
| 247 |
|
| 248 |
def render_svg(svg_file):
|
|
|
|
| 310 |
"generated from it might not be optimal, leading to suboptimal performance of the post-processing steps.")
|
| 311 |
|
| 312 |
selected_article = st.selectbox('Select an article or provide your own:',
|
| 313 |
+
list_all_article_names())
|
| 314 |
st.session_state.article_text = fetch_article_contents(selected_article)
|
| 315 |
article_text = st.text_area(
|
| 316 |
label='Full article text',
|
|
|
|
| 381 |
if st.session_state.unchanged_text:
|
| 382 |
entity_specific_text = fetch_entity_specific_contents(selected_article)
|
| 383 |
soup = BeautifulSoup(entity_specific_text, features="html.parser")
|
|
|
|
|
|
|
| 384 |
st.write("π‘π **Specific example explanation** ππ‘", HTML_WRAPPER.format(soup), unsafe_allow_html=True)
|
| 385 |
|
| 386 |
# DEPENDENCY PARSING PART
|
|
|
|
| 417 |
st.write(cur_svg_image, unsafe_allow_html=True)
|
| 418 |
dep_specific_text = fetch_dependency_specific_contents(selected_article)
|
| 419 |
soup = BeautifulSoup(dep_specific_text, features="html.parser")
|
|
|
|
|
|
|
| 420 |
st.write("π‘π **Specific example explanation** ππ‘", HTML_WRAPPER.format(soup), unsafe_allow_html=True)
|
| 421 |
else:
|
| 422 |
summary_deps = check_dependency(False)
|
custom_renderer.py
CHANGED
|
@@ -1,6 +1,26 @@
|
|
| 1 |
from typing import Dict
|
| 2 |
from PIL import ImageFont
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
def get_pil_text_size(text, font_size, font_name):
|
|
@@ -21,15 +41,7 @@ def render_arrow(
|
|
| 21 |
i (int): Unique ID, typically arrow index.
|
| 22 |
RETURNS (str): Rendered SVG markup.
|
| 23 |
"""
|
| 24 |
-
|
| 25 |
-
<g class="displacy-arrow">
|
| 26 |
-
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="red"/>
|
| 27 |
-
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
| 28 |
-
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" side="{label_side}" fill="red" text-anchor="middle">{label}</textPath>
|
| 29 |
-
</text>
|
| 30 |
-
<path class="displacy-arrowhead" d="{head}" fill="red"/>
|
| 31 |
-
</g>
|
| 32 |
-
"""
|
| 33 |
arc = get_arc(start + 10, 50, 5, end + 10)
|
| 34 |
arrowhead = get_arrowhead(direction, start + 10, 50, end + 10)
|
| 35 |
label_side = "right" if direction == "rtl" else "left"
|
|
@@ -75,26 +87,15 @@ def get_arrowhead(direction: str, x: int, y: int, end: int) -> str:
|
|
| 75 |
|
| 76 |
|
| 77 |
def render_sentence_custom(unmatched_list: Dict, nlp):
|
| 78 |
-
TPL_DEP_WORDS = """
|
| 79 |
-
<text class="displacy-token" fill="currentColor" text-anchor="start" y="{y}">
|
| 80 |
-
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
|
| 81 |
-
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
|
| 82 |
-
</text>
|
| 83 |
-
"""
|
| 84 |
-
|
| 85 |
-
TPL_DEP_SVG = """
|
| 86 |
-
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="{lang}" id="{id}" class="displacy" width="{width}" height="{height}" direction="{dir}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}; direction: {dir}">{content}</svg>
|
| 87 |
-
"""
|
| 88 |
arcs_svg = []
|
| 89 |
-
#nlp = spacy.load('en_core_web_lg')
|
| 90 |
doc = nlp(unmatched_list["sentence"])
|
| 91 |
|
| 92 |
x_value_counter = 10
|
| 93 |
index_counter = 0
|
| 94 |
svg_words = []
|
| 95 |
-
|
| 96 |
-
coords_test = []
|
| 97 |
direction_current = "rtl"
|
|
|
|
| 98 |
if unmatched_list["cur_word_index"] < unmatched_list["target_word_index"]:
|
| 99 |
min_index = unmatched_list["cur_word_index"]
|
| 100 |
max_index = unmatched_list["target_word_index"]
|
|
@@ -108,13 +109,13 @@ def render_sentence_custom(unmatched_list: Dict, nlp):
|
|
| 108 |
pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
|
| 109 |
svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
|
| 110 |
if min_index <= index_counter <= max_index:
|
| 111 |
-
|
| 112 |
if index_counter < max_index - 1:
|
| 113 |
x_value_counter += 50
|
| 114 |
index_counter += 1
|
| 115 |
x_value_counter += pixel_x_length + 4
|
| 116 |
|
| 117 |
-
arcs_svg.append(render_arrow(unmatched_list['dep'],
|
| 118 |
|
| 119 |
content = "".join(svg_words) + "".join(arcs_svg)
|
| 120 |
|
|
@@ -130,5 +131,3 @@ def render_sentence_custom(unmatched_list: Dict, nlp):
|
|
| 130 |
lang="en",
|
| 131 |
)
|
| 132 |
return full_svg
|
| 133 |
-
|
| 134 |
-
|
|
|
|
| 1 |
from typing import Dict
|
| 2 |
from PIL import ImageFont
|
| 3 |
|
| 4 |
+
TPL_DEP_WORDS = """
|
| 5 |
+
<text class="displacy-token" fill="currentColor" text-anchor="start" y="{y}">
|
| 6 |
+
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
|
| 7 |
+
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
|
| 8 |
+
</text>
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
TPL_DEP_SVG = """
|
| 12 |
+
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="{lang}" id="{id}" class="displacy" width="{width}" height="{height}" direction="{dir}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}; direction: {dir}">{content}</svg>
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
TPL_DEP_ARCS = """
|
| 16 |
+
<g class="displacy-arrow">
|
| 17 |
+
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="red"/>
|
| 18 |
+
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
| 19 |
+
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" side="{label_side}" fill="red" text-anchor="middle">{label}</textPath>
|
| 20 |
+
</text>
|
| 21 |
+
<path class="displacy-arrowhead" d="{head}" fill="red"/>
|
| 22 |
+
</g>
|
| 23 |
+
"""
|
| 24 |
|
| 25 |
|
| 26 |
def get_pil_text_size(text, font_size, font_name):
|
|
|
|
| 41 |
i (int): Unique ID, typically arrow index.
|
| 42 |
RETURNS (str): Rendered SVG markup.
|
| 43 |
"""
|
| 44 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
arc = get_arc(start + 10, 50, 5, end + 10)
|
| 46 |
arrowhead = get_arrowhead(direction, start + 10, 50, end + 10)
|
| 47 |
label_side = "right" if direction == "rtl" else "left"
|
|
|
|
| 87 |
|
| 88 |
|
| 89 |
def render_sentence_custom(unmatched_list: Dict, nlp):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
arcs_svg = []
|
|
|
|
| 91 |
doc = nlp(unmatched_list["sentence"])
|
| 92 |
|
| 93 |
x_value_counter = 10
|
| 94 |
index_counter = 0
|
| 95 |
svg_words = []
|
| 96 |
+
words_under_arc = []
|
|
|
|
| 97 |
direction_current = "rtl"
|
| 98 |
+
|
| 99 |
if unmatched_list["cur_word_index"] < unmatched_list["target_word_index"]:
|
| 100 |
min_index = unmatched_list["cur_word_index"]
|
| 101 |
max_index = unmatched_list["target_word_index"]
|
|
|
|
| 109 |
pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
|
| 110 |
svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
|
| 111 |
if min_index <= index_counter <= max_index:
|
| 112 |
+
words_under_arc.append(x_value_counter)
|
| 113 |
if index_counter < max_index - 1:
|
| 114 |
x_value_counter += 50
|
| 115 |
index_counter += 1
|
| 116 |
x_value_counter += pixel_x_length + 4
|
| 117 |
|
| 118 |
+
arcs_svg.append(render_arrow(unmatched_list['dep'], words_under_arc[0], words_under_arc[-1], direction_current, i))
|
| 119 |
|
| 120 |
content = "".join(svg_words) + "".join(arcs_svg)
|
| 121 |
|
|
|
|
| 131 |
lang="en",
|
| 132 |
)
|
| 133 |
return full_svg
|
|
|
|
|
|