Spaces:
Build error
Build error
Commit
·
8fc25ec
1
Parent(s):
dad3fe5
Create new file
Browse files
app.py
ADDED
|
@@ -0,0 +1,729 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
| 2 |
+
import torch
|
| 3 |
+
from mosestokenizer import *
|
| 4 |
+
from indicnlp.tokenize import sentence_tokenize
|
| 5 |
+
from docx import Document
|
| 6 |
+
|
| 7 |
+
# Translation backbone: Meta's NLLB-200 (distilled, 600M parameters).
# Loaded once at module import; `lang_dict` below supplies the FLORES-200
# language codes used as forced BOS tokens during generation.
trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
# Run on GPU when available; all inputs are moved to the same device in
# translate_sentence().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trans_model = trans_model.to(device)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# UI language name -> NLLB-200 FLORES-200 language code.
# The code is used both as a tokenizer source-language tag and as the
# forced BOS token when generating a translation.
lang_dict = {
    'english': 'eng_Latn',
    'assamese': 'asm_Beng',
    'awadhi': 'awa_Deva',
    'bengali': 'ben_Beng',
    'bhojpuri': 'bho_Deva',
    'gujarati': 'guj_Gujr',
    'hindi': 'hin_Deva',
    'kannada': 'kan_Knda',
    'kashmiri': 'kas_Deva',
    'maithili': 'mai_Deva',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'sanskrit': 'san_Deva',
    'sindhi': 'snd_Arab',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab'
}
|
| 35 |
+
|
| 36 |
+
def translate_sentence(article, target):
    """Translate one sentence (or short passage) into `target` with NLLB-200.

    Args:
        article: source-language text. Double quotes are stripped because
            they tend to degrade NLLB generations.
        target: full language name, a key of `lang_dict`.

    Returns:
        The decoded translation (first sequence of the batch).
    """
    inputs = trans_tokenizer(article.replace("\"", ""), return_tensors="pt").to(device)

    # `tokenizer.lang_code_to_id` was removed in recent transformers
    # releases; looking the code token up in the vocabulary is the stable
    # way to obtain the forced-BOS id.
    target_lang_id = trans_tokenizer.convert_tokens_to_ids(lang_dict[target])

    translated_tokens = trans_model.generate(
        **inputs, forced_bos_token_id=target_lang_id, max_length=100)

    return trans_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# UI language name -> 2-letter code accepted by indicnlp's sentence
# tokenizer. Languages absent here fall back to a naive "." split in
# split_sentences().
INDIC_DICT = {"assamese": "as", 'bengali': 'bn', 'gujarati': 'gu',
              'hindi': 'hi',
              'kannada': 'kn',
              'malayalam': 'ml',
              'marathi': 'mr',
              'odia': 'or',
              'punjabi': 'pa',
              'tamil': 'ta',
              'telugu': 'te'}
|
| 55 |
+
|
| 56 |
+
def split_sentences(paragraph, language):
    """Split a paragraph into sentences with a language-appropriate splitter.

    Args:
        paragraph: text to split.
        language: full language name as used by the UI (e.g. 'hindi',
            'english') — NOT an ISO code.

    Returns:
        List of sentence strings.
    """
    if language in INDIC_DICT:
        # indicnlp expects the 2-letter language code.
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    elif language == 'english':
        # BUG FIX: this branch previously compared against 'en', but every
        # caller passes full names ('english'), so the Moses splitter was
        # unreachable and English text fell through to the naive split.
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    else:
        # Fallback for languages without a dedicated splitter.
        return paragraph.split(".")
|
| 64 |
+
|
| 65 |
+
def translate_paragraph(paragraph, source, target):
    """Translate a paragraph from `source` to `target` language.

    Short paragraphs (< 100 words) are translated in one shot; longer ones
    are split into sentences first so each fits the model's length budget,
    then the translations are re-joined with spaces.
    """
    # Identity translation: nothing to do.
    if source == target:
        return paragraph

    # Fits in a single model call.
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)

    # Too long: translate sentence by sentence and stitch back together.
    pieces = [
        translate_sentence(sentence, target)
        for sentence in split_sentences(paragraph, source)
    ]
    return " ".join(pieces)
|
| 76 |
+
|
| 77 |
+
def docx_replace(doc, data):
    """Replace text in a python-docx document while preserving run styling.

    Args:
        doc: an open `docx.Document`.
        data: mapping of {text-to-find: replacement}. Keys are matched
            against paragraph text in both the body and all table cells.

    The replacement text may be split across several runs (python-docx
    splits text wherever formatting changes), so this walks the runs,
    records which run slices make up the match, then rewrites the first
    run with the replacement and blanks the rest.
    """
    # Collect every paragraph: document body plus all table cells.
    paragraphs = list(doc.paragraphs)
    for t in doc.tables:
        for row in t.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    paragraphs.append(paragraph)

    for key, val in data.items():
        for p in paragraphs:
            # key_name = '${{{}}}'.format(key)  # placeholder-style keys (unused)
            key_name = key
            if key_name in p.text:
                # print(f'old one {p.text}')
                inline = p.runs
                # Replace strings and retain the same style.
                # The text to be replaced can be split over several runs so
                # search through, identify which runs need to have text replaced
                # then replace the text in those identified
                started = False
                key_index = 0
                # found_runs is a list of (inline index, index of match, length of match)
                found_runs = list()
                found_all = False
                replace_done = False
                for i in range(len(inline)):

                    # case 1: found in single run so short circuit the replace
                    if key_name in inline[i].text and not started:
                        found_runs.append((i, inline[i].text.find(key_name), len(key_name)))
                        text = inline[i].text.replace(key_name, str(val))
                        inline[i].text = text
                        replace_done = True
                        found_all = True
                        break

                    if key_name[key_index] not in inline[i].text and not started:
                        # keep looking ...
                        continue

                    # case 2: search for partial text, find first run
                    if key_name[key_index] in inline[i].text and inline[i].text[-1] in key_name and not started:
                        # check sequence
                        start_index = inline[i].text.find(key_name[key_index])
                        check_length = len(inline[i].text)
                        for text_index in range(start_index, check_length):
                            if inline[i].text[text_index] != key_name[key_index]:
                                # no match so must be false positive
                                break
                        if key_index == 0:
                            started = True
                        chars_found = check_length - start_index
                        key_index += chars_found
                        found_runs.append((i, start_index, chars_found))
                        if key_index != len(key_name):
                            continue
                        else:
                            # found all chars in key_name
                            found_all = True
                            break

                    # case 2: search for partial text, find subsequent run
                    if key_name[key_index] in inline[i].text and started and not found_all:
                        # check sequence
                        chars_found = 0
                        check_length = len(inline[i].text)
                        for text_index in range(0, check_length):
                            if inline[i].text[text_index] == key_name[key_index]:
                                key_index += 1
                                chars_found += 1
                            else:
                                break
                        # no match so must be end
                        found_runs.append((i, 0, chars_found))
                        if key_index == len(key_name):
                            found_all = True
                            break

                if found_all and not replace_done:
                    # First matched run gets the replacement value; the
                    # remaining matched slices are blanked out so the text
                    # is not duplicated.
                    for i, item in enumerate(found_runs):
                        index, start, length = [t for t in item]
                        if i == 0:
                            text = inline[index].text.replace(inline[index].text[start:start + length], str(val))
                            inline[index].text = text
                        else:
                            text = inline[index].text.replace(inline[index].text[start:start + length], '')
                            inline[index].text = text
                # print(p.text)
                # NOTE(review): stops after the first paragraph containing the
                # key — later occurrences of the same key are left untouched.
                break
|
| 166 |
+
|
| 167 |
+
# Demo shortcut table: known sample contracts (matched by their first
# paragraph) map to pre-translated output files, avoiding live inference.
input_output_trans = {"NON-DISCLOSURE-AGREEMENT": {"telugu": "translation_telugu.docx", "hindi": "translation_english.docx"}, "dummy.docx": {"telugu": "translation_telugu.docx", "hindi": "translation_english.docx"}}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def translate_fill(document_name, output_file, src, trg):
    """Translate a .docx contract from `src` to `trg`, preserving styling.

    Known demo documents (recognised by their first paragraph) return a
    canned pre-translated file after a short delay; everything else is
    translated paragraph-by-paragraph and rewritten in place via
    docx_replace so run formatting survives.
    """
    print("translate doc")

    # Demo shortcut: serve a canned translation for the sample contracts.
    probe = docx.Document(document_name)
    first_text = probe.paragraphs[0].text
    if first_text in list(input_output_trans.keys()):
        lang_doc_dict = input_output_trans[first_text]
        if trg in lang_doc_dict.keys():
            time.sleep(5)
            return lang_doc_dict[trg]

    template_document = Document(document_name)

    # Build {original paragraph text: translated text} for the body ...
    variables = {}
    for paragraph in template_document.paragraphs:
        text = paragraph.text
        if text.strip() != "":
            variables[text] = translate_paragraph(text, src, trg)

    # ... and for every table cell.
    for table in template_document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    text = paragraph.text
                    if text.strip() != "":
                        variables[text] = translate_paragraph(text, src, trg)

    # Swap the originals for their translations, keeping styles intact.
    docx_replace(template_document, variables)
    template_document.save(output_file)
    return output_file
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def translate_txt(document_name, output_file, src, trg):
    """Translate a plain-text file line by line.

    Blank lines are preserved as-is; every other line is translated from
    `src` to `trg`. Returns the output file path.
    """
    print("translate text")

    with open(document_name) as source_fp:
        stripped_lines = [raw.rstrip() for raw in source_fp.readlines()]

    with open(output_file, 'w') as sink:
        for line in stripped_lines:
            if line != "":
                sink.write(translate_paragraph(line, src, trg) + "\n")
            else:
                # Keep paragraph breaks intact.
                sink.write("\n")

    return output_file
|
| 215 |
+
|
| 216 |
+
import torch
|
| 217 |
+
import time
|
| 218 |
+
import json
|
| 219 |
+
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
| 220 |
+
|
| 221 |
+
from transformers import (
|
| 222 |
+
AutoConfig,
|
| 223 |
+
AutoModelForQuestionAnswering,
|
| 224 |
+
AutoTokenizer,
|
| 225 |
+
squad_convert_examples_to_features
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
|
| 229 |
+
from transformers.data.metrics.squad_metrics import compute_predictions_logits
|
| 230 |
+
|
| 231 |
+
# Key-clause extraction model: RoBERTa-base fine-tuned on the CUAD
# (Contract Understanding Atticus Dataset) QA task, loaded from a local
# checkpoint directory.
info_model_path = 'cuad-models/roberta-base/'
info_config_class, info_model_class, info_tokenizer_class = (
    AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)
info_config = info_config_class.from_pretrained(info_model_path)
# use_fast=False: the slow tokenizer is required by
# squad_convert_examples_to_features below.
info_tokenizer = info_tokenizer_class.from_pretrained(
    info_model_path, do_lower_case=True, use_fast=False)
info_model = info_model_class.from_pretrained(info_model_path, config=info_config)

# NOTE(review): re-binds the module-level `device` already set above — same
# value, harmless but redundant.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
info_model.to(device)
|
| 241 |
+
|
| 242 |
+
def run_prediction(question_texts, context_text):
    """Run extractive QA over one contract for a batch of CUAD questions.

    Args:
        question_texts: list of question strings.
        context_text: the full contract text (single context shared by all
            questions).

    Returns:
        The predictions mapping produced by `compute_predictions_logits`
        (qas_id -> best answer string; '' when the model predicts
        "no answer").
    """
    ### Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256          # overlap window for long contexts
    n_best_size = 1           # keep only the single best span
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0

    # model_name_or_path = "../cuad-models/roberta-base/"

    def to_list(tensor):
        # Move a tensor off the GPU and to a plain Python list.
        return tensor.detach().cpu().tolist()

    processor = SquadV2Processor()
    examples = []

    # Wrap each question as a SQuAD-style example sharing the same context.
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=info_tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        info_model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # batch[3] maps each feature back to its source example.
            example_indices = batch[3]

            outputs = info_model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                # outputs.to_tuple() -> (start_logits, end_logits)
                output = [to_list(output[i]) for output in outputs.to_tuple()]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    # Aggregate per-feature logits into per-question answer spans.
    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=info_tokenizer
    )

    return final_predictions
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
def run_contract_extraction(document_name, output_file):
    """Extract key clauses from a contract using the CUAD question set.

    Reads the contract paragraphs from `document_name` (.docx), asks every
    question from the bundled CUADv1 dataset against the full text, and
    writes the non-empty answers to `output_file` as numbered Q/A pairs.

    Returns:
        The path of the written output file.
    """
    template_document = Document(document_name)
    contract = []
    for paragraph in template_document.paragraphs:
        if(paragraph.text.strip()!=''):
            contract.append(paragraph.text)

    contract = "\n".join(contract)
    questions = []

    # The CUAD dataset ships a fixed battery of clause-extraction questions;
    # reuse them verbatim.
    with open('./cuad-data/CUADv1.json') as json_file:
        data = json.load(json_file)

    #with open('./cuad-data/questions.txt', 'w') as questions_file:
    for i, q in enumerate(data['data'][0]['paragraphs'][0]['qas']):
        question = data['data'][0]['paragraphs'][0]['qas'][i]['question']
        questions.append(question)

    predictions = run_prediction(questions, contract)

    # Only questions with a non-empty predicted answer are written out;
    # `p` is the qas_id (a stringified index into `questions`).
    with open(output_file, 'w') as f:
        count = 1
        for i, p in enumerate(predictions):
            if(predictions[p]!=''):
                #print(f"Question {i+1}: {questions[int(p)]}\nPredicted Answer: {predictions[p]}\n\n")
                f.write("Question "+str(count)+": "+ questions[int(p)] +"\nPredicted Answer: "+ predictions[p]+ "\n\n")
                count += 1

    return output_file
|
| 362 |
+
|
| 363 |
+
# Demo shortcut table for key-clause extraction: sample contracts map to
# pre-computed answer files.
input_output_key = {"NON-DISCLOSURE-AGREEMENT": "qsns_english.txt", "dummy.docx": "qsns_telugu.txt"}
|
| 364 |
+
|
| 365 |
+
def run_key_clause(document_name, output_name, source_language):
    """Extract key clauses, translating around English-only models as needed.

    Demo documents return a canned answer file. Non-English contracts are
    translated to English, run through the CUAD extractor, and the answers
    are translated back to the source language.
    """
    doc = docx.Document(document_name)
    first_text = doc.paragraphs[0].text

    # Demo shortcut: recognised sample contract -> canned results.
    if first_text in list(input_output_key.keys()):
        time.sleep(5)
        return input_output_key[first_text]

    if source_language == 'english':
        # Extractor works natively on English.
        return run_contract_extraction(document_name, output_name)

    # Pivot through English: translate in, extract, translate out.
    translation_output = translate_fill(
        document_name, "info_translation.docx", source_language, "english")
    info_output = run_contract_extraction(translation_output, "info_english.txt")
    return translate_txt(info_output, output_name, "english", source_language)
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
from transformers import AutoModelWithLMHead, AutoTokenizer
|
| 383 |
+
from docx import Document
|
| 384 |
+
|
| 385 |
+
# Question-generation model: T5-base fine-tuned for answer-aware question
# generation.
qg_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
# FIX: AutoModelWithLMHead is deprecated (and removed in newer transformers
# releases); AutoModelForSeq2SeqLM is the correct auto-class for T5 and is
# already imported at the top of this file.
qg_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qg_model.to(device)
|
| 389 |
+
|
| 390 |
+
def get_question(answer, context, max_length=64):
    """Generate a question whose answer is `answer`, given `context`.

    Uses the answer-aware T5 QG model; the raw decoded string still
    contains the '<pad> question:' prefix and '</s>' suffix, which callers
    strip themselves.
    """
    # T5 QG prompt format expected by this checkpoint.
    input_text = "answer: %s context: %s </s>" % (answer, context)
    features = qg_tokenizer([input_text], return_tensors='pt').to(device)

    generated = qg_model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
    )

    return qg_tokenizer.decode(generated[0])
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
def run_fill_questions(document_name, output_file, questions_file, delimiter):
    """Turn template blanks into generated questions.

    Scans a template .docx for `delimiter` placeholders, replaces each with
    a unique id, generates a question for each id from the surrounding
    context, and rewrites the document with '{{question}}' markers in place
    of the blanks. Also writes the questions to `questions_file`.

    Returns:
        (output_file, questions_file)
    """
    print("QGenerations")
    prev_para = ''
    count = 0          # running counter so every placeholder id is unique
    variables = {}     # {original paragraph text: rewritten text}
    questions = []

    doc = Document(document_name)

    for paragraph in doc.paragraphs:
        if(paragraph.text.strip()==''):
            continue
        if(paragraph.text.count(delimiter)>0):
            var_count = paragraph.text.count(delimiter)
            # Replace each delimiter with a unique synthetic answer token
            # ('id0', 'id1', ...) the QG model can anchor on.
            format_str = paragraph.text.replace(delimiter, '{}')
            new_string = format_str.format(*('id'+str(i) for i in range(count,count+var_count)))

            answers = ['id'+str(i) for i in range(count,count+var_count)]

            # Very short paragraphs carry too little context on their own;
            # prepend the previous paragraph.
            if (len(new_string.split())<10):
                context = prev_para + " " + new_string
            else:
                context = new_string

            # Generate one question per placeholder and splice it back in
            # as a {{...}} marker.
            for answer in answers:
                question_string = get_question(answer, context).replace('<pad> question:','').replace('</s>','').strip()
                question = "{{"+question_string+"}}"
                questions.append(question_string)
                new_string = new_string.replace(answer, question)

            count += var_count
            variables[paragraph.text] = new_string

        prev_para = paragraph.text

    with open(questions_file, 'w') as f:
        count = 1
        for p in questions:
            f.write("Question "+str(count)+": "+ p +"\n")
            count += 1

    # Rewrite the document in place, preserving run styling.
    docx_replace(doc, variables)
    doc.save(output_file)
    return output_file, questions_file
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
def extract_questions(document_name, output_file):
    """Collect every '{{...}}' question marker from a .docx into a text file.

    Writes the markers (without braces) to `output_file` as a numbered
    'Question N: ...' list and returns the output path.
    """
    doc = Document(document_name)

    found = []
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        found.extend(re.findall(r'\{{(.*?)\}}', text))

    with open(output_file, 'w') as sink:
        for number, question in enumerate(found, start=1):
            sink.write("Question " + str(number) + ": " + question + "\n")

    return output_file
|
| 468 |
+
|
| 469 |
+
# Demo shortcut table for question generation: sample templates map to
# pre-built question documents.
input_output_qg = {"NON-DISCLOSURE-AGREEMENT": "qsns_template_english.docx", "dummy.docx": "output.docx"}
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
def run_generate_questions(document_name, output_file, questions_file, delimiter, source_language):
    """Generate template-filling questions, pivoting through English if needed.

    Returns (questions-annotated document path, questions-list file path).
    """
    doc = docx.Document(document_name)
    first_text = doc.paragraphs[0].text

    # Demo shortcut: recognised sample template -> canned output.
    if first_text in list(input_output_qg.keys()):
        qg_output = input_output_qg[first_text]
        q_output = extract_questions(qg_output, questions_file)
        time.sleep(5)
        return qg_output, q_output

    if source_language == 'english':
        # QG model is English-only but the input already is.
        return run_fill_questions(document_name, output_file, questions_file, delimiter)

    # Translate template to English, generate questions, translate results
    # back to the source language.
    translation_output = translate_fill(
        document_name, "qg_translation.docx", source_language, "english")
    qg_output, q_output = run_fill_questions(
        translation_output, output_file, 'qsns_english.txt', delimiter)
    final_qg = translate_fill(qg_output, output_file, "english", source_language)
    final_q = translate_txt(q_output, questions_file, "english", source_language)
    return final_qg, final_q
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
import docx
|
| 491 |
+
import random
|
| 492 |
+
from docx.shared import RGBColor
|
| 493 |
+
import time
|
| 494 |
+
import re
|
| 495 |
+
|
| 496 |
+
# Demo shortcut table for red-flag identification: sample contracts map to
# pre-highlighted documents.
input_output_red = {"NON-DISCLOSURE-AGREEMENT": "output.docx", "dummy.docx": "dummy_colored.docx"}
|
| 497 |
+
|
| 498 |
+
def run_redflags(filename, output_file):
    """Colour 'red flag' paragraphs red and save a copy of the document.

    NOTE(review): outside the demo shortcut, the selection is a mock — long,
    indented paragraphs are flagged with 50% probability via random(), and
    the sleep simulates processing time. This is placeholder behaviour, not
    a real classifier.
    """
    print("Red flags")
    doc = docx.Document(filename)
    # Demo shortcut: recognised sample contract -> canned highlighted file.
    if doc.paragraphs[0].text in list(input_output_red.keys()):
        return input_output_red[doc.paragraphs[0].text]
    else:
        for para in doc.paragraphs:
            inline = para.runs
            colour = False
            # Heuristic mock: long (>20 words), indented paragraphs, coin flip.
            if (len(para.text.split())>20) and random.random()>0.5 and para.paragraph_format.left_indent!=None:
                colour = True
            if colour:
                for i in range(len(inline)):
                    inline[i].font.color.rgb = RGBColor(255, 000, 000)

        time.sleep(8)
        doc.save(output_file)
        return output_file
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
import docx
|
| 519 |
+
import random
|
| 520 |
+
from docx.shared import RGBColor
|
| 521 |
+
import time
|
| 522 |
+
import re
|
| 523 |
+
from docx import Document
|
| 524 |
+
|
| 525 |
+
from docx.enum.text import WD_COLOR_INDEX
|
| 526 |
+
|
| 527 |
+
from transformers import AutoTokenizer, AutoModel
|
| 528 |
+
import torch
|
| 529 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 530 |
+
import numpy as np
|
| 531 |
+
|
| 532 |
+
|
| 533 |
+
# Sentence-similarity encoder: ai4bharat/indic-bert, used to embed contract
# sentences and user-supplied clauses for semantic clause search.
similar_tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
similar_model = AutoModel.from_pretrained('ai4bharat/indic-bert')
# Inference only — disable dropout etc.
similar_model.eval()
|
| 536 |
+
|
| 537 |
+
def obtain_rep(documents):
    """Embed each text in `documents` with indic-bert.

    Each text is tokenized to a fixed length of 128 (truncated/padded) and
    encoded without gradient tracking; the model's pooler output is used as
    the sentence representation.

    Returns:
        Tensor of shape (len(documents), hidden_dim).
    """
    pooled = []
    with torch.no_grad():
        for text in documents:
            encoded = similar_tokenizer.encode_plus(
                text,
                max_length=128,
                truncation=True,
                padding='max_length',
                return_tensors='pt',
            )
            # Forward only input_ids / attention_mask (token_type_ids are
            # deliberately not passed), re-batched as single-item tensors.
            batch = {
                'input_ids': torch.stack([encoded['input_ids'][0]]),
                'attention_mask': torch.stack([encoded['attention_mask'][0]]),
            }
            pooled.append(similar_model(**batch).pooler_output)

    return torch.stack(pooled).squeeze(1)
|
| 557 |
+
|
| 558 |
+
def similarity(documents, clauses):
    """Per-document maximum cosine similarity against a set of clauses.

    Args:
        documents: torch tensor of shape (n_docs, dim) — sentence embeddings.
        clauses: torch tensor of shape (n_clauses, dim) — clause embeddings.

    Returns:
        numpy array of shape (n_docs,): for each document embedding, the
        highest cosine similarity achieved by any clause embedding.
    """
    clauses = clauses.detach().numpy()
    documents = documents.detach().numpy()
    # Cosine similarity as a normalised dot product, computed with plain
    # numpy (drops the sklearn dependency for this one call). The epsilon
    # guards against division by zero for all-zero embeddings.
    eps = 1e-12
    clause_norm = clauses / np.maximum(
        np.linalg.norm(clauses, axis=1, keepdims=True), eps)
    doc_norm = documents / np.maximum(
        np.linalg.norm(documents, axis=1, keepdims=True), eps)
    sim = clause_norm @ doc_norm.T  # shape (n_clauses, n_docs)
    return np.max(sim, axis=0)
|
| 564 |
+
|
| 565 |
+
def fill_yellow(filename, output_file, highlighted_paras):
    """Highlight in yellow the first paragraph containing each snippet.

    Args:
        filename: source .docx path.
        output_file: destination .docx path.
        highlighted_paras: list of text snippets to locate.

    Returns:
        The output file path.
    """
    doc = docx.Document(filename)
    for snippet in highlighted_paras:
        for para in doc.paragraphs:
            if snippet not in para.text:
                continue
            # Highlight every run of the matching paragraph.
            for run in para.runs:
                run.font.highlight_color = WD_COLOR_INDEX.YELLOW
            # Only the first matching paragraph per snippet is highlighted.
            break
    doc.save(output_file)
    return output_file
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
def get_similar_clauses(filename, output_file, clauses, source_language):
    """Find contract sentences semantically similar to the given clauses.

    Splits the contract into sentences, embeds sentences and clauses with
    indic-bert, scores each sentence by its best cosine similarity to any
    clause, highlights the top ~10% (minimum 3) in yellow, and returns the
    highlighted document path plus the highlighted paragraph texts.
    """
    paras = []
    template_document = Document(filename)
    contract = []
    for paragraph in template_document.paragraphs:
        if(paragraph.text.strip()!=''):
            contract.append(paragraph.text)

    sentence_batch = []

    for paragraph in contract:
        sentence_batch.extend(split_sentences(paragraph, source_language))

    # Drop whitespace-only fragments and very short sentences (<= 5 words).
    sentence_batch = [each for each in sentence_batch if each!=' ' and len(each.split())>5]

    doc_rep = obtain_rep(sentence_batch)
    clause_rep = obtain_rep(clauses)
    k = similarity(doc_rep, clause_rep)
    # Keep the top 10% of sentences, but never fewer than 3.
    pick_top = max(int(0.1*len(sentence_batch)),3)
    ind = k.argsort()[-pick_top:][::-1]
    for each_idx in ind:
        paras.append(sentence_batch[each_idx])

    output_file = fill_yellow(filename, output_file, paras)
    # Re-read what actually got highlighted (paragraph-level, not
    # sentence-level) so the UI preview matches the document.
    highlighted_paras = get_highlighted_clauses(output_file)
    return output_file, highlighted_paras
|
| 607 |
+
|
| 608 |
+
|
| 609 |
+
# Demo shortcut table for similar-clause search: sample contracts map to a
# list of {clauses, file} entries; a query clause matching an entry's
# clause list returns that entry's pre-highlighted file.
input_output_similar = {"NON-DISCLOSURE-AGREEMENT": [{"clauses": ["hi"], "file": "output_similar.docx"}, {"clauses": ["bye", "see you"], "file": "output.docx"}], "dummy.docx": [{"clauses": ["lets see", "whatever"], "file": "dummy_colored.docx"}]}
|
| 610 |
+
def get_highlighted_clauses(filename):
    """Return the text of every paragraph containing a yellow-highlighted run."""
    doc = docx.Document(filename)
    highlighted = []
    for para in doc.paragraphs:
        # A paragraph counts as highlighted if any of its runs is yellow.
        is_yellow = any(
            run.font.highlight_color == WD_COLOR_INDEX.YELLOW
            for run in para.runs
        )
        if is_yellow:
            highlighted.append(para.text)
    return highlighted
|
| 623 |
+
|
| 624 |
+
def run_similar_clause(filename, output_file, clauses, source_language):
    """Highlight contract clauses semantically similar to `clauses`.

    Demo documents with a matching canned clause set return a pre-built
    highlighted file; everything else runs the real similarity pipeline.

    Returns:
        (highlighted document path, list of highlighted paragraph texts)

    BUG FIX: the original looped over the shortcut table with an `else`
    that fired on the FIRST non-matching key, so later keys were never
    checked; and a matching key with no clause overlap fell through and
    implicitly returned None. Now all keys are scanned and any miss falls
    back to the real computation.
    """
    print("similar clause")
    doc = docx.Document(filename)
    first_text = doc.paragraphs[0].text

    # Demo shortcut: scan every known sample document for a clause match.
    for doc_input in list(input_output_similar.keys()):
        if first_text in doc_input:
            for entry in input_output_similar[doc_input]:
                if len(list(set(entry["clauses"]).intersection(set(clauses)))) > 0:
                    output_file = entry["file"]
                    time.sleep(3)
                    highlighted_paras = get_highlighted_clauses(output_file)
                    return output_file, highlighted_paras

    # No canned match: run the real semantic-similarity pipeline.
    return get_similar_clauses(filename, output_file, clauses, source_language)
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
import gradio as gr
|
| 641 |
+
|
| 642 |
+
# ---- Gradio UI state and label strings ------------------------------------
# Canonical (English) service names; `analysis_choices` holds the currently
# displayed (possibly translated) versions.
analysis_services = ['Translate Contract', 'Identify key Clauses', 'Red flag Identification', 'Similar Semantic Clause search', 'Generate Questions for Contract Template']
analysis_label = 'Select Contract Analysis Service'
analysis_choices = analysis_services
# Mutable UI state, updated by the callbacks below.
analysis_choice = ''
lang_choice = 'english'
# Input-widget labels (translated on the fly into `lang_choice`).
translation_label = 'Upload contract for Translation'
translation_src_label = 'Select language of uploaded contract'
translation_tgt_label = 'Select language to translate'
keyclause_label = 'Upload contract for Key Clause Extraction'
redflag_label = 'Upload contract for Red Flag Identification'
similar_label = 'Upload contract for Semantic Similar Clauses'
similar_clause_label = 'Enter clauses to be identified (enter one clause per line)'
generate_questions_label = 'Upload template contract for Question Generation'
delimiter_label = "Input placeholder (pattern or symbol used as blank in template)"
button_label = "Upload and Analyze"


# Output-widget labels.
translation_output_label = 'Download your translated contract'
keyclause_output_label = 'Download your key clauses from the contract'
redflag_output_label = 'Download your contract with red flags highlighted'
similar_file_label = 'Download your contract with highlighted similar clauses in yellow'
similar_text_label = 'A quick view of similar clauses'
qg_output_label = 'Download your template contract along with questions'
q_output_label = 'Download only questions to fill the template contract'
|
| 666 |
+
|
| 667 |
+
def change_analysis(choice):
    """React to a UI-language change.

    Re-translates the service names into the newly selected language and
    returns nine ``gr.update`` payloads for
    [analysis_radio, input_file, input_text, output_file, output_file2,
    output_text, translation_target, translation_source, delimiter]:
    the radio gets the translated choices/label, everything else is hidden.
    """
    global lang_choice, analysis_choices
    lang_choice = choice
    translated = []
    for service in analysis_services:
        translated.append(translate_paragraph(service, "english", choice))
    analysis_choices = translated
    radio_update = gr.update(
        choices=analysis_choices,
        label=translate_paragraph(analysis_label, "english", choice),
    )
    # Hide the eight remaining widgets until a service is picked again.
    hidden_updates = [gr.update(visible=False) for _ in range(8)]
    return [radio_update] + hidden_updates
def change_inputs(choice):
    """React to a service selection: show and relabel only the widgets it needs.

    Returns nine ``gr.update`` payloads for
    [input_file, input_text, output_file, output_file2, output_text,
    translation_target, translation_source, delimiter, button].

    Fix: previously an unmatched ``choice`` (e.g. a stale radio value right
    after a language switch re-translated ``analysis_choices``) fell off the
    end of the if/elif chain and implicitly returned ``None``, crashing the
    Gradio callback that expects nine outputs. Now we fall back to nine no-op
    updates that leave the UI unchanged.
    """
    global analysis_choice
    analysis_choice = choice
    if analysis_choice == analysis_choices[0]:
        # Translate Contract: file upload + source and target language dropdowns.
        return [gr.update(visible=True, label = translate_paragraph(translation_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True, label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_tgt_label, "english",lang_choice)),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False), gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[1]:
        # Key Clause Extraction: file upload + source language only.
        return [gr.update(visible=True, label = translate_paragraph(keyclause_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[2]:
        # Red Flag Identification: file upload + source language only.
        return [gr.update(visible=True, label = translate_paragraph(redflag_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[3]:
        # Similar Clause search: file upload + free-text clauses + source language.
        return [gr.update(visible=True, label = translate_paragraph(similar_label, "english",lang_choice)),gr.update(visible=True, label = translate_paragraph(similar_clause_label, "english",lang_choice)), gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=False),gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    elif analysis_choice == analysis_choices[4]:
        # Question Generation: file upload + placeholder/delimiter textbox + source language.
        return [gr.update(visible=True, label = translate_paragraph(generate_questions_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=True,label=''),gr.update(visible=True,label=''),gr.update(visible=False),gr.update(visible=False),gr.update(visible=True,label = translate_paragraph(translation_src_label, "english",lang_choice)),gr.update(visible=True, label= translate_paragraph(delimiter_label,"english",lang_choice)), gr.update(value= translate_paragraph(button_label, "english",lang_choice),visible=True)]
    # Unknown/stale choice: change nothing rather than returning None.
    return [gr.update() for _ in range(9)]
def process_analysis(document_name, text, source_language, target_language, delimiter):
    """Run the currently selected analysis service on the uploaded contract.

    Args:
        document_name: uploaded contract file (value of the input gr.File).
        text: free-text input; one clause per line for similar-clause search.
        source_language: language of the uploaded contract.
        target_language: language to translate into (translation service only).
        delimiter: placeholder pattern marking blanks in a template contract
            (question-generation service only).

    Returns three ``gr.update`` payloads for
    [output_file, output_file2, output_text].

    Fix: previously an ``analysis_choice`` matching no branch (e.g. stale state
    after a language switch) implicitly returned ``None``, crashing the Gradio
    callback that expects three outputs. Now we fall back to three no-op updates.
    """
    if analysis_choice == analysis_choices[0]:
        # Translation: produce a translated .docx named after the target language.
        translation_output = translate_fill(document_name, "translation_" + target_language + ".docx", source_language , target_language)
        return [gr.update(value = translation_output , visible=True, label = translate_paragraph(translation_output_label, "english", target_language)),gr.update(visible=False),gr.update(visible=False)]
    elif analysis_choice == analysis_choices[1]:
        # Key clause extraction: plain-text report of key clauses.
        info_output = run_key_clause(document_name, "key_clauses.txt",source_language)
        return [gr.update(value = info_output, visible=True, label = translate_paragraph(keyclause_output_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=False)]
    elif analysis_choice == analysis_choices[2]:
        # Red flag identification: contract with red flags highlighted.
        red_flag_output = run_redflags(document_name, "redflag.docx")
        return [gr.update(value = red_flag_output,visible=True, label = translate_paragraph(redflag_output_label, "english",lang_choice)),gr.update(visible=False), gr.update(visible=False)]
    elif analysis_choice == analysis_choices[3]:
        # Similar clause search: one query clause per input line.
        clauses = text.split("\n")
        similar_file, similar_text = run_similar_clause(document_name, "similar.docx", clauses, source_language)
        similar_text = "\n\n\n".join(similar_text)
        return [gr.update(value = similar_file, visible=True, label = translate_paragraph(similar_file_label, "english",lang_choice)), gr.update(visible=False),gr.update(value = similar_text, visible=True, label = translate_paragraph(similar_text_label, "english",lang_choice))]
    elif analysis_choice == analysis_choices[4]:
        # Question generation: annotated template plus a questions-only file.
        qg_output, q_output = run_generate_questions(document_name, "qsns_template.docx", "qsns_only.txt", delimiter, source_language)
        return [gr.update(value = qg_output, visible=True, label = translate_paragraph(qg_output_label, "english",lang_choice)),gr.update(value = q_output, visible=True, label = translate_paragraph(q_output_label, "english",lang_choice)), gr.update(visible=False)]
    # Unknown/stale service selection: change nothing rather than returning None.
    return [gr.update() for _ in range(3)]
with gr.Blocks() as demo:
    # Top-level selectors: UI language, then which analysis service to run.
    lang_radio = gr.Radio(list(lang_dict.keys()), value = 'english', label="Select your language")
    analysis_radio = gr.Radio(analysis_services , label=analysis_label)

    with gr.Row():
        # Contract upload; hidden until a service is chosen (see change_inputs).
        input_file = gr.File(interactive = True, visible = False)
        with gr.Column():
            # Per-service options: language dropdowns for translation,
            # placeholder pattern for question generation.
            translation_source = gr.Dropdown(choices = list(lang_dict.keys()),interactive = True, value = 'english', label=translation_src_label, visible=False)
            translation_target = gr.Dropdown(choices = list(lang_dict.keys()),interactive = True, value = 'english', label=translation_tgt_label, visible=False)
            delimiter = gr.Textbox(label= delimiter_label, lines=1, interactive = True, visible = False)

    # Free-text input (clauses for similar-clause search).
    input_text = gr.Textbox(lines=4, interactive = True, visible = False)

    button = gr.Button(value = button_label , visible = False)
    # Result widgets; which ones become visible depends on the service run.
    output_file = gr.File(interactive = False, visible = False)
    output_file2 = gr.File(interactive = False, visible = False)
    output_text = gr.Textbox(interactive = False, visible = False)

    # Language change re-translates labels and hides everything (9 outputs);
    # service change shows/relabels the widgets that service needs (9 outputs);
    # the button runs the analysis and fills the outputs (3 outputs).
    # NOTE(review): output order here must match the update lists returned by
    # change_analysis / change_inputs / process_analysis.
    lang_radio.change(fn=change_analysis, inputs=lang_radio, outputs=[analysis_radio,input_file, input_text, output_file,output_file2, output_text,translation_target,translation_source, delimiter])
    analysis_radio.change(fn=change_inputs, inputs=analysis_radio, outputs=[input_file, input_text, output_file, output_file2, output_text,translation_target, translation_source, delimiter, button])
    button.click( process_analysis, [input_file,input_text, translation_source, translation_target, delimiter], [output_file, output_file2, output_text])

demo.launch(debug=True)