Jforeverss committed on
Commit
9991ce4
·
1 Parent(s): f74b624

Create demo123

Browse files
Files changed (1) hide show
  1. demo123 +173 -0
demo123 ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Install Streamlit and pyngrok
2
+ !pip install -q streamlit
3
+ !pip install -q pyngrok
4
+ !pip install -q pdfplumber
5
+ !pip install -q transformers
6
+ !pip install -q tabula-py
7
+ !pip install -q openai
8
+
9
+
10
+ # Write the Streamlit app script
11
+ # Write the Streamlit app script
12
+ %%writefile app.py
13
+ import streamlit as st
14
+ import pdfplumber
15
+ import torch
16
+ from transformers import RobertaTokenizer, RobertaModel
17
+ import nltk
18
+ import openai
19
+ from torch import nn
20
+ import torch.nn.functional as F
21
+ from nltk.tokenize import sent_tokenize
22
+
23
import os

# Download the 'punkt' package (sentence-tokenizer data used by sent_tokenize)
nltk.download('punkt')

# Secrets must not live in source control: take the OpenAI key from the
# OPENAI_API_KEY environment variable instead of a string literal.
# NOTE(review): the key that used to be hard-coded here has been published
# with this commit and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Fail early with a readable message rather than on the first API call.
    st.error("The OPENAI_API_KEY environment variable is not set.")
    st.stop()
27
+
28
+ # Define your model architecture
29
class Bert_model(nn.Module):
    """RoBERTa encoder followed by a small two-way classification head.

    The encoder is loaded from the 'deepset/roberta-base-squad2' checkpoint.
    The hidden state at sequence position 0 goes through a linear projection,
    dropout, and a final linear layer that emits 2 logits.

    Args:
        hidden_size: Input and output width of the projection layer, and
            input width of the final layer.
        dropout_rate: Dropout probability applied after the projection.
    """

    def __init__(self, hidden_size, dropout_rate):
        super().__init__()
        self.hidden_size = hidden_size
        # Submodule names and creation order are left untouched so that the
        # parameter names in the module's state_dict stay the same.
        self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')
        self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, input_ids, attention_mask):
        """Return the 2-way logits for each sequence in the batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 as the sequence summary.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        projected = self.cls_dropout(self.cls_prj(first_token_state))
        return self.cls_final(projected)
46
+
47
# Load the model
model_path = "/content/model.pt"  # Replace with your actual model path

# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from GPU tensors can still be loaded on a CPU-only host.
state_dict = torch.load(model_path, map_location=device)

# Instantiate the model architecture
model = Bert_model(hidden_size=768, dropout_rate=0.1)  # Adjust the hidden size to match the saved model
# NOTE(review): the wrapper is applied before load_state_dict, which presumably
# means the checkpoint keys carry DataParallel's "module." prefix - confirm.
model = nn.DataParallel(model)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()  # inference only: disables dropout

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')
61
+
62
+ # Function to preprocess PDF text
63
def preprocess_pdf(pdf_path, tokenizer, skip_pages=2):
    """Extract the text of a PDF and tokenize it.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.
        tokenizer: Tokenizer object providing ``encode_plus``.
        skip_pages: Number of leading pages to ignore. Defaults to 2, the
            value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, attention_mask, text)``: the two token tensors,
        each with a leading batch dimension of 1, and the extracted text with
        pages joined by single spaces.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for a page without a text layer;
        # substitute an empty string so the join below cannot raise.
        page_texts = [page.extract_text() or "" for page in pdf.pages[skip_pages:]]
    text = " ".join(page_texts)

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        # Without truncation, max_length only pads: a long document would
        # produce more than 512 tokens.
        truncation=True,
        return_attention_mask=True,
    )
    input_ids = torch.tensor([encoded['input_ids']])
    attention_mask = torch.tensor([encoded['attention_mask']])
    return input_ids, attention_mask, text
76
+
77
def translate_text(text, target_language):
    """Translate English ``text`` into ``target_language`` with the GPT-4 chat model.

    Args:
        text: English text to translate.
        target_language: Name of the language to translate into.

    Returns:
        The message content of the first choice in the completion response.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that translates English text to other languages.",
    }
    user_message = {
        "role": "user",
        "content": f'Translate the following English text to {target_language}: "{text}"',
    }
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message['content']
86
+
87
def explain_term(term):
    """Ask the GPT-4 chat model to explain ``term``.

    Args:
        term: The term to be explained.

    Returns:
        The message content of the first choice in the completion response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that provides definitions."},
        {"role": "user", "content": f"Explain the term: {term}"},
    ]
    reply = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
    return reply['choices'][0]['message']['content']
102
+
103
# Streamlit code to upload file
st.title('FinQA (Financial Question-Answering)')
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

# Select language
language = st.selectbox('Select your language', ['English', 'French','Chinese','Korean','Spanish','German','Japanese'])

# Number of highest-ranked sentences handed to GPT-4 as context.
TOP_K_SENTENCES = 13

if uploaded_file is not None:
    import os
    import tempfile

    # Write the upload to a unique temporary file: a fixed "temp.pdf" would be
    # shared, and overwritten, by every concurrent session of the app.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        tmp_pdf.write(uploaded_file.getbuffer())
    try:
        input_ids, attention_mask, text = preprocess_pdf(tmp_pdf.name, tokenizer)
    finally:
        os.remove(tmp_pdf.name)
    st.write('File successfully uploaded and processed')

    # Ask a question
    question = st.text_input("Enter your question:")

    if question:
        predictions = []

        # Score every sentence of the document against the question.
        for sentence in sent_tokenize(text):
            inputs = tokenizer.encode_plus(question, sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask)
            probabilities = F.softmax(outputs, dim=1)
            _, max_index = torch.max(probabilities, dim=1)
            predictions.append((sentence, max_index.item(), probabilities[0].tolist()))

        if not predictions:
            # Nothing to rank, e.g. a scanned PDF without a text layer.
            st.warning("No text could be extracted from the uploaded PDF.")
        else:
            # Rank by the probability of class 1 rather than by the 0/1 label:
            # sorting on the label alone leaves all positives tied, so the
            # cut-off would depend on document order instead of on confidence.
            # NOTE(review): assumes class 1 means "relevant", as the previous
            # descending sort on the predicted label implied - confirm.
            predictions.sort(key=lambda item: item[2][1], reverse=True)
            top_sentences = predictions[:TOP_K_SENTENCES]

            # Prepare the chat history with the top-ranked sentences
            chat_history = "\n".join([item[0] for item in top_sentences])

            # Ask the question using OpenAI API (openai.api_key is configured
            # once near the top of the script; it is not repeated here).
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful generator which read the short paragraphs and answer the question."},
                    {"role": "user", "content": chat_history},
                    {"role": "user", "content": question},
                ]
            )

            response_content = response.choices[0].message['content']
            if language != 'English':
                response_content = translate_text(response_content, language)

            st.text("Answer: " + response_content)

# NOTE(review): indentation was lost in this copy of the file. The term lookup
# does not read anything from the uploaded PDF, so it is placed at the top
# level here - confirm against the original nesting.
term = st.text_input("Enter a term you want to define:")

if term:
    # Define the term using OpenAI API
    definition = explain_term(term)

    if language != 'English':
        definition = translate_text(definition, language)

    st.text("Definition: " + definition)