File size: 5,930 Bytes
9991ce4
 
 
 
 
 
 
 
 
065e758
 
 
9991ce4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91fbcd9
9991ce4
91fbcd9
9991ce4
91fbcd9
9991ce4
 
 
 
 
 
 
91fbcd9
9991ce4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5cdc15c
9991ce4
 
 
 
 
 
 
 
 
5cdc15c
9991ce4
 
 
 
 
 
 
 
 
 
 
 
 
91fbcd9
8375cd1
812b735
 
 
 
 
 
 
 
 
 
 
 
 
8375cd1
812b735
8375cd1
812b735
 
 
 
9991ce4
3bb25d3
52d60dc
 
54d1786
 
9991ce4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5cdc15c
9991ce4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e13ec1e
9991ce4
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import streamlit as st
import pdfplumber
import torch
from transformers import RobertaTokenizer, RobertaModel
import nltk
import openai
from torch import nn
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize
import os

# Debug aid: dump the contents of the working directory to stdout at startup.
working_dir_entries = os.listdir('.')
print(working_dir_entries)

# Download NLTK's 'punkt' resource (sent_tokenize is used further down).
nltk.download('punkt')

class Bert_model(nn.Module):
    """RoBERTa encoder followed by a small head that emits 2-way logits.

    The encoder is initialised from the 'deepset/roberta-base-squad2'
    checkpoint. The head is a linear projection, dropout, and a final
    linear layer with two outputs.

    Args:
        hidden_size: Input and output width of the projection layer; the
            call site in this file uses 768.
        dropout_rate: Dropout probability applied after the projection.
    """

    def __init__(self, hidden_size, dropout_rate):
        super().__init__()
        self.hidden_size = hidden_size
        # Attribute names double as state_dict keys, so they must not change.
        self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')
        self.cls_prj = nn.Linear(hidden_size, hidden_size)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the first token's hidden state."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        projected = self.cls_dropout(self.cls_prj(first_token_state))
        return self.cls_final(projected)

# Load the fine-tuned classifier weights and the matching tokenizer once at
# module import.
model_path = "model.pt"

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the app still starts on CPU-only hosts instead of failing on "cuda:0".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# map_location remaps tensors that were saved from a GPU onto `device`, which
# is what allows the checkpoint to be read on a machine without CUDA.
state_dict = torch.load(model_path, map_location=device)

model = Bert_model(hidden_size=768, dropout_rate=0.1)
# The model is wrapped before the weights are loaded, as in the original
# code — presumably the checkpoint was saved from a DataParallel model and
# its keys carry the "module." prefix (TODO confirm against model.pt).
model = nn.DataParallel(model)
model.load_state_dict(state_dict)
model = model.to(device)
# Inference only: eval() disables dropout.
model.eval()

tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')


def preprocess_pdf(pdf_path, tokenizer, skip_pages=2):
    """Extract the text of a PDF and tokenize it into fixed-length tensors.

    Args:
        pdf_path: Location of the PDF; handed unchanged to
            ``pdfplumber.open``.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        skip_pages: Number of leading pages to ignore. Defaults to 2, which
            matches the previously hard-coded ``pdf.pages[2:]``.

    Returns:
        A tuple ``(input_ids, attention_mask, text)``: two tensors with a
        leading batch dimension of 1, and the space-joined page text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page without extractable text
        # (depends on the pdfplumber version); substitute "" so the join
        # below cannot raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages[skip_pages:]]

    text = " ".join(page_texts)

    # truncation=True is required for max_length to actually cap the
    # sequence; padding='max_length' on its own only pads shorter inputs.
    tokenized_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids = torch.tensor([tokenized_text['input_ids']])
    attention_mask = torch.tensor([tokenized_text['attention_mask']])
    return input_ids, attention_mask, text

def translate_text(text, target_language):
    """Ask the chat model to translate English ``text`` into ``target_language``.

    Sends one system message and one user message to
    ``openai.ChatCompletion.create`` and returns the content of the first
    choice in the response.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that translates English text to other languages.",
    }
    user_message = {
        "role": "user",
        "content": f'Translate the following English text to {target_language}: "{text}"',
    }
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message['content']

def explain_term(term):
    """Ask the chat model to explain ``term`` and return its reply text.

    Sends one system message and one user message to
    ``openai.ChatCompletion.create`` and returns the content of the first
    choice in the response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that provides definitions."},
        {"role": "user", "content": f"Explain the term: {term}"},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=conversation,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

st.title('FinChat')

# The OpenAI key is read from Streamlit secrets. (An earlier, now disabled,
# variant asked the user to type a key and validated it with a test chat
# request, reporting the outcome via st.success / st.error / st.warning.)
api_key = st.secrets["api_key"]
openai.api_key = api_key

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

# Languages offered for the answer and definition; anything other than
# 'English' is routed through translate_text further down.
SUPPORTED_LANGUAGES = ['English', 'French', 'Chinese', 'Korean', 'Spanish', 'German', 'Japanese']
language = st.selectbox('Select your language', SUPPORTED_LANGUAGES)

# Question answering over the uploaded PDF: score every sentence against the
# question with the classifier, keep the best-scoring ones as context, and
# let the chat model compose the answer.
if uploaded_file is not None:
    # pdfplumber.open() accepts file-like objects as well as paths, so the
    # upload is parsed straight from memory. Writing it to a fixed
    # "temp.pdf" first would let concurrent sessions overwrite each other.
    # The tensors returned alongside the text are not needed here.
    _, _, text = preprocess_pdf(uploaded_file, tokenizer)
    st.write('File successfully uploaded and processed')

    question = st.text_input("Enter your question:")

    if question:
        # Number of highest-scoring sentences forwarded to the chat model.
        TOP_K_SENTENCES = 13

        sentences = sent_tokenize(text)
        predictions = []

        for sentence in sentences:
            # Each (question, sentence) pair is its own batch of one, so
            # padding to 512 tokens would only add masked positions for the
            # encoder to process; only truncation is requested.
            inputs = tokenizer.encode_plus(
                question,
                sentence,
                return_tensors='pt',
                truncation=True,
                max_length=512,
            )
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask)
                probabilities = F.softmax(outputs, dim=1)
                prediction = torch.argmax(probabilities, dim=1).item()
                predictions.append((sentence, prediction, probabilities[0].tolist()))

        # Rank by the class-1 probability. Sorting on the 0/1 label alone
        # left every class-1 sentence tied, so the "top" sentences were just
        # the first ones in document order. Class-1 sentences still precede
        # class-0 ones, since their class-1 probability is the larger of two.
        predictions.sort(key=lambda item: item[2][1], reverse=True)
        top_sentences = predictions[:TOP_K_SENTENCES]

        if not top_sentences:
            # No sentences means no context to answer from; say so rather
            # than sending an empty context to the chat model.
            st.warning("No text could be extracted from the uploaded PDF.")
        else:
            chat_history = "\n".join(item[0] for item in top_sentences)

            response = openai.ChatCompletion.create(
                model="gpt-4-1106-preview",
                messages=[
                    {"role": "system", "content": "You are a helpful generator which read the short paragraphs and answer the question."},
                    {"role": "user", "content": chat_history},
                    {"role": "user", "content": question},
                ]
            )
            answer = response.choices[0].message['content']

            # The answer is produced in English and translated on request.
            if language != 'English':
                response_content = translate_text(answer, language)
            else:
                response_content = answer

            st.text("Answer: " + response_content)

# Glossary lookup: explain a user-supplied term, translating the result when
# a language other than English is selected.
term = st.text_input("Enter a term you want to define:")

if term:
    explanation = explain_term(term)
    definition = explanation if language == 'English' else translate_text(explanation, language)
    st.text("Definition: " + definition)