Spaces:
Runtime error
Runtime error
File size: 5,930 Bytes
9991ce4 065e758 9991ce4 91fbcd9 9991ce4 91fbcd9 9991ce4 91fbcd9 9991ce4 91fbcd9 9991ce4 5cdc15c 9991ce4 5cdc15c 9991ce4 91fbcd9 8375cd1 812b735 8375cd1 812b735 8375cd1 812b735 9991ce4 3bb25d3 52d60dc 54d1786 9991ce4 5cdc15c 9991ce4 e13ec1e 9991ce4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
import streamlit as st
import pdfplumber
import torch
from transformers import RobertaTokenizer, RobertaModel
import nltk
import openai
from torch import nn
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize
import os
# Startup diagnostic: print the entries of the working directory.
print(os.listdir(os.curdir))

# Download the NLTK "punkt" resource at startup.
nltk.download('punkt')
class Bert_model(nn.Module):
    """RoBERTa encoder followed by a small head that emits two logits.

    The submodule attribute names (``bert``, ``cls_prj``, ``cls_dropout``,
    ``cls_final``) determine the parameter names in the module's state dict,
    so they must stay in sync with any checkpoint loaded into this class.
    """

    def __init__(self, hidden_size, dropout_rate):
        """Build the encoder and the classification head.

        Args:
            hidden_size: Input and output width of the projection layer and
                input width of the final classifier.
            dropout_rate: Dropout probability applied after the projection.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')
        # Head: linear projection -> dropout -> 2-way linear classifier.
        self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for the given token ids and mask.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            The output of ``cls_final``; its last dimension has size 2.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled summary.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        projected = self.cls_dropout(self.cls_prj(first_token_state))
        return self.cls_final(projected)
# Checkpoint file with the classifier weights, resolved relative to the
# working directory.
model_path = "model.pt"

# Use the first GPU when one is present; otherwise fall back to the CPU so the
# app can also start on CPU-only hosts instead of failing at import time.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# map_location remaps tensors that were saved from a CUDA device; without it
# torch.load raises on machines where CUDA is unavailable.
# NOTE(review): torch.load unpickles the file, so model.pt must be a trusted
# artifact shipped with the app.
state_dict = torch.load(model_path, map_location=device)

model = Bert_model(hidden_size=768, dropout_rate=0.1)
# The model is wrapped in DataParallel *before* the weights are loaded, so the
# expected state-dict keys carry the "module." prefix.
# NOTE(review): presumably the checkpoint was saved from a DataParallel model;
# confirm before removing this wrapper.
model = nn.DataParallel(model)
model.load_state_dict(state_dict)
model = model.to(device)
# Inference only: switch off dropout.
model.eval()

tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')
def preprocess_pdf(pdf_path, tokenizer, skip_pages=2):
    """Extract a PDF's text and tokenize it into fixed-length model inputs.

    Args:
        pdf_path: Location of the PDF, handed to ``pdfplumber.open``.
        tokenizer: Object exposing ``encode_plus`` (the file uses a
            ``RobertaTokenizer``).
        skip_pages: Number of leading pages to leave out of the extracted
            text. Defaults to 2, matching the previous hard-coded behaviour.

    Returns:
        A tuple ``(input_ids, attention_mask, text)`` where the first two are
        tensors with a leading batch dimension of 1 and ``text`` is the
        space-joined text of the remaining pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without extractable text;
        # substitute an empty string so the join below cannot raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages[skip_pages:]]
    text = " ".join(page_texts)

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        # Without explicit truncation, documents longer than max_length would
        # produce over-long sequences despite the max_length setting.
        truncation=True,
        return_attention_mask=True,
    )
    input_ids = torch.tensor([encoded['input_ids']])
    attention_mask = torch.tensor([encoded['attention_mask']])
    return input_ids, attention_mask, text
def translate_text(text, target_language):
    """Ask the OpenAI chat model to translate English text.

    Args:
        text: English text to translate; it is embedded in the prompt inside
            double quotes.
        target_language: Name of the language to translate into.

    Returns:
        The message content of the first choice in the API response.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that translates English text to other languages.",
    }
    user_message = {
        "role": "user",
        "content": f'Translate the following English text to {target_language}: "{text}"',
    }
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message['content']
def explain_term(term):
    """Ask the OpenAI chat model to explain a term.

    Args:
        term: The term to explain; it is appended to the user prompt.

    Returns:
        The message content of the first choice in the API response.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant that provides definitions."},
        {"role": "user", "content": f"Explain the term: {term}"},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=conversation,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']
st.title('FinChat')

# The OpenAI key is read from the app's Streamlit secrets. (An earlier
# revision asked the user to type a key and validated it with a test chat
# request; that flow is no longer used.)
api_key = openai.api_key = st.secrets["api_key"]

# Widgets for the PDF upload and the language of the displayed answers.
uploaded_file = st.file_uploader(label="Choose a PDF file", type="pdf")
language = st.selectbox(
    label='Select your language',
    options=['English', 'French', 'Chinese', 'Korean', 'Spanish', 'German', 'Japanese'],
)
# Number of top-ranked sentences forwarded to the chat model as context.
MAX_CONTEXT_SENTENCES = 13

if uploaded_file is not None:
    # Write the upload to disk so it can be opened by path.
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())
    # Only the extracted text is needed here: every sentence is re-tokenized
    # together with the question below, so the whole-document tensors are
    # discarded.
    _, _, text = preprocess_pdf("temp.pdf", tokenizer)
    st.write('File successfully uploaded and processed')

    question = st.text_input("Enter your question:")
    if question:
        sentences = sent_tokenize(text)
        if not sentences:
            # Nothing to rank; avoid sending an empty context to the chat model.
            st.warning("No text could be extracted from the PDF, so the question cannot be answered.")
        else:
            # Score each (question, sentence) pair with the classifier.
            predictions = []
            with torch.no_grad():
                for sentence in sentences:
                    inputs = tokenizer.encode_plus(
                        question,
                        sentence,
                        return_tensors='pt',
                        padding='max_length',
                        truncation=True,
                        max_length=512,
                    )
                    outputs = model(
                        inputs['input_ids'].to(device),
                        inputs['attention_mask'].to(device),
                    )
                    probabilities = F.softmax(outputs, dim=1)
                    _, max_index = torch.max(probabilities, dim=1)
                    predictions.append((sentence, max_index.item(), probabilities[0].tolist()))

            # Rank by predicted label first (label 1 ahead of label 0, as
            # before) and break ties by the probability of class 1, so the
            # kept sentences are the highest-scoring ones rather than simply
            # the first positives in document order.
            predictions.sort(key=lambda item: (item[1], item[2][1]), reverse=True)
            top_sentences = predictions[:MAX_CONTEXT_SENTENCES]
            chat_history = "\n".join(item[0] for item in top_sentences)

            response = openai.ChatCompletion.create(
                model="gpt-4-1106-preview",
                messages=[
                    {"role": "system", "content": "You are a helpful generator which read the short paragraphs and answer the question."},
                    {"role": "user", "content": chat_history},
                    {"role": "user", "content": question},
                ]
            )
            response_content = response.choices[0].message['content']
            if language != 'English':
                response_content = translate_text(response_content, language)
            st.text("Answer: " + response_content)
# Term lookup: explain a user-supplied term, translated when a non-English
# language is selected.
# NOTE(review): the original indentation of this section was lost; it is kept
# at top level because it does not use the uploaded PDF. Confirm whether it
# was meant to appear only after a file has been uploaded.
term = st.text_input("Enter a term you want to define:")
if term:
    explanation = explain_term(term)
    definition = explanation if language == 'English' else translate_text(explanation, language)
    st.text("Definition: " + definition)