Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Commit 
							
							·
						
						9991ce4
	
1
								Parent(s):
							
							f74b624
								
Create demo123
Browse files
    	
        demo123
    ADDED
    
    | @@ -0,0 +1,173 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Install Streamlit, pyngrok, and the other libraries the app depends on
         | 
| 2 | 
            +
            !pip install -q streamlit
         | 
| 3 | 
            +
            !pip install -q pyngrok
         | 
| 4 | 
            +
            !pip install -q pdfplumber
         | 
| 5 | 
            +
            !pip install -q transformers
         | 
| 6 | 
            +
            !pip install -q tabula-py
         | 
| 7 | 
            +
            !pip install -q openai
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            # Write the Streamlit app script
         | 
| 11 | 
            +
            # Write the Streamlit app script
         | 
| 12 | 
            +
            %%writefile app.py
         | 
| 13 | 
            +
            import streamlit as st
         | 
| 14 | 
            +
            import pdfplumber
         | 
| 15 | 
            +
            import torch
         | 
| 16 | 
            +
            from transformers import RobertaTokenizer, RobertaModel
         | 
| 17 | 
            +
            import nltk
         | 
| 18 | 
            +
            import openai
         | 
| 19 | 
            +
            from torch import nn
         | 
| 20 | 
            +
            import torch.nn.functional as F
         | 
| 21 | 
            +
            from nltk.tokenize import sent_tokenize
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            # Download the NLTK sentence-tokenizer data used by sent_tokenize.
            nltk.download('punkt')
            # Newer NLTK releases look up 'punkt_tab' instead of 'punkt'; fetch both so
            # sentence splitting works regardless of the installed NLTK version.
            nltk.download('punkt_tab')

            # Read the OpenAI key from the environment instead of committing a secret to
            # the repository. Export OPENAI_API_KEY before launching the app; any key that
            # was previously committed here should be treated as leaked and revoked.
            import os  # local import so this block does not depend on the file's import list

            openai.api_key = os.environ.get("OPENAI_API_KEY")
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            # Define your model architecture
         | 
| 29 | 
            +
            class Bert_model(nn.Module):
                """RoBERTa encoder followed by a small two-class classification head.

                The head takes the encoder's last-layer hidden state at sequence
                position 0, applies a linear projection, dropout, and a final linear
                layer with 2 outputs.

                Args:
                    hidden_size: Width of the encoder output and of the projection layer.
                    dropout_rate: Dropout probability applied after the projection.
                """

                def __init__(self, hidden_size, dropout_rate):
                    super().__init__()
                    self.hidden_size = hidden_size

                    # Pretrained encoder weights are fetched by name.
                    self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')

                    # Classification head: projection -> dropout -> 2-way output.
                    self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
                    self.cls_dropout = nn.Dropout(dropout_rate)
                    self.cls_final = nn.Linear(hidden_size, 2, bias=True)

                def forward(self, input_ids, attention_mask):
                    """Return the 2-way logits for each sequence in the batch."""
                    encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
                    # Hidden state of the first token of every sequence.
                    first_token_state = encoder_out.last_hidden_state[:, 0, :]
                    projected = self.cls_dropout(self.cls_prj(first_token_state))
                    return self.cls_final(projected)
         | 
| 46 | 
            +
             | 
| 47 | 
            +
            # Load the fine-tuned checkpoint and place the model on the best available device.
            model_path = "/content/model.pt"  # Replace with your actual model path

            # Use the GPU when one is present; otherwise fall back to CPU instead of
            # failing later in model.to("cuda").
            device = "cuda" if torch.cuda.is_available() else "cpu"

            # map_location lets a checkpoint that was saved from GPU tensors be
            # deserialised on a CPU-only host.
            state_dict = torch.load(model_path, map_location=device)

            # Instantiate the model architecture
            model = Bert_model(hidden_size=768, dropout_rate=0.1)  # Adjust the hidden size to match the saved model
            # The weights are loaded into the DataParallel wrapper, so the checkpoint is
            # presumably keyed with the wrapper's "module." prefix - confirm against the
            # training script if load_state_dict reports missing keys.
            model = nn.DataParallel(model)
            model.load_state_dict(state_dict)
            model = model.to(device)
            model.eval()  # inference only: disables dropout

            # Load the tokenizer
            tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')
         | 
| 61 | 
            +
             | 
| 62 | 
            +
            # Function to preprocess PDF text
         | 
| 63 | 
            +
            def preprocess_pdf(pdf_path, tokenizer, skip_pages=2, max_length=512):
                """Extract the text of a PDF and tokenize it into fixed-length tensors.

                Args:
                    pdf_path: Path of the PDF file to read.
                    tokenizer: Tokenizer exposing ``encode_plus``.
                    skip_pages: Number of leading pages to ignore (default 2, matching
                        the previous hard-coded behaviour).
                    max_length: Length the token sequence is padded and truncated to.

                Returns:
                    A tuple ``(input_ids, attention_mask, text)``: two tensors with a
                    leading batch dimension of 1, and the extracted text joined with
                    single spaces.
                """
                with pdfplumber.open(pdf_path) as pdf:
                    # extract_text() may yield None for a page without a text layer;
                    # substitute "" so the join below cannot raise TypeError.
                    page_texts = [page.extract_text() or "" for page in pdf.pages[skip_pages:]]

                text = " ".join(page_texts)

                tokenized_text = tokenizer.encode_plus(
                    text,
                    add_special_tokens=True,
                    max_length=max_length,
                    padding='max_length',
                    # Without truncation, max_length only pads: a long document would
                    # produce a sequence longer than max_length.
                    truncation=True,
                    return_attention_mask=True,
                )
                input_ids = torch.tensor([tokenized_text['input_ids']])
                attention_mask = torch.tensor([tokenized_text['attention_mask']])
                return input_ids, attention_mask, text
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            def translate_text(text, target_language):
                """Translate English ``text`` into ``target_language`` via the OpenAI chat API.

                Sends one system message and one user message to the ``gpt-4`` model
                and returns the content of the first choice.
                """
                system_message = {
                    "role": "system",
                    "content": "You are a helpful assistant that translates English text to other languages.",
                }
                user_message = {
                    "role": "user",
                    "content": f'Translate the following English text to {target_language}: "{text}"',
                }
                completion = openai.ChatCompletion.create(
                    model="gpt-4",
                    messages=[system_message, user_message],
                )
                first_choice = completion.choices[0]
                return first_choice.message['content']
         | 
| 86 | 
            +
             | 
| 87 | 
            +
            def explain_term(term):
                """Ask the OpenAI chat API (``gpt-4``) to explain ``term``.

                Returns the content of the first choice in the response.
                """
                conversation = [
                    {"role": "system", "content": "You are a helpful assistant that provides definitions."},
                    {"role": "user", "content": f"Explain the term: {term}"},
                ]
                completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
                first_choice = completion['choices'][0]
                return first_choice['message']['content']
         | 
| 102 | 
            +
             | 
| 103 | 
            +
            # Page header and PDF upload widget
            st.title('FinQA (Financial Question-Answering)')
            uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

            # Language the answers and definitions are shown in; anything other than
            # 'English' is routed through translate_text further down.
            language_choices = [
                'English',
                'French',
                'Chinese',
                'Korean',
                'Spanish',
                'German',
                'Japanese',
            ]
            language = st.selectbox('Select your language', language_choices)
         | 
| 109 | 
            +
             | 
| 110 | 
            +
            if uploaded_file is not None:
                # Persist the upload to disk because preprocess_pdf takes a file path.
                with open("temp.pdf", "wb") as f:
                    f.write(uploaded_file.getbuffer())

                # Only the extracted text is used below; the whole-document tensors are
                # discarded instead of being shadowed inside the scoring loop.
                _, _, text = preprocess_pdf("temp.pdf", tokenizer)
                st.write('File successfully uploaded and processed')

                # Ask a question
                question = st.text_input("Enter your question:")

                if question:
                    # Number of highest-scoring sentences forwarded to the chat model.
                    max_context_sentences = 13

                    # Score every sentence of the document against the question.
                    scored_sentences = []
                    for sentence in sent_tokenize(text):
                        inputs = tokenizer.encode_plus(
                            question,
                            sentence,
                            return_tensors='pt',
                            padding='max_length',
                            truncation=True,
                            max_length=512,
                        )
                        input_ids = inputs['input_ids'].to(device)
                        attention_mask = inputs['attention_mask'].to(device)

                        with torch.no_grad():
                            outputs = model(input_ids, attention_mask)
                            probabilities = F.softmax(outputs, dim=1)
                            _, max_index = torch.max(probabilities, dim=1)

                        scored_sentences.append(
                            (sentence, max_index.item(), probabilities[0].tolist())
                        )

                    # Rank by the probability of class 1. The previous key was the 0/1
                    # predicted label alone, which cannot order sentences within a class.
                    # Sentences predicted as class 1 still come first, because their
                    # class-1 probability is the larger of the two.
                    scored_sentences.sort(key=lambda item: item[2][1], reverse=True)
                    top_sentences = scored_sentences[:max_context_sentences]

                    # Context handed to the chat model: the selected sentences, one per line.
                    chat_history = "\n".join(item[0] for item in top_sentences)

                    # openai.api_key is assigned once at module start-up; it is
                    # deliberately not re-assigned here with a literal secret.
                    response = openai.ChatCompletion.create(
                        model="gpt-4",
                        messages=[
                            {"role": "system", "content": "You are a helpful generator which read the short paragraphs and answer the question."},
                            {"role": "user", "content": chat_history},
                            {"role": "user", "content": question},
                        ],
                    )

                    response_content = response.choices[0].message['content']
                    if language != 'English':
                        response_content = translate_text(response_content, language)

                    st.text("Answer: " + response_content)
         | 
| 163 | 
            +
             | 
| 164 | 
            +
            # Glossary lookup: explain a user-supplied term, translated when needed.
            term = st.text_input("Enter a term you want to define:")

            if term:
                english_definition = explain_term(term)
                # The explanation comes back from explain_term untranslated; translate it
                # only when a non-English language is selected.
                if language == 'English':
                    definition = english_definition
                else:
                    definition = translate_text(english_definition, language)

                st.text("Definition: " + definition)
         |