Update app.py

app.py CHANGED
@@ -3,29 +3,25 @@ import os
 import requests
 from langdetect import detect
 from PyPDF2 import PdfReader
-
-
-
-
-
-
-
-
-
-
-
-
-
-        "temperature": temperature,
-        "top_k": top_k,
-    },
+import replicate  # For interacting with Llama models hosted on Replicate
+
+# Load the Replicate API token from environment variables
+replicate_api_token = os.environ.get("Key2")  # Replace with your Replicate API token
+
+# Function to query the Llama 3.2 7B Instruct model via Replicate
+def query_llama_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
+    model_name = "meta/llama-3-7b-instruct"  # Replace with the correct model name on Replicate
+    input_data = {
+        "prompt": prompt,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_k": top_k,
     }
-    response =
-
-
-
-
-    return None
+    response = replicate.run(
+        model_name,
+        input=input_data
+    )
+    return "".join(response)  # Replicate returns a generator, so we join it into a single string
 
 # Function to detect language
 def detect_language(text):
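Note on the new Replicate wiring: `replicate.run` authenticates via the `REPLICATE_API_TOKEN` environment variable, while the code above stores the `Key2` secret in `replicate_api_token` without passing it to the client, so the token may never reach Replicate unless that variable is also set. A minimal sketch of passing it explicitly, assuming the `replicate` Python client's `Client` class; the model slug is an assumption (the diff's own comment says to replace the name), and the input field names follow the diff and should be checked against the model's schema on Replicate:

import os
import replicate

# Sketch only: hand the token from the "Key2" secret to the client explicitly
# instead of relying on the REPLICATE_API_TOKEN environment variable.
client = replicate.Client(api_token=os.environ.get("Key2"))

def query_llama_model(prompt, max_new_tokens=1000, temperature=0.7, top_k=50):
    # Language models on Replicate stream their output, so run() returns an
    # iterator of text chunks that must be joined into one string.
    output = client.run(
        "meta/meta-llama-3-8b-instruct",  # assumed slug; verify before use
        input={
            "prompt": prompt,
            "max_new_tokens": max_new_tokens,  # field names taken from the diff
            "temperature": temperature,
            "top_k": top_k,
        },
    )
    return "".join(output)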
@@ -48,6 +44,20 @@ def extract_text_from_pdf(pdf_file):
         })
     return text_data
 
+# Function to search for query in PDF content
+def search_pdf_content(pdf_text_data, query):
+    results = []
+    for entry in pdf_text_data:
+        if query.lower() in entry["content"].lower():
+            results.append(entry)
+    return results
+
+# Function to split text into chunks
+def split_text_into_chunks(text, chunk_size=500):
+    words = text.split()
+    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    return chunks
+
 # Default system prompts for each query translation method
 DEFAULT_SYSTEM_PROMPTS = {
     "Multi-Query": """You are an AI language model assistant. Your task is to generate five
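For reference, the behavior of the two helpers added in this hunk, shown on invented data (illustrative only; assumes the definitions above are in scope):

# split_text_into_chunks groups whole words, so the final chunk may be
# shorter than chunk_size.
print(split_text_into_chunks("one two three four five six seven", chunk_size=3))
# ['one two three', 'four five six', 'seven']

# search_pdf_content is a case-insensitive substring match per line, so a
# multi-word query only matches lines containing the exact phrase.
pages = [{"page": 1, "line": 4, "content": "Llama models on Replicate"}]
print(search_pdf_content(pages, "llama"))    # [{'page': 1, ...}]  (match)
print(search_pdf_content(pages, "llama 3"))  # []  (phrase not on the line)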
@@ -120,33 +130,65 @@ def main():
         formatted_prompt = system_prompt.format(question=prompt)
         st.write("**Formatted System Prompt:**", formatted_prompt)
 
-        # Query the
-        translated_queries =
+        # Query the Llama model for query translation
+        translated_queries = query_llama_model(formatted_prompt, max_new_tokens, temperature, top_k)
         if translated_queries:
-            st.write("**Translated Queries:**"
+            st.write("**Translated Queries:**")
+            st.write(translated_queries.split("\n")[-1])  # Print only the updated question part
 
     # Indexing
     if st.button("Apply Indexing"):
         st.write(f"**Applied Indexing Method:** {indexing_method}")
-
-
-
-
-
-
+        if pdf_file is not None:
+            # Extract and search PDF content
+            pdf_text_data = extract_text_from_pdf(pdf_file)
+            search_results = search_pdf_content(pdf_text_data, prompt)
+
+            if search_results:
+                st.write("**Relevant Content from PDF:**")
+                for result in search_results:
+                    st.write(f"**Page {result['page']}, Line {result['line']}:** {result['content']}")
+
+                # Split text into chunks
+                chunks = split_text_into_chunks("\n".join([result["content"] for result in search_results]))
+                st.write("**Chunks Obtained from PDF:**")
+                for i, chunk in enumerate(chunks):
+                    st.write(f"**Chunk {i + 1}:** {chunk}")
+
+                # Print summary of split for Multi-Representation
+                if indexing_method == "Multi-Representation":
+                    st.write("**Summary of Split:**")
+                    summary = query_llama_model(f"Summarize the following text:\n{chunks[0]}", max_new_tokens, temperature, top_k)
+                    st.write(summary)
+            else:
+                st.write("**No relevant content found in the PDF.**")
+        else:
+            st.write("**No PDF uploaded.**")
+
+    # Generate Response
     if st.button("Generate Response"):
-
+        if pdf_file is not None:
+            # Extract and search PDF content
+            pdf_text_data = extract_text_from_pdf(pdf_file)
+            search_results = search_pdf_content(pdf_text_data, prompt)
+
+            if search_results:
+                st.write("**Relevant Content from PDF:**")
+                for result in search_results:
+                    st.write(f"**Page {result['page']}, Line {result['line']}:** \"{result['content']}\"")
+
+                # Generate response based on PDF content
+                pdf_context = "\n".join([result["content"] for result in search_results])
+                response = query_llama_model(f"Based on the following context:\n{pdf_context}\n\nAnswer this question: {prompt}", max_new_tokens, temperature, top_k)
+            else:
+                st.write("**No relevant content found in the PDF. Generating response without PDF context.**")
+                response = query_llama_model(prompt, max_new_tokens, temperature, top_k)
+        else:
+            st.write("**No PDF uploaded. Generating response without PDF context.**")
+            response = query_llama_model(prompt, max_new_tokens, temperature, top_k)
+
         if response:
             st.write("**Response:**", response)
 
-        # Process PDF content if uploaded
-        if pdf_file is not None:
-            pdf_text_data = extract_text_from_pdf(pdf_file)
-            if prompt:
-                # Search for relevant content in the PDF
-                for entry in pdf_text_data:
-                    if prompt.lower() in entry["content"].lower():
-                        st.write(f"**Page {entry['page']}, Line {entry['line']}:** {entry['content']}")
-
 if __name__ == "__main__":
     main()
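Both the "Apply Indexing" and "Generate Response" branches above repeat the extract-then-search steps. A possible follow-up refactor (a sketch, not part of this commit) could share one helper:

# Hypothetical helper reusing the diff's functions; returns None when no PDF
# was uploaded so each button can report that case its own way.
def get_search_results(pdf_file, prompt):
    if pdf_file is None:
        return None
    pdf_text_data = extract_text_from_pdf(pdf_file)
    return search_pdf_content(pdf_text_data, prompt)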