Update app.py

app.py CHANGED
@@ -18,6 +18,10 @@ from langchain_community.llms import HuggingFaceHub
 from langchain_core.documents import Document
 from sentence_transformers import SentenceTransformer
 from llama_parse import LlamaParse
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
 # Environment variables and configurations
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
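The hunk above configures the root logger at DEBUG level. A minimal standalone sketch of the output this format string produces (standard library only; note that logging.basicConfig is a no-op if the root logger already has handlers, so it should run before anything else configures logging):

import logging

# Same call as the added line in the hunk above.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

logging.debug("Search returned 5 results")
# Prints something like:
# 2024-05-01 12:00:00,000 - DEBUG - Search returned 5 results

Setting the root level to DEBUG also surfaces debug output from dependencies such as urllib3, so log volume may grow noticeably.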
@@ -92,51 +96,73 @@ def get_model(temperature, top_p, repetition_penalty):
     )
 
 def duckduckgo_search(query):
+    logging.debug(f"Performing DuckDuckGo search for query: {query}")
     with DDGS() as ddgs:
-        results = ddgs.text(query, max_results=5)
+        results = list(ddgs.text(query, max_results=5))
+    logging.debug(f"Search returned {len(results)} results")
     return results
 
 def get_response_with_search(query, temperature, top_p, repetition_penalty, use_pdf=False):
+    logging.debug(f"Getting response for query: {query}")
+    logging.debug(f"Parameters: temperature={temperature}, top_p={top_p}, repetition_penalty={repetition_penalty}, use_pdf={use_pdf}")
+
     model = get_model(temperature, top_p, repetition_penalty)
     embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
 
     if use_pdf and os.path.exists("faiss_database"):
+        logging.debug("Using PDF database for context")
         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
         retriever = database.as_retriever()
         relevant_docs = retriever.get_relevant_documents(query)
         context = "\n".join([f"Content: {doc.page_content}\nSource: {doc.metadata['source']}\n" for doc in relevant_docs])
     else:
+        logging.debug("Using web search for context")
         search_results = duckduckgo_search(query)
         context = "\n".join(f"{result['title']}\n{result['body']}\nSource: {result['href']}\n"
                             for result in search_results if 'body' in result)
+
+    logging.debug(f"Context generated. Length: {len(context)} characters")
 
-    prompt = f"""…
+    prompt = f"""<s>[INST] Using the following context:
 {context}
 Write a detailed and complete research document that fulfills the following user request: '{query}'
 After the main content, provide a list of sources used in your response, prefixed with 'Sources:'.
 Do not include any part of these instructions in your response. [/INST]"""
 
+    logging.debug("Sending prompt to model")
     response = model(prompt)
+    logging.debug(f"Received response from model. Length: {len(response)} characters")
 
     main_content, sources = split_response(response)
 
+    logging.debug(f"Split response. Main content length: {len(main_content)}, Sources length: {len(sources)}")
     return main_content, sources
 
 def split_response(response):
+    logging.debug("Splitting response")
+    logging.debug(f"Original response: {response[:100]}...")  # Log first 100 characters
+
     # Remove any remaining instruction text
     response = re.sub(r'\[/?INST\]', '', response)
     response = re.sub(r'~~.*?~~', '', response, flags=re.DOTALL)
 
+    logging.debug(f"After removing instructions: {response[:100]}...")  # Log first 100 characters
+
     # Split the response into main content and sources
     parts = response.split("Sources:", 1)
     main_content = parts[0].strip()
     sources = parts[1].strip() if len(parts) > 1 else ""
 
+    logging.debug(f"Main content starts with: {main_content[:100]}...")  # Log first 100 characters
+    logging.debug(f"Sources: {sources[:100]}...")  # Log first 100 characters
+
     return main_content, sources
 
 def chatbot_interface(message, history, temperature, top_p, repetition_penalty, use_pdf):
+    logging.debug(f"Chatbot interface called with message: {message}")
     main_content, sources = get_response_with_search(message, temperature, top_p, repetition_penalty, use_pdf)
     formatted_response = f"{main_content}\n\nSources:\n{sources}"
+    logging.debug(f"Formatted response: {formatted_response[:100]}...")  # Log first 100 characters
     return formatted_response
 
 # Gradio interface
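Besides the logging, the one behavioral change in this hunk wraps ddgs.text() in list(). A plausible reason, sketched below under the assumption that the installed duckduckgo_search version returns results lazily from DDGS.text(): list() consumes the results while the session is still open, whereas returning the raw iterator would defer consumption until after the with block has closed the session. The import line is assumed, since it is not shown in this hunk.

from duckduckgo_search import DDGS  # assumed import; not shown in this hunk

def duckduckgo_search(query):
    with DDGS() as ddgs:
        # Consume results while the session is open; a lazily evaluated
        # iterator returned from here could outlive the session.
        return list(ddgs.text(query, max_results=5))

for result in duckduckgo_search("faiss vector store"):
    print(result.get("title"), result.get("href"))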
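split_response itself is unchanged apart from the added debug lines, and it is easy to exercise in isolation. Below is a standalone copy of the function as it appears in the diff, minus the logging, with a quick check on a hypothetical example string:

import re

def split_response(response):
    # Strip [INST]/[/INST] markers and any ~~...~~ spans, then split off
    # everything after the first "Sources:" heading.
    response = re.sub(r'\[/?INST\]', '', response)
    response = re.sub(r'~~.*?~~', '', response, flags=re.DOTALL)
    parts = response.split("Sources:", 1)
    main_content = parts[0].strip()
    sources = parts[1].strip() if len(parts) > 1 else ""
    return main_content, sources

main, sources = split_response("Some answer. [/INST]\nSources:\n1. https://example.com")
assert main == "Some answer."
assert sources == "1. https://example.com"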