Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,8 +22,13 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
| 22 |
from langchain_community.llms import HuggingFaceHub
|
| 23 |
from langchain_core.documents import Document
|
| 24 |
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
|
|
|
|
| 27 |
|
| 28 |
# Load SentenceTransformer model
|
| 29 |
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
|
|
@@ -108,12 +113,28 @@ class EnhancedContextDrivenChatbot:
|
|
| 108 |
|
| 109 |
return contextualized_question, topics, self.entity_tracker
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
"""Loads and splits the document into pages."""
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
def update_vectors(files):
|
| 117 |
if not files:
|
| 118 |
return "Please upload at least one PDF file."
|
| 119 |
|
|
@@ -122,7 +143,7 @@ def update_vectors(files):
|
|
| 122 |
|
| 123 |
all_data = []
|
| 124 |
for file in files:
|
| 125 |
-
data = load_document(file)
|
| 126 |
all_data.extend(data)
|
| 127 |
total_chunks += len(data)
|
| 128 |
|
|
@@ -134,7 +155,7 @@ def update_vectors(files):
|
|
| 134 |
|
| 135 |
database.save_local("faiss_database")
|
| 136 |
|
| 137 |
-
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
|
| 138 |
|
| 139 |
def get_embeddings():
|
| 140 |
return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
|
|
@@ -410,17 +431,17 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, c
|
|
| 410 |
|
| 411 |
return "An unexpected error occurred. Please try again later."
|
| 412 |
|
| 413 |
-
# Gradio interface
|
| 414 |
# Gradio interface
|
| 415 |
with gr.Blocks() as demo:
|
| 416 |
-
gr.Markdown("# Context-Driven Conversational Chatbot")
|
| 417 |
|
| 418 |
with gr.Row():
|
| 419 |
file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
|
|
|
|
| 420 |
update_button = gr.Button("Upload PDF")
|
| 421 |
|
| 422 |
update_output = gr.Textbox(label="Update Status")
|
| 423 |
-
update_button.click(update_vectors, inputs=[file_input], outputs=update_output)
|
| 424 |
|
| 425 |
with gr.Row():
|
| 426 |
with gr.Column(scale=2):
|
|
@@ -433,10 +454,10 @@ with gr.Blocks() as demo:
|
|
| 433 |
repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
|
| 434 |
web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
|
| 435 |
|
| 436 |
-
|
| 437 |
|
| 438 |
def chat(question, history, temperature, top_p, repetition_penalty, web_search):
|
| 439 |
-
answer = ask_question(question, temperature, top_p, repetition_penalty, web_search,
|
| 440 |
history.append((question, answer))
|
| 441 |
return "", history
|
| 442 |
|
|
|
|
| 22 |
from langchain_community.llms import HuggingFaceHub
|
| 23 |
from langchain_core.documents import Document
|
| 24 |
from sentence_transformers import SentenceTransformer
|
| 25 |
+
import nest_asyncio
|
| 26 |
+
from llama_parse import LlamaParse
|
| 27 |
+
|
| 28 |
+
nest_asyncio.apply()
|
| 29 |
|
| 30 |
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
|
| 31 |
+
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
|
| 32 |
|
| 33 |
# Load SentenceTransformer model
|
| 34 |
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
|
|
|
|
| 113 |
|
| 114 |
return contextualized_question, topics, self.entity_tracker
|
| 115 |
|
| 116 |
+
# Module-level LlamaParse client. load_document() calls its load_data()
# method when the "llamaparse" parser option is selected.
# The API key comes from the LLAMA_CLOUD_API_KEY environment variable
# (read into llama_cloud_api_key above) and may be None when it is unset.
llama_parser = LlamaParse(
    language="en",
    result_type="markdown",
    num_workers=4,
    verbose=True,
    api_key=llama_cloud_api_key,
)
|
| 124 |
+
|
| 125 |
+
def load_document(file: NamedTemporaryFile, parser: str = "pypdf") -> List[Document]:
    """Load one PDF and return its content as LangChain documents.

    Args:
        file: The uploaded file. Either an object exposing its on-disk path
            as ``.name`` (for example a temporary-file wrapper), or the path
            itself given as a ``str`` or ``os.PathLike``.
        parser: Backend used to read the PDF: ``"pypdf"`` (default) or
            ``"llamaparse"``.

    Returns:
        For ``"pypdf"``, the result of ``PyPDFLoader.load_and_split()``.
        For ``"llamaparse"``, one ``Document`` per item returned by
        ``llama_parser.load_data()``, holding that item's text and the file
        path under the ``"source"`` metadata key.

    Raises:
        ValueError: If ``parser`` is not one of the supported names.
    """
    # Reject an unknown parser before touching the file at all.
    if parser not in ("pypdf", "llamaparse"):
        raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")

    # pathlib paths also have a ``.name`` attribute, but it is only the final
    # path component, so path-like objects must be resolved with os.fspath().
    # Anything else uses ``.name`` when present and is otherwise assumed to
    # already be the path (e.g. a plain string).
    if isinstance(file, os.PathLike):
        path = os.fspath(file)
    else:
        path = getattr(file, "name", file)

    if parser == "pypdf":
        return PyPDFLoader(path).load_and_split()

    # Convert LlamaParse output to the LangChain Document format.
    parsed_items = llama_parser.load_data(path)
    return [
        Document(page_content=item.text, metadata={"source": path})
        for item in parsed_items
    ]
|
| 136 |
|
| 137 |
+
def update_vectors(files, parser):
|
| 138 |
if not files:
|
| 139 |
return "Please upload at least one PDF file."
|
| 140 |
|
|
|
|
| 143 |
|
| 144 |
all_data = []
|
| 145 |
for file in files:
|
| 146 |
+
data = load_document(file, parser)
|
| 147 |
all_data.extend(data)
|
| 148 |
total_chunks += len(data)
|
| 149 |
|
|
|
|
| 155 |
|
| 156 |
database.save_local("faiss_database")
|
| 157 |
|
| 158 |
+
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}."
|
| 159 |
|
| 160 |
def get_embeddings():
|
| 161 |
return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
|
|
|
|
| 431 |
|
| 432 |
return "An unexpected error occurred. Please try again later."
|
| 433 |
|
|
|
|
| 434 |
# Gradio interface
|
| 435 |
with gr.Blocks() as demo:
|
| 436 |
+
gr.Markdown("# Enhanced Context-Driven Conversational Chatbot")
|
| 437 |
|
| 438 |
with gr.Row():
|
| 439 |
file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
|
| 440 |
+
parser_dropdown = gr.Dropdown(choices=["pypdf", "llamaparse"], label="Select PDF Parser", value="pypdf")
|
| 441 |
update_button = gr.Button("Upload PDF")
|
| 442 |
|
| 443 |
update_output = gr.Textbox(label="Update Status")
|
| 444 |
+
update_button.click(update_vectors, inputs=[file_input, parser_dropdown], outputs=update_output)
|
| 445 |
|
| 446 |
with gr.Row():
|
| 447 |
with gr.Column(scale=2):
|
|
|
|
| 454 |
repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
|
| 455 |
web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
|
| 456 |
|
| 457 |
+
enhanced_context_driven_chatbot = EnhancedContextDrivenChatbot()
|
| 458 |
|
| 459 |
def chat(question, history, temperature, top_p, repetition_penalty, web_search):
    """Answer one question and record the exchange in the chat history.

    The question and the generation settings are forwarded to ask_question()
    together with the shared enhanced_context_driven_chatbot instance. The
    (question, answer) pair is appended to ``history`` in place.

    Returns:
        A tuple of an empty string and the updated ``history`` list.
        NOTE(review): the empty string presumably clears the input textbox;
        confirm against the event wiring for this callback.
    """
    reply = ask_question(
        question,
        temperature,
        top_p,
        repetition_penalty,
        web_search,
        enhanced_context_driven_chatbot,
    )
    exchange = (question, reply)
    history.append(exchange)
    return "", history
|
| 463 |
|