Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -102,97 +102,68 @@ uploaded_documents = load_documents()
|
|
102 |
from langchain.vectorstores import FAISS
|
103 |
import faiss
|
104 |
|
105 |
-
def
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
|
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
for file in files:
|
118 |
logging.info(f"Processing file: {file.name}")
|
119 |
try:
|
120 |
-
|
121 |
-
if not
|
122 |
logging.warning(f"No chunks loaded from {file.name}")
|
123 |
continue
|
124 |
-
logging.info(f"Loaded {len(
|
125 |
-
|
126 |
-
for chunk in data:
|
127 |
-
if chunk.page_content not in seen_contents:
|
128 |
-
all_data.append(chunk)
|
129 |
-
seen_contents.add(chunk.page_content)
|
130 |
-
else:
|
131 |
-
logging.warning(f"Duplicate content detected in {file.name}, skipping...")
|
132 |
-
|
133 |
-
if not any(doc["name"] == file.name for doc in uploaded_documents):
|
134 |
-
uploaded_documents.append({"name": file.name, "selected": True})
|
135 |
-
logging.info(f"Added new document to uploaded_documents: {file.name}")
|
136 |
-
else:
|
137 |
-
logging.info(f"Document already exists in uploaded_documents: {file.name}")
|
138 |
except Exception as e:
|
139 |
logging.error(f"Error processing file {file.name}: {str(e)}")
|
140 |
|
141 |
-
if not
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
try:
|
146 |
-
|
147 |
-
|
148 |
-
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
149 |
-
initial_size = database.index.ntotal
|
150 |
-
database.add_documents(all_data)
|
151 |
-
final_size = database.index.ntotal
|
152 |
-
logging.info(f"FAISS database updated. Initial size: {initial_size}, Final size: {final_size}")
|
153 |
-
else:
|
154 |
-
logging.info("Creating new FAISS database")
|
155 |
-
database = FAISS.from_documents(all_data, embed)
|
156 |
-
logging.info(f"New FAISS database created with {database.index.ntotal} vectors")
|
157 |
-
|
158 |
-
database.save_local("faiss_database")
|
159 |
-
logging.info("FAISS database saved")
|
160 |
-
|
161 |
-
# Check the database after updating
|
162 |
-
check_faiss_database()
|
163 |
-
|
164 |
-
# Analyze document similarity
|
165 |
-
analyze_document_similarity()
|
166 |
-
|
167 |
except Exception as e:
|
168 |
logging.error(f"Error updating FAISS database: {str(e)}")
|
169 |
-
return f"Error updating vector store: {str(e)}"
|
170 |
-
|
171 |
-
save_documents(uploaded_documents)
|
172 |
-
logging.info(f"Updated documents saved. Total documents: {len(uploaded_documents)}")
|
173 |
-
|
174 |
-
return f"Vector store updated successfully. Processed {len(all_data)} chunks from {len(files)} files using {parser}.", display_documents()
|
175 |
-
|
176 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
177 |
-
|
178 |
-
def analyze_document_similarity():
|
179 |
-
embed = get_embeddings()
|
180 |
-
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
181 |
-
|
182 |
-
docs = list(database.docstore.docs.values())
|
183 |
-
embeddings = [database.embedding_function(doc.page_content) for doc in docs]
|
184 |
-
|
185 |
-
similarity_matrix = cosine_similarity(embeddings)
|
186 |
-
|
187 |
-
for i in range(len(docs)):
|
188 |
-
for j in range(i+1, len(docs)):
|
189 |
-
similarity = similarity_matrix[i][j]
|
190 |
-
logging.info(f"Similarity between {docs[i].metadata['source']} and {docs[j].metadata['source']}: {similarity}")
|
191 |
-
if similarity > 0.9: # Adjust this threshold as needed
|
192 |
-
logging.warning(f"High similarity detected between {docs[i].metadata['source']} and {docs[j].metadata['source']}")
|
193 |
-
|
194 |
-
# Call this after updating the vector store
|
195 |
-
analyze_document_similarity()
|
196 |
|
197 |
def delete_documents(selected_docs):
|
198 |
global uploaded_documents
|
@@ -522,17 +493,17 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
|
|
522 |
return
|
523 |
|
524 |
try:
|
525 |
-
retriever = database.as_retriever(search_kwargs={"k":
|
526 |
logging.info(f"Retrieving relevant documents for query: {query}")
|
527 |
-
|
528 |
-
logging.info(f"Number of relevant documents retrieved: {len(
|
529 |
|
530 |
-
|
531 |
-
|
532 |
-
logging.info(f"
|
533 |
-
|
534 |
-
# Filter relevant_docs based on selected documents
|
535 |
-
filtered_docs = [doc for doc in
|
536 |
logging.info(f"Number of filtered documents: {len(filtered_docs)}")
|
537 |
|
538 |
if not filtered_docs:
|
@@ -541,47 +512,37 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
|
|
541 |
return
|
542 |
|
543 |
for i, doc in enumerate(filtered_docs):
|
544 |
-
logging.info(f"
|
545 |
-
logging.info(f"
|
546 |
|
547 |
-
context_str = "\n
|
548 |
logging.info(f"Total context length: {len(context_str)}")
|
549 |
|
550 |
-
prompt = f"""You are analyzing multiple financial documents. The following documents have been selected: {', '.join(selected_docs)}
|
551 |
-
|
552 |
-
Using the following context from the selected PDF documents:
|
553 |
-
|
554 |
-
{context_str}
|
555 |
-
|
556 |
-
Please provide a detailed and complete response that answers the following user question, making sure to consider information from all selected documents: '{query}'
|
557 |
-
|
558 |
-
If the information is not found in the provided context, please state that clearly."""
|
559 |
-
|
560 |
if model == "@cf/meta/llama-3.1-8b-instruct":
|
561 |
logging.info("Using Cloudflare API")
|
562 |
-
for response in get_response_from_cloudflare(prompt=
|
563 |
yield response
|
564 |
else:
|
565 |
logging.info("Using Hugging Face API")
|
|
|
|
|
|
|
|
|
566 |
client = InferenceClient(model, token=huggingface_token)
|
567 |
|
568 |
response = ""
|
569 |
for i in range(num_calls):
|
570 |
logging.info(f"API call {i+1}/{num_calls}")
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
yield response # Yield partial response
|
582 |
-
except Exception as e:
|
583 |
-
logging.error(f"Error in API call {i+1}: {str(e)}")
|
584 |
-
yield f"Error in API call {i+1}: {str(e)}. Attempting next call..."
|
585 |
|
586 |
logging.info("Finished generating response")
|
587 |
|
|
|
102 |
from langchain.vectorstores import FAISS
|
103 |
import faiss
|
104 |
|
105 |
def add_documents_to_faiss(documents: List[Document], embeddings):
    """Persist *documents* into the on-disk FAISS index at "faiss_database".

    Loads and extends the existing index when one is present on disk,
    otherwise builds a fresh index from the documents. The index is saved
    back to "faiss_database" in either case.

    Args:
        documents: Chunked documents to embed and store.
        embeddings: Embedding function/object compatible with FAISS.

    Returns:
        The loaded-or-created FAISS database instance.
    """
    logging.info(f"Adding {len(documents)} documents to FAISS database")

    if not os.path.exists("faiss_database"):
        # No index on disk yet — build one from scratch.
        db = FAISS.from_documents(documents, embeddings)
        logging.info(f"Created new FAISS database with {db.index.ntotal} vectors")
    else:
        # Extend the existing index; deserialization flag is required by
        # recent langchain versions for pickle-backed FAISS stores.
        db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
        logging.info(f"Loaded existing FAISS database with {db.index.ntotal} vectors")
        initial_size = db.index.ntotal
        db.add_documents(documents)
        final_size = db.index.ntotal
        logging.info(f"FAISS database updated. Initial size: {initial_size}, Final size: {final_size}")

    db.save_local("faiss_database")
    logging.info("FAISS database saved")
    return db
def get_relevant_documents(query: str, selected_docs: List[str], embeddings, k: int = 20) -> List[Document]:
    """Run a similarity search and keep only chunks from the selected documents.

    Args:
        query: Free-text user query.
        selected_docs: Source names (file paths) the user has selected.
        embeddings: Embedding function/object compatible with FAISS.
        k: Number of candidates to pull before filtering (was hard-coded;
           kept at 20 by default for backward compatibility).

    Returns:
        Matching documents whose metadata "source" is in *selected_docs*;
        empty list when no FAISS database exists on disk.
    """
    if not os.path.exists("faiss_database"):
        logging.warning("No FAISS database found")
        return []

    db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
    logging.info(f"Loaded FAISS database with {db.index.ntotal} vectors")

    # Retrieve a generous candidate set first, then filter — filtering
    # inside the search would shrink recall for the selected sources.
    all_docs = db.similarity_search(query, k=k)
    logging.info(f"Retrieved {len(all_docs)} documents from FAISS")

    # Log all retrieved documents. Use .get() — a chunk indexed without a
    # "source" metadata key previously raised KeyError here.
    for i, doc in enumerate(all_docs):
        logging.info(f"Retrieved document {i+1} source: {doc.metadata.get('source')}")

    # Filter documents based on selected_docs; chunks with no "source"
    # metadata can never match a selection, so they are excluded.
    filtered_docs = [doc for doc in all_docs if doc.metadata.get("source") in selected_docs]
    logging.info(f"Filtered to {len(filtered_docs)} documents based on selection")

    return filtered_docs
def update_vectors(files: List[NamedTemporaryFile], parser: str, embeddings) -> str:
    """Load uploaded files, chunk them, and add the chunks to the FAISS store.

    Per-file parse errors are logged and skipped so one bad upload does not
    abort the whole batch.

    Args:
        files: Uploaded temporary files to ingest.
        parser: Name of the document parser to use for loading.
        embeddings: Embedding function/object compatible with FAISS.

    Returns:
        A human-readable status string (success or error description).
    """
    all_documents = []
    for file in files:
        logging.info(f"Processing file: {file.name}")
        try:
            documents = load_document(file, parser)
            if not documents:
                logging.warning(f"No chunks loaded from {file.name}")
                continue
            logging.info(f"Loaded {len(documents)} chunks from {file.name}")
            all_documents.extend(documents)
        except Exception as e:
            # Best-effort ingestion: skip this file, keep processing the rest.
            logging.error(f"Error processing file {file.name}: {str(e)}")

    if not all_documents:
        return "No valid data could be extracted from the uploaded files."

    try:
        # Return value of add_documents_to_faiss was previously bound to an
        # unused local; the side effect (saving the index) is what matters.
        add_documents_to_faiss(all_documents, embeddings)
        return f"Vector store updated successfully. Added {len(all_documents)} chunks from {len(files)} files."
    except Exception as e:
        logging.error(f"Error updating FAISS database: {str(e)}")
        return f"Error updating vector store: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
def delete_documents(selected_docs):
|
169 |
global uploaded_documents
|
|
|
493 |
return
|
494 |
|
495 |
try:
|
496 |
+
retriever = database.as_retriever(search_kwargs={"k": 20}) # Increase k to retrieve more documents initially
|
497 |
logging.info(f"Retrieving relevant documents for query: {query}")
|
498 |
+
all_relevant_docs = retriever.get_relevant_documents(query)
|
499 |
+
logging.info(f"Number of relevant documents retrieved: {len(all_relevant_docs)}")
|
500 |
|
501 |
+
# Log all retrieved documents before filtering
|
502 |
+
for i, doc in enumerate(all_relevant_docs):
|
503 |
+
logging.info(f"Retrieved document {i+1} source: {doc.metadata['source']}")
|
504 |
+
|
505 |
+
# Filter relevant_docs based on selected documents
|
506 |
+
filtered_docs = [doc for doc in all_relevant_docs if doc.metadata["source"] in selected_docs]
|
507 |
logging.info(f"Number of filtered documents: {len(filtered_docs)}")
|
508 |
|
509 |
if not filtered_docs:
|
|
|
512 |
return
|
513 |
|
514 |
for i, doc in enumerate(filtered_docs):
|
515 |
+
logging.info(f"Document {i+1} source: {doc.metadata['source']}")
|
516 |
+
logging.info(f"Document {i+1} content preview: {doc.page_content[:100]}...")
|
517 |
|
518 |
+
context_str = "\n".join([doc.page_content for doc in filtered_docs])
|
519 |
logging.info(f"Total context length: {len(context_str)}")
|
520 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
521 |
if model == "@cf/meta/llama-3.1-8b-instruct":
|
522 |
logging.info("Using Cloudflare API")
|
523 |
+
for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
|
524 |
yield response
|
525 |
else:
|
526 |
logging.info("Using Hugging Face API")
|
527 |
+
prompt = f"""Using the following context from the PDF documents:
|
528 |
+
{context_str}
|
529 |
+
Write a detailed and complete response that answers the following user question: '{query}'"""
|
530 |
+
|
531 |
client = InferenceClient(model, token=huggingface_token)
|
532 |
|
533 |
response = ""
|
534 |
for i in range(num_calls):
|
535 |
logging.info(f"API call {i+1}/{num_calls}")
|
536 |
+
for message in client.chat_completion(
|
537 |
+
messages=[{"role": "user", "content": prompt}],
|
538 |
+
max_tokens=10000,
|
539 |
+
temperature=temperature,
|
540 |
+
stream=True,
|
541 |
+
):
|
542 |
+
if message.choices and message.choices[0].delta and message.choices[0].delta.content:
|
543 |
+
chunk = message.choices[0].delta.content
|
544 |
+
response += chunk
|
545 |
+
yield response # Yield partial response
|
|
|
|
|
|
|
|
|
546 |
|
547 |
logging.info("Finished generating response")
|
548 |
|