Redmind committed
Commit a268161 · verified · 1 Parent(s): c8b57b9

Delete services

services/__pycache__/chat_service.cpython-310.pyc DELETED
Binary file (6.05 kB)
 
services/__pycache__/chat_service.cpython-311.pyc DELETED
Binary file (9.56 kB)
 
services/__pycache__/chat_service.cpython-312.pyc DELETED
Binary file (8.75 kB)
 
services/__pycache__/file_upload_service.cpython-310.pyc DELETED
Binary file (5.25 kB)
 
services/__pycache__/file_upload_service.cpython-312.pyc DELETED
Binary file (8.64 kB)
 
services/__pycache__/multidoc_files_upload.cpython-310.pyc DELETED
Binary file (4.42 kB)
 
services/__pycache__/multidoc_files_upload.cpython-311.pyc DELETED
Binary file (8.14 kB)
 
services/chat_service.py DELETED
@@ -1,137 +0,0 @@
- import os
- import logging
- from dotenv import load_dotenv
- from langchain.memory import ConversationSummaryMemory
- from langchain_core.prompts import ChatPromptTemplate
- from langchain_community.utilities import SQLDatabase
- from langchain_core.output_parsers import StrOutputParser
- from langchain_core.runnables import RunnablePassthrough
- from langchain_openai import ChatOpenAI
- from langchain_openai import OpenAIEmbeddings
- from langchain.agents import create_tool_calling_agent, AgentExecutor, Tool
- from langchain_community.vectorstores import FAISS
- from config.settings import Settings
-
- # Load environment variables
- load_dotenv()
- open_api_key_token = os.getenv('OPENAI_API_KEY')
- #db_uri = os.getenv('POST_DB_URI')
- db_uri = Settings.DB_URI
-
- class ChatAgentService:
-     def __init__(self):
-         # Database setup
-         self.db = SQLDatabase.from_uri(db_uri)
-         self.llm = ChatOpenAI(model="gpt-3.5-turbo-0125", api_key=open_api_key_token,max_tokens=150,temperature=0.2)
-         self.memory = ConversationSummaryMemory(llm=self.llm, return_messages=True)
-
-
-         # Tools setup
-         self.tools = [
-             Tool(
-                 name="DatabaseQuery",
-                 func=self.database_tool,
-                 description="Queries the SQL database using dynamically generated SQL queries based on user questions. Aimed to retrieve structured data like counts, specific records, or summaries from predefined schemas.",
-                 tool_choice="required"
-             ),
-             Tool(
-                 name="DocumentData",
-                 func=self.document_data_tool,
-                 description="Searches through indexed documents to find relevant information based on user queries. Handles unstructured data from various document formats like PDF, DOCX, or TXT files.",
-                 tool_choice="required"
-             ),
-         ]
-
-         # Agent setup
-         prompt_template = self.setup_prompt()
-         self.agent = create_tool_calling_agent(self.llm.bind(memory=self.memory), self.tools, prompt_template)
-         self.agent_executor = AgentExecutor(agent=self.agent, tools=self.tools, memory=self.memory, verbose=True)
-
-     def setup_prompt(self):
-         prompt_template = f"""
-         You are an assistant that helps with database queries and document retrieval.
-         Please base your responses strictly on available data and avoid assumptions.
-         If the question pertains to numerical data or structured queries, use the DatabaseQuery tool.
-         If the question relates to content within various documents, use the DocumentData tool.
-         Question: {{input}}
-         {{agent_scratchpad}}
-         """
-         return ChatPromptTemplate.from_template(prompt_template)
-
-     def database_tool(self, question):
-         sql_query = self.generate_sql_query(question)
-         return self.run_query(sql_query)
-
-     def get_schema(self,_):
-         # print(self.db.get_table_info())
-         return self.db.get_table_info()
-     def generate_sql_query(self, question):
-         schema = self.get_schema(None)  # Get the schema using the function
-         template_query_generation = """Generate a SQL query to answer the user's question based on the available database schema.
-         {schema}
-         Question: {question}
-         SQL Query:"""
-
-         prompt_query_generation = ChatPromptTemplate.from_template(template_query_generation)
-         # Correctly setting up the initial data dictionary for the chain
-         input_data = {'question': question}
-         # Setup the chain correctly
-         sql_chain = (RunnablePassthrough.assign(schema=self.get_schema)
-                      | prompt_query_generation
-                      | self.llm.bind(stop="\nSQL Result:")
-                      | StrOutputParser())
-
-         # Make sure to invoke with an empty dictionary if all needed data is already assigned
-         return sql_chain.invoke(input_data)
-
-     def run_query(self, query):
-         try:
-             logging.info(f"Executing SQL query: {query}")
-             result = self.db.run(query)
-             logging.info(f"Query successful: {result}")
-             return result
-         except Exception as e:
-             logging.error(f"Error executing query: {query}, Error: {str(e)}")
-             return None
-
-     def document_data_tool(self, query):
-         try:
-             logging.info(f"Searching documents for query: {query}")
-             embeddings = OpenAIEmbeddings(api_key=open_api_key_token)
-             index_paths = self.find_index_for_document(query)
-             responses = []
-             for index_path in index_paths:
-                 vector_store = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
-                 response = self.query_vector_store(vector_store, query)
-                 responses.append(response)
-             logging.info(f"Document search results: {responses}")
-             return "\n".join(responses)
-         except Exception as e:
-             logging.error(f"Error in document data tool for query: {query}, Error: {str(e)}")
-             return "Error processing document query."
-
-     def find_index_for_document(self, query):
-         base_path = os.getenv('VECTOR_DB_PATH')
-         # document_hint = self.extract_document_hint(query)
-         index_paths = []
-         for root, dirs, files in os.walk(base_path):
-             for dir in dirs:
-                 if 'index.faiss' in os.listdir(os.path.join(root, dir)):
-                     index_paths.append(os.path.join(root, dir, ''))
-         return index_paths
-
-     def query_vector_store(self, vector_store, query):
-         docs = vector_store.similarity_search(query)
-         return '\n\n'.join([doc.page_content for doc in docs])
-
-     def answer_question(self, user_question):
-         try:
-             logging.info(f"Received question: {user_question}")
-             response = self.agent_executor.invoke({"input": user_question})
-             output_response = response.get("output", "No valid response generated.")
-             logging.info(f"Response generated: {output_response}")
-             return output_response
-         except Exception as e:
-             logging.error(f"Error processing question: {user_question}, Error: {str(e)}")
-             return f"An error occurred: {str(e)}"
-
 
services/file_upload_service.py DELETED
@@ -1,141 +0,0 @@
- import io
- import os
- import tempfile
- import hashlib
- import json
- import logging
- import pandas as pd
- from datetime import datetime
- from dotenv import load_dotenv
- from langchain_community.vectorstores import FAISS
- from langchain_openai import OpenAIEmbeddings
- from langchain.text_splitter import CharacterTextSplitter
- from PyPDF2 import PdfReader
- from docx import Document
- # from transformers import pipeline
-
- # Load environment variables
- load_dotenv()
- open_api_key_token = os.getenv('OPENAI_API_KEY')
-
-
- class FileHandler:
-     def __init__(self, vector_db_path):
-         self.vector_db_path = vector_db_path
-         self.embeddings = OpenAIEmbeddings(api_key=open_api_key_token)
-         # self.summarizer = pipeline("summarization")
-
-     def prepare_metadata_string(self, document_name, document_description, department, version, last_updated):
-         metadata_string = f"\nDocument Name: {document_name}\nDocument Description: {document_description}\nDepartment: {department}\nVersion: {version}\nLast Updated: {last_updated}"
-         return metadata_string
-
-     async def handle_file_upload(self, file, document_name, document_description, department, version, last_updated):
-         content = await file.read()
-         file_hash = hashlib.md5(content).hexdigest()
-         file_key = f"{file.filename}_{file_hash}"
-         vector_store_path = os.path.join(self.vector_db_path, f"{file_key}.vectorstore")
-         metadata_path = os.path.join(self.vector_db_path, f"{file_key}.metadata.json")
-
-         metadata_string = self.prepare_metadata_string(document_name, document_description, department, version,
-                                                        last_updated)
-
-         if os.path.exists(vector_store_path) and os.path.exists(metadata_path):
-             with open(metadata_path, 'r') as md_file:
-                 metadata = json.load(md_file)
-             return {'path': vector_store_path, 'metadata': metadata, 'status': 'skipped - duplicate'}
-
-         if file.filename.endswith('.csv') or file.filename.endswith('.xlsx'):
-             texts = self.load_and_split_table(content, file.filename, metadata_string)
-         else:
-             texts = await self.load_and_split_text(content, file.filename, metadata_string)
-
-         vector_store = self.create_vector_store(texts)
-         vector_store.save_local(vector_store_path)
-
-         metadata = {
-             'filename': file.filename,
-             'document_name': document_name,
-             'document_description': document_description,
-             'department': department,
-             'version': version,
-             'last_updated': last_updated,
-             'hash': file_hash,
-             'upload_date': datetime.now().isoformat(),
-             'file_path': vector_store_path,
-             'file_size': len(content),
-             'content_type': file.content_type
-         }
-
-         with open(metadata_path, 'w') as md_file:
-             json.dump(metadata, md_file)
-
-         return {"message": "File processed and vector store created successfully", "file_metadata": metadata}
-
-     def summarize_text(self, text):
-         try:
-             summary = self.summarizer(text, max_length=150, min_length=10, do_sample=False)
-             logging.info("Text summarization successful")
-             return summary[0]['summary_text']
-         except Exception as e:
-             logging.error(f"Error in summarization: {str(e)}")
-             # Log error or handle exception
-             return text  # Return original text if summarization is not possible
-
-     def load_and_split_table(self, content, filename, metadata_string):
-         # Handle CSV and Excel file reading
-         if filename.endswith('.csv'):
-             df = pd.read_csv(io.StringIO(content.decode('utf-8')))
-         else:  # Excel
-             df = pd.read_excel(io.BytesIO(content))
-         text = df.to_string(index=False)  # Convert DataFrame to string
-         text += metadata_string  # Append metadata to the text
-         return self.split_text(text)
-
-     async def load_and_split_text(self, content, filename, metadata_string):
-         with tempfile.NamedTemporaryFile(delete=False, mode='w+b', suffix=f"_{filename}") as temp_file:
-             temp_file.write(content)
-             temp_file.flush()
-             temp_file_path = temp_file.name
-
-         # Ensure the temp file is closed before reading from it
-         if filename.endswith('.pdf'):
-             texts = await self.load_and_split_pdf(temp_file_path, metadata_string)
-         elif filename.endswith('.docx'):
-             texts = await self.load_and_split_docx(temp_file_path, metadata_string)
-         elif filename.endswith('.txt'):
-             texts = await self.load_and_split_txt(temp_file_path, metadata_string)
-
-         # Apply summarization here to each text segment
-         # summarized_texts = [self.summarize_text(text) for text in texts]
-
-         # os.unlink(temp_file_path)  # Explicitly remove the temporary file
-         # return summarized_texts
-         os.unlink(temp_file_path)  # Explicitly remove the temporary file
-         return texts
-
-     async def load_and_split_pdf(self, pdf_path, metadata_string):
-         reader = PdfReader(pdf_path)
-         text = ''
-         for page in reader.pages:
-             text += page.extract_text() or ""
-         text += metadata_string  # Append metadata to the text
-         return self.split_text(text)
-
-     async def load_and_split_docx(self, docx_path, metadata_string):
-         doc = Document(docx_path)
-         text = '\n'.join([paragraph.text for paragraph in doc.paragraphs if paragraph.text])
-         text += metadata_string  # Append metadata to the text
-         return self.split_text(text)
-
-     async def load_and_split_txt(self, txt_path, metadata_string):
-         with open(txt_path, 'r', encoding='utf-8') as file:
-             text = file.read()
-         text += metadata_string  # Append metadata to the text
-         return self.split_text(text)
-
-     def split_text(self, text):
-         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-         return text_splitter.split_text(text)
-
-     def create_vector_store(self, texts):
-         return FAISS.from_texts(texts, self.embeddings)