HanLee committed
Commit bb87055 · 1 Parent(s): 7cf4793

chore: linting

Files changed (1)
  1. app/app.py +18 -28
app/app.py CHANGED
@@ -1,8 +1,9 @@
 # Chroma compatibility issue resolution
 # https://docs.trychroma.com/troubleshooting#sqlite
-__import__('pysqlite3')
+__import__("pysqlite3")
 import sys
-sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+
+sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
 
 from tempfile import NamedTemporaryFile
 from typing import List
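Background for this hunk: Chroma needs SQLite >= 3.35.0, and the troubleshooting page linked above suggests swapping in the pysqlite3-binary build where the system SQLite is older. A quick standalone check (a sketch, not part of this commit):

    import sqlite3

    # Chroma needs SQLite >= 3.35.0; if this prints an older version,
    # the pysqlite3 swap above must run before chromadb is imported.
    print(sqlite3.sqlite_version)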
@@ -11,7 +12,7 @@ import chainlit as cl
 from chainlit.types import AskFileResponse
 import chromadb
 from chromadb.config import Settings
-from langchain.chains import LLMChain, RetrievalQAWithSourcesChain
+from langchain.chains import RetrievalQAWithSourcesChain
 from langchain.chat_models import ChatOpenAI
 from langchain.document_loaders import PDFPlumberLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
@@ -31,7 +32,7 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
 
     Args:
         file (AskFileResponse): input file to be processed
-
+
     Raises:
         ValueError: when we fail to process PDF files. We consider PDF file
             processing failure when there's no text returned. For example, PDFs
@@ -51,8 +52,7 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     documents = loader.load()
 
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=3000,
-        chunk_overlap=100
+        chunk_size=3000, chunk_overlap=100
     )
     docs = text_splitter.split_documents(documents)
 
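The splitter arguments are only reflowed here; as a rough standalone sketch of what they control (the example text is made up):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
    chunks = splitter.split_text("lorem ipsum " * 2000)
    # Chunks are capped near 3000 characters, and neighboring chunks
    # share roughly 100 characters so context survives each cut.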
@@ -66,7 +66,9 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     return docs
 
 
-def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
+def create_search_engine(
+    *, docs: List[Document], embeddings: Embeddings
+) -> VectorStore:
     """Takes a list of Langchain Documents and an embedding model API wrapper
     and builds a search index using a VectorStore.
 
@@ -80,27 +82,21 @@ def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
     """
     # Initialize Chromadb client to enable resetting and disable telemetry
     client = chromadb.EphemeralClient()
-    client_settings=Settings(
-        allow_reset=True,
-        anonymized_telemetry=False
-    )
+    client_settings = Settings(allow_reset=True, anonymized_telemetry=False)
 
     # Reset the search engine to ensure we don't use old copies.
     # NOTE: we do not need this for production
-    search_engine = Chroma(
-        client=client,
-        client_settings=client_settings
-    )
+    search_engine = Chroma(client=client, client_settings=client_settings)
     search_engine._client.reset()
     search_engine = Chroma.from_documents(
         client=client,
         documents=docs,
         embedding=embeddings,
-        client_settings=client_settings
+        client_settings=client_settings,
     )
 
     return search_engine
-
+
 
 @cl.on_chat_start
 async def on_chat_start():
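The reformatted function keeps its call surface; a minimal usage sketch, assuming this file's imports (the query string is made up):

    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    search_engine = create_search_engine(docs=docs, embeddings=embeddings)
    # The returned VectorStore can be queried directly:
    hits = search_engine.similarity_search("What does the document cover?", k=3)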
@@ -123,20 +119,17 @@ async def on_chat_start():
     # Process and save data in the user session
     msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
-
+
     docs = process_file(file=file)
     cl.user_session.set("docs", docs)
     msg.content = f"`{file.name}` processed. Loading ..."
     await msg.update()
 
     # Indexing documents into our search engine
-    embeddings = OpenAIEmbeddings(
-        model="text-embedding-ada-002"
-    )
+    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
     try:
         search_engine = await cl.make_async(create_search_engine)(
-            docs=docs,
-            embeddings=embeddings
+            docs=docs, embeddings=embeddings
         )
     except Exception as e:
         await cl.Message(content=f"Error: {e}").send()
@@ -145,9 +138,7 @@ async def on_chat_start():
     await msg.update()
 
     model = ChatOpenAI(
-        model="gpt-3.5-turbo-16k-0613",
-        temperature=0,
-        streaming=True
+        model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True
     )
 
     chain = RetrievalQAWithSourcesChain.from_chain_type(
@@ -164,13 +155,12 @@ async def on_chat_start():
 
 @cl.on_message
 async def main(message: cl.Message):
-
     # Let's load the chain from user_session
     chain = cl.user_session.get("chain")  # type: RetrievalQAWithSourcesChain
 
     response = await chain.acall(
         message.content,
-        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)]
+        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)],
     )
     answer = response["answer"]
     sources = response["sources"].strip()
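The diff ends mid-handler; a hypothetical continuation (not part of this commit) would send the result back through Chainlit:

    # Hypothetical: forward the final answer to the UI, with sources if any.
    if sources:
        answer += f"\nSources: {sources}"
    await cl.Message(content=answer).send()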