HanLee committed
Commit 4a49d79 · Parent: 2dd27c9

feat: final
.chainlit/config.toml ADDED
@@ -0,0 +1,78 @@
+ [project]
+ # Whether to enable telemetry (default: true). No personal data is collected.
+ enable_telemetry = true
+
+ # List of environment variables to be provided by each user to use the app.
+ user_env = []
+
+ # Duration (in seconds) during which the session is saved when the connection is lost
+ session_timeout = 3600
+
+ # Enable third-party caching (e.g., the LangChain cache)
+ cache = false
+
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+ # follow_symlink = false
+
+ [features]
+ # Show the prompt playground
+ prompt_playground = true
+
+ # Authorize users to upload files with messages
+ multi_modal = true
+
+ # Allow users to use speech-to-text
+ [features.speech_to_text]
+ enabled = false
+ # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
+ # language = "en-US"
+
+ [UI]
+ # Name of the app and chatbot.
+ name = "Chatbot"
+
+ # Show the readme while the conversation is empty.
+ show_readme_as_default = true
+
+ # Description of the app and chatbot. This is used for HTML tags.
+ # description = ""
+
+ # Large content is collapsed by default for a cleaner UI
+ default_collapse_content = true
+
+ # The default value for the expand messages setting.
+ default_expand_messages = false
+
+ # Hide the chain-of-thought details from the user in the UI.
+ hide_cot = false
+
+ # Link to your GitHub repo. This will add a GitHub button in the UI's header.
+ github = "https://github.com/LinkedInLearning/hands-on-ai-building-and-deploying-llm-powered-apps-4511409"
+
+ # Specify a CSS file that can be used to customize the user interface.
+ # The CSS file can be served from the public directory or via an external link.
+ # custom_css = "/public/test.css"
+
+ # Override default MUI light theme. (Check theme.ts)
+ [UI.theme.light]
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
+
+ [UI.theme.light.primary]
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
+
+ # Override default MUI dark theme. (Check theme.ts)
+ [UI.theme.dark]
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
+
+ [UI.theme.dark.primary]
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
+
+
+ [meta]
+ generated_by = "0.7.501"
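
The tables above are plain TOML, so the values the app relies on can be inspected programmatically. A minimal sketch (assuming Python 3.11+, where `tomllib` is in the standard library):

```python
# Sketch: read the generated Chainlit config and print the settings
# the app depends on. Assumes Python 3.11+ for tomllib.
import tomllib

with open(".chainlit/config.toml", "rb") as f:
    config = tomllib.load(f)

print(config["project"]["session_timeout"])  # 3600 seconds
print(config["features"]["multi_modal"])     # True: file uploads enabled
print(config["UI"]["name"])                  # "Chatbot"
```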
.env.sample ADDED
@@ -0,0 +1,2 @@
+ ALLOW_RESET=TRUE
+ OPENAI_API_KEY="sk-your-openai-api-key"
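
Chainlit picks up a `.env` file from the project root, and the OpenAI client reads `OPENAI_API_KEY` from the environment. A hedged sketch of verifying this yourself (the `python-dotenv` package is an assumption here; it is not pinned in `requirements.txt`):

```python
# Sketch: load .env (copied from .env.sample) and confirm the key is set.
# Assumes python-dotenv is installed; it is not listed in requirements.txt.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("Copy .env.sample to .env and set OPENAI_API_KEY")
```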
.gitignore CHANGED
@@ -1,4 +1,170 @@
+ # Ruff
+ .ruff_cache/
+
+ # Chainlit
+ .chainlit/.langchain.db
+
+ # Chroma
+ .chromadb/
+
  .DS_Store
  node_modules
  .tmp
  npm-debug.log
+
+ # VSCode
+ .vscode/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
.vscode/settings.json CHANGED
@@ -17,7 +17,6 @@
  "files.autoSave": "afterDelay",
  "screencastMode.onlyKeyboardShortcuts": true,
  "terminal.integrated.fontSize": 18,
- "workbench.activityBar.visible": true,
  "workbench.colorTheme": "Visual Studio Dark",
  "workbench.fontAliasing": "antialiased",
  "workbench.statusBar.visible": true
app/app.py ADDED
@@ -0,0 +1,207 @@
+ # Chroma compatibility issue resolution
+ # https://docs.trychroma.com/troubleshooting#sqlite
+ __import__('pysqlite3')
+ import sys
+ sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+
+ from tempfile import NamedTemporaryFile
+
+ import chainlit as cl
+ from chainlit.types import AskFileResponse
+
+ import chromadb
+ from chromadb.config import Settings
+ from langchain.chains import RetrievalQAWithSourcesChain
+ from langchain.chat_models import ChatOpenAI
+ from langchain.document_loaders import PDFPlumberLoader
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import Chroma
+ from langchain.vectorstores.base import VectorStore
+
+ from prompt import EXAMPLE_PROMPT, PROMPT, WELCOME_MESSAGE
+
+
+ namespaces = set()
+
+
+ def process_file(*, file: AskFileResponse) -> list:
+     if file.type != "application/pdf":
+         raise TypeError("Only PDF files are supported")
+
+     with NamedTemporaryFile() as tempfile:
+         tempfile.write(file.content)
+
+         ######################################################################
+         #
+         # 1. Load the PDF
+         #
+         ######################################################################
+         loader = PDFPlumberLoader(tempfile.name)
+
+         ######################################################################
+         documents = loader.load()
+
+         ######################################################################
+         #
+         # 2. Split the text
+         #
+         ######################################################################
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=3000,
+             chunk_overlap=100
+         )
+         ######################################################################
+
+         docs = text_splitter.split_documents(documents)
+
+         for i, doc in enumerate(docs):
+             doc.metadata["source"] = f"source_{i}"
+
+         if not docs:
+             raise ValueError("PDF file parsing failed.")
+
+         return docs
+
+
+ def create_search_engine(*, file: AskFileResponse) -> VectorStore:
+     # Process and save data in the user session
+     docs = process_file(file=file)
+     cl.user_session.set("docs", docs)
+
+     ##########################################################################
+     #
+     # 3. Set the Encoder model for creating embeddings
+     #
+     ##########################################################################
+     encoder = OpenAIEmbeddings(
+         model="text-embedding-ada-002"
+     )
+     ##########################################################################
+
+     # Initialize the Chroma client and settings; reset to ensure we get a
+     # clean search engine
+     client = chromadb.EphemeralClient()
+     client_settings = Settings(
+         allow_reset=True,
+         anonymized_telemetry=False
+     )
+     search_engine = Chroma(
+         client=client,
+         client_settings=client_settings
+     )
+     search_engine._client.reset()
+
+     ##########################################################################
+     #
+     # 4. Create the document search engine. Remember to add
+     #    client_settings using the above settings.
+     #
+     ##########################################################################
+     search_engine = Chroma.from_documents(
+         client=client,
+         documents=docs,
+         embedding=encoder,
+         client_settings=client_settings
+     )
+     ##########################################################################
+
+     return search_engine
+
+
+ @cl.on_chat_start
+ async def start():
+     files = None
+     while files is None:
+         files = await cl.AskFileMessage(
+             content=WELCOME_MESSAGE,
+             accept=["application/pdf"],
+             max_size_mb=20,
+         ).send()
+
+     file = files[0]
+     msg = cl.Message(content=f"Processing `{file.name}`...")
+     await msg.send()
+
+     try:
+         search_engine = await cl.make_async(create_search_engine)(file=file)
+     except Exception as e:
+         await cl.Message(content=f"Error: {e}").send()
+         raise SystemError
+
+     llm = ChatOpenAI(
+         model='gpt-3.5-turbo-16k-0613',
+         temperature=0,
+         streaming=True
+     )
+
+     ##########################################################################
+     #
+     # 5. Create the chain / tool for RetrievalQAWithSourcesChain.
+     #
+     ##########################################################################
+     chain = RetrievalQAWithSourcesChain.from_chain_type(
+         llm=llm,
+         chain_type="stuff",
+         retriever=search_engine.as_retriever(max_tokens_limit=4097),
+         ######################################################################
+         # 6. Customize prompts to improve summarization and question
+         #    answering performance. Perhaps create your own prompt in
+         #    prompts.py?
+         ######################################################################
+         chain_type_kwargs={
+             "prompt": PROMPT,
+             "document_prompt": EXAMPLE_PROMPT
+         },
+     )
+     ##########################################################################
+
+     msg.content = f"`{file.name}` processed. You can now ask questions!"
+     await msg.update()
+
+     cl.user_session.set("chain", chain)
+
+
+ @cl.on_message
+ async def main(message: cl.Message):
+     chain = cl.user_session.get("chain")  # type: RetrievalQAWithSourcesChain
+     cb = cl.AsyncLangchainCallbackHandler()
+     response = await chain.acall(message.content, callbacks=[cb])
+     answer = response["answer"]
+     sources = response["sources"].strip()
+     source_elements = []
+
+     # Get the documents from the user session
+     docs = cl.user_session.get("docs")
+     metadatas = [doc.metadata for doc in docs]
+     all_sources = [m["source"] for m in metadatas]
+
+     # Adding sources to the answer
+     if sources:
+         found_sources = []
+
+         # Add the sources to the message
+         for source in sources.split(","):
+             source_name = source.strip().replace(".", "")
+             # Get the index of the source
+             try:
+                 index = all_sources.index(source_name)
+             except ValueError:
+                 continue
+             text = docs[index].page_content
+             found_sources.append(source_name)
+             # Create the text element referenced in the message
+             source_elements.append(cl.Text(content=text, name=source_name))
+
+         if found_sources:
+             answer += f"\nSources: {', '.join(found_sources)}"
+         else:
+             answer += "\nNo sources found"
+
+     await cl.Message(content=answer, elements=source_elements).send()
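
The `sources` string returned by `RetrievalQAWithSourcesChain` is free-form model output, which is why `main()` normalizes each token before matching it against the `source_{i}` metadata set in `process_file`. A small self-contained sketch of that matching step, using hypothetical sample values:

```python
# Sketch of the source-matching logic in main(), with made-up inputs.
sources = "source_0, source_2."                   # as the LLM might format them
all_sources = [f"source_{i}" for i in range(3)]   # metadata from process_file

found_sources = []
for source in sources.split(","):
    source_name = source.strip().replace(".", "")  # drop whitespace/periods
    if source_name in all_sources:
        found_sources.append(source_name)

assert found_sources == ["source_0", "source_2"]
```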
app/prompt.py ADDED
@@ -0,0 +1,26 @@
+ # flake8: noqa
+ from langchain.prompts import PromptTemplate
+
+ WELCOME_MESSAGE = """\
+ Welcome to the Introduction to LLM App Development sample PDF QA application!
+ To get started:
+ 1. Upload a PDF file
+ 2. Ask any question about the file!
+ """
+
+ template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
+ If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+ ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
+
+ QUESTION: {question}
+ =========
+ {summaries}
+ =========
+ FINAL ANSWER:"""
+
+ PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
+
+ EXAMPLE_PROMPT = PromptTemplate(
+     template="Content: {page_content}\nSource: {source}",
+     input_variables=["page_content", "source"],
+ )
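
At query time the chain fills `{summaries}` with the retrieved chunks, each rendered through `EXAMPLE_PROMPT` so the model can cite the `source_{i}` identifiers. A usage sketch with hypothetical values:

```python
# Sketch: how one retrieved chunk and the final prompt get rendered.
# The page_content and question values here are hypothetical.
from prompt import EXAMPLE_PROMPT, PROMPT

chunk = EXAMPLE_PROMPT.format(
    page_content="NVIDIA reported record data center revenue.",
    source="source_0",
)
final = PROMPT.format(
    question="What drove revenue growth?",
    summaries=chunk,
)
print(final)
```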
chainlit.md ADDED
@@ -0,0 +1,8 @@
+ # Welcome to your PDF QA Sample Application! 🚀🤖
+
+ Hi Team! 👋 Congratulations on launching your first LLM application. This application is built with OpenAI, LangChain, Chainlit, and Chroma. Its goal is to provide a quick overview of the most basic archetype of an LLM application, along with its prototyping and debugging environment.
+
+ ## Useful Links 🔗
+
+ - **LangChain Documentation:** Get started with the [LangChain documentation](https://python.langchain.com/) 🔗
+ - **Chainlit Documentation:** Get started with the [Chainlit documentation](https://docs.chainlit.io) 📚
requirements.txt CHANGED
@@ -1 +1,9 @@
  # Specify Python package requirements for your project here (e.g., Mako==1.1.1). If your project doesn't require these, you can leave this file unchanged or delete it.
+ openai==1.2.3
+ langchain==0.0.334
+ chainlit==0.7.501
+ tiktoken==0.5.1
+ pdfplumber==0.10.3
+ chromadb==0.4.17
+ pysqlite3-binary==0.5.2.post1
+ ruff==0.1.5
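
These pins target the 0.0.x LangChain and 0.7.x Chainlit APIs used in `app/app.py` (e.g. `chain.acall` and `cl.AskFileMessage`), so upgrading any of them independently may break the app. A quick sanity-check sketch for the active environment after `pip install -r requirements.txt`:

```python
# Sketch: confirm the pinned packages resolve before launching the app.
import importlib.metadata as metadata

for pkg in ("openai", "langchain", "chainlit", "chromadb", "pdfplumber"):
    print(pkg, metadata.version(pkg))
```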
sample_pdf/NVDA 2QFY24.pdf ADDED
Binary file (85.3 kB)