Shad0ws committed on
Commit
f402e2d
·
1 Parent(s): 66c323a

Upload 6 files

Browse files
Files changed (6) hide show
  1. .vscode/settings.json +3 -0
  2. app.py +74 -0
  3. knowledge_base.py +100 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +27 -0
  6. requirements.txt +2 -0
.vscode/settings.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "python.linting.enabled": false
3
+ }
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+ from knowledge_base import KnowledgeBase
5
+
6
# Browser tab metadata and the on-page heading.
st.set_page_config(page_title="Website to AI-Powered Knowledge Base", page_icon="🐍")
st.title("AI-Powered Knowledge Base")

# Padding overrides injected as raw CSS to remove whitespace from the top of
# the page and the sidebar. The two selectors are Streamlit-generated class
# names -- presumably tied to a specific Streamlit release; verify after
# upgrades.
st.markdown(
    """
<style>
.css-18e3th9 {
padding-top: 0rem;
padding-bottom: 10rem;
padding-left: 5rem;
padding-right: 5rem;
}
.css-1d391kg {
padding-top: 3.5rem;
padding-right: 1rem;
padding-bottom: 3.5rem;
padding-left: 1rem;
}
</style>
""",
    unsafe_allow_html=True,
)

st.markdown("## Config")

# Two side-by-side inputs: the sitemap location and a URL filter.
col1, col2 = st.columns(2)
sitemap_url = col1.text_input("URL to the website sitemap", value="")
pattern = col2.text_input("URL filter pattern (optional)", value="")


st.markdown("## Ask")
43
+
44
+
45
@st.cache_resource
def get_knowledge_base(url, pattern):
    """Build the knowledge base for a sitemap, cached by Streamlit.

    Args:
        url: Address of the sitemap; forwarded as ``sitemap_url``.
        pattern: URL filter forwarded to ``KnowledgeBase`` unchanged.

    Returns:
        A ``KnowledgeBase`` built with a chunk size of 8000 and a chunk
        overlap of 3000.
    """
    settings = {
        "sitemap_url": url,
        "pattern": pattern,
        "chunk_size": 8000,
        "chunk_overlap": 3000,
    }
    return KnowledgeBase(**settings)
53
+
54
+
55
@st.cache_resource
def get_answer(url, pattern, query):
    """Answer a question against the knowledge base for a sitemap.

    Args:
        url: Address of the sitemap the knowledge base is built from.
        pattern: URL filter passed through to ``get_knowledge_base``.
        query: The question to ask.

    Returns:
        Whatever ``KnowledgeBase.ask`` returns for ``query``.
    """
    # Use the ``url`` argument rather than the module-level ``sitemap_url``:
    # the cache is keyed on this function's arguments, so the knowledge base
    # consulted must be derived from those same arguments.
    kb = get_knowledge_base(url, pattern)
    return kb.ask(query)
59
+
60
+
61
# The filter pattern is labelled "(optional)" in the UI, so only the sitemap
# URL is required before the knowledge base is built.
if sitemap_url:
    with st.spinner("Getting the knowledge base ready, this may take a bit ..."):
        # Called for its caching side effect; the instance is fetched again
        # inside get_answer().
        get_knowledge_base(sitemap_url, pattern)

    query = st.text_input("Question", value="")

    if query:
        with st.spinner("Getting the answer ..."):
            result = get_answer(sitemap_url, pattern, query)

        st.markdown("### Answer")
        st.markdown(result["answer"])
        st.markdown("### Sources")
        # One bullet per non-blank line of the sources string, so an empty or
        # trailing-newline value does not render empty bullets.
        sources = [x.strip() for x in result["sources"].split("\n") if x.strip()]
        st.markdown("\n ".join([f"- {x}" for x in sources]))
knowledge_base.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ from langchain.embeddings.openai import OpenAIEmbeddings
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.vectorstores import Chroma
6
+ from langchain.document_loaders import UnstructuredURLLoader
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.chains import RetrievalQAWithSourcesChain
9
+
10
+ import requests
11
+ import xml.etree.ElementTree as ET
12
+ from dotenv import load_dotenv
13
+ from loguru import logger
14
+
15
+ load_dotenv()
16
+
17
+
18
def extract_urls_from_sitemap(sitemap):
    """
    Extract all URLs from a sitemap XML string.

    Only ``<url>`` elements that are direct children of the root and that
    carry a non-empty ``<loc>`` are considered; entries without one are
    skipped instead of raising ``AttributeError``.

    Args:
        sitemap (str): The sitemap XML string.

    Returns:
        A list of URLs extracted from the sitemap, with surrounding
        whitespace removed.

    Raises:
        xml.etree.ElementTree.ParseError: If ``sitemap`` is not well-formed XML.
    """
    # Parse the XML from the string
    root = ET.fromstring(sitemap)

    # Define the namespace for the sitemap XML
    namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}

    urls = []
    for entry in root.findall("ns:url", namespace):
        loc = entry.find("ns:loc", namespace)
        if loc is None or loc.text is None:
            # Malformed entry: nothing to extract.
            continue
        # Pretty-printed sitemaps may pad the URL with newlines/indentation.
        address = loc.text.strip()
        if address:
            urls.append(address)

    return urls
41
+
42
+
43
class KnowledgeBase:
    """Question-answering chain built from the pages listed in a sitemap.

    Construction downloads the sitemap, loads every (optionally filtered)
    URL, splits the content into chunks, stores the chunks in a Chroma
    vector store using OpenAI embeddings, and wires a
    ``RetrievalQAWithSourcesChain`` on top of that store.
    """

    def __init__(
        self,
        sitemap_url: str,
        chunk_size: int,
        chunk_overlap: int,
        pattern: Optional[str] = None,
    ):
        """Build the knowledge base.

        Args:
            sitemap_url: Address of the sitemap XML to download.
            chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
            chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
            pattern: When truthy, only URLs containing this substring are kept.

        Raises:
            requests.RequestException: If the sitemap cannot be downloaded
                or the server answers with an error status.
        """
        logger.info("Building the knowledge base ...")

        logger.info("Loading sitemap from {sitemap_url} ...", sitemap_url=sitemap_url)
        # Bound the wait and fail loudly on HTTP errors, rather than hanging
        # or handing an error page to the XML parser.
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()
        urls = extract_urls_from_sitemap(response.text)

        if pattern:
            logger.info("Filtering URLs with pattern {pattern} ...", pattern=pattern)
            # Plain substring match, not a regex or glob.
            urls = [x for x in urls if pattern in x]
        logger.info("{n} URLs extracted", n=len(urls))
        if not urls:
            logger.warning("No URLs to load; the knowledge base will be empty")

        logger.info("Loading URLs content ...")
        loader = UnstructuredURLLoader(urls)
        data = loader.load()

        logger.info("Splitting documents in chunks ...")
        doc_splitter = CharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        docs = doc_splitter.split_documents(data)
        logger.info("{n} chunks created", n=len(docs))

        logger.info("Building the vector database ...")
        embeddings = OpenAIEmbeddings()
        docsearch = Chroma.from_documents(docs, embeddings)

        logger.info("Building the retrieval chain ...")
        self.chain = RetrievalQAWithSourcesChain.from_chain_type(
            ChatOpenAI(),
            chain_type="map_reduce",
            retriever=docsearch.as_retriever(),
        )

        logger.info("Knowledge base created!")

    def ask(self, query: str):
        """Run ``query`` through the retrieval chain and return its outputs.

        Args:
            query: The question, sent to the chain under the ``"question"`` key.

        Returns:
            The chain's outputs only (``return_only_outputs=True``).
        """
        return self.chain({"question": query}, return_only_outputs=True)
88
+
89
+
90
if __name__ == "__main__":
    # Manual smoke test: build a knowledge base and ask one question.
    kb = KnowledgeBase(
        sitemap_url="https://nextjs.org/sitemap.xml",
        pattern="docs/api-refe",
        chunk_size=8000,
        chunk_overlap=3000,
    )

    # Ask a question and surface the result, which was previously discarded.
    res = kb.ask("How do I deploy my Next.js app?")
    logger.info("Result: {res}", res=res)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "website-to-knowledge-base"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["JimZer <[email protected]>"]
6
+ readme = "README.md"
7
+ packages = [{include = "website_to_knowledge_base"}]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.10"
11
+ openai = "^0.27.4"
12
+ langchain = "^0.0.144"
13
+ unstructured = "^0.5.13"
14
+ chromadb = "^0.3.21"
15
+ tiktoken = "^0.3.3"
16
+ python-dotenv = "^1.0.0"
17
+ loguru = "^0.7.0"
18
+ streamlit = "^1.21.0"
19
+
20
+
21
+ [tool.poetry.group.dev.dependencies]
22
+ black = "^23.3.0"
23
+ isort = "^5.12.0"
24
+
25
+ [build-system]
26
+ requires = ["poetry-core"]
27
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit
2
+ poetry