# CADomatic/src/rag_builder.py
import os
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv

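# Pull optional settings (API keys, etc.) from a local .env file, if present.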
load_dotenv()
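
# Crawl seeds: the FreeCAD wiki's Power Users Hub and the Fasteners workbench repo.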
BASE_URL_WIKI = "https://wiki.freecad.org/Power_users_hub"
BASE_URL_GITHUB = "https://github.com/shaise/FreeCAD_FastenersWB"
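
# Only links that start with one of these prefixes are followed.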
DOMAIN_WHITELIST = [
"https://wiki.freecad.org",
"https://github.com/shaise"
]
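
# Trailing language codes used by translated wiki pages; these duplicate the
# English pages and are skipped by is_excluded_url below.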
LANG_IDENTIFIERS = [
"/id", "/de", "/tr", "/es", "/fr", "/hr", "/it", "/pl",
"/pt", "/pt-br", "/ro", "/fi", "/sv", "/cs", "/ru", "/zh-cn",
"/zh-tw", "/ja", "/ko"
]
CHECKPOINT_INTERVAL = 500 # save every 500 pages
VECTORSTORE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../vectorstore")
os.makedirs(VECTORSTORE_PATH, exist_ok=True)
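
# A URL is skipped if it is a translated wiki page, an image, or a MediaWiki
# section-edit link; none of these add useful text to the index.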
def is_excluded_url(url):
    url_lower = url.lower()
    if "wiki.freecad.org" in url_lower:
        # Translated pages end with a language suffix (e.g. "/Part_Box/de").
        # Match only the trailing suffix so pages like "/Itemization" are not
        # excluded by a substring hit on "/it".
        if any(url_lower.endswith(lang) for lang in LANG_IDENTIFIERS):
            return True
    return (
        ".jpg" in url_lower or
        ".png" in url_lower or
        "edit&section" in url_lower
    )
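
# Breadth-first crawl from start_url, restricted to DOMAIN_WHITELIST.
# Returns a list of {"url": ..., "text": ...} dicts and saves an intermediate
# vectorstore checkpoint every CHECKPOINT_INTERVAL pages so a crash or
# interruption does not lose the whole run.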
def crawl_wiki(start_url, max_pages):
    visited = set()
    queued = {start_url}
    to_visit = deque([start_url])
    pages = []
    while to_visit and len(visited) < max_pages:
        url = to_visit.popleft()
        if url in visited or is_excluded_url(url):
            continue
        try:
            print(f"Fetching: {url}")
            res = requests.get(url, timeout=30)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "html.parser")
            visited.add(url)
            # Drop non-content elements before extracting text.
            for tag in soup(["script", "style", "header", "footer", "nav", "aside"]):
                tag.extract()
            text = soup.get_text(separator="\n")
            clean = "\n".join(line.strip() for line in text.splitlines() if line.strip())
            pages.append({"url": url, "text": clean})
            # Queue internal links, deduplicating so the queue does not balloon.
            for a in soup.find_all("a", href=True):
                full = urljoin(url, a["href"])
                if any(full.startswith(domain) for domain in DOMAIN_WHITELIST):
                    if full not in visited and full not in queued and not is_excluded_url(full):
                        queued.add(full)
                        to_visit.append(full)
            # --- Checkpoint: save every N pages ---
            if len(pages) % CHECKPOINT_INTERVAL == 0:
                save_vectorstore_checkpoint(pages, checkpoint_suffix=f"checkpoint_{len(pages)}")
                print(f"Checkpoint saved after {len(pages)} pages")
        except Exception as e:
            print(f"Error fetching {url}: {e}")
    return pages
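
# Split all crawled pages into overlapping chunks, embed them with a local
# MiniLM sentence-transformer, and save the resulting FAISS index under
# vectorstore/<checkpoint_suffix>. Note that each call re-embeds every page
# passed in, which is simple but gets slower at later checkpoints.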
def save_vectorstore_checkpoint(pages, checkpoint_suffix="latest"):
    texts = [p["text"] for p in pages]
    metadatas = [{"source": p["url"]} for p in pages]
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = splitter.create_documents(texts, metadatas=metadatas)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs, embeddings)
    checkpoint_path = os.path.join(VECTORSTORE_PATH, checkpoint_suffix)
    os.makedirs(checkpoint_path, exist_ok=True)
    vectorstore.save_local(checkpoint_path)
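
# Crawl both sources and write the final combined index.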
def build_vectorstore():
    wiki_pages = crawl_wiki(BASE_URL_WIKI, max_pages=2000)
    github_pages = crawl_wiki(BASE_URL_GITHUB, max_pages=450)
    all_pages = wiki_pages + github_pages
    if not all_pages:
        print("No pages crawled. Exiting.")
        return
    # Final save
    save_vectorstore_checkpoint(all_pages, checkpoint_suffix="final")
    print(f"Vectorstore fully saved to {VECTORSTORE_PATH}/final")
if __name__ == "__main__":
    build_vectorstore()