# chatwebsite_2 / scrape.py
# antfraia's picture
# Update scrape.py
# 9b0553b
# raw
# history blame
# 1.59 kB
import os
from apify_client import ApifyClient
from langchain.document_loaders import ApifyDatasetLoader
from langchain.document_loaders.base import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Runtime configuration is taken from the process environment; each value is
# None when the corresponding variable is not set.
WEBSITE_URL = os.getenv('WEBSITE_URL')  # start URL handed to the crawler
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # secret for the embeddings API
APIFY_API_TOKEN = os.getenv('APIFY_API_TOKEN')  # secret for the Apify client
def _item_to_document(item):
    """Convert one crawler dataset item into a LangChain ``Document``.

    The item's ``text`` becomes the page content (a missing or null value
    becomes an empty string) and its ``url`` is stored as the ``source``
    metadata entry.
    """
    return Document(
        page_content=item.get('text') or '',
        metadata={'source': item['url']},
    )


def main(website_url, openai_api_key, apify_api_token, persist_directory='db2'):
    """Crawl a website with Apify and store its text in a Chroma database.

    Steps: run the ``apify/website-content-crawler`` actor on ``website_url``,
    load the resulting dataset as documents, split them into chunks, embed
    the chunks with OpenAI embeddings and persist the vector store.

    Args:
        website_url: Start URL passed to the crawler.
        openai_api_key: API key handed to ``OpenAIEmbeddings``.
        apify_api_token: Token handed to ``ApifyClient``.
        persist_directory: Directory the Chroma database is written to.

    Raises:
        ValueError: If any of the three required settings is empty or None.
        RuntimeError: If the crawler run returns no dataset id to load from.
    """
    # Validate up front so a missing setting fails with a clear message
    # before any external service is contacted.
    settings = {
        'WEBSITE_URL': website_url,
        'OPENAI_API_KEY': openai_api_key,
        'APIFY_API_TOKEN': apify_api_token,
    }
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise ValueError(
            'Missing required environment variable(s): ' + ', '.join(missing)
        )

    apify_client = ApifyClient(apify_api_token)
    print(f'Extracting data from "{website_url}". Please wait...')
    actor_run_info = apify_client.actor('apify/website-content-crawler').call(
        run_input={'startUrls': [{'url': website_url}]}
    )
    # NOTE(review): the Apify client appears to type call() as possibly
    # returning None -- confirm; guard so we never subscript a missing run.
    if not actor_run_info or not actor_run_info.get('defaultDatasetId'):
        raise RuntimeError(
            'The Apify crawler run did not return a dataset to load.'
        )

    print('Saving data into the vector database. Please wait...')
    loader = ApifyDatasetLoader(
        dataset_id=actor_run_info['defaultDatasetId'],
        dataset_mapping_function=_item_to_document,
    )
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100
    )
    docs = text_splitter.split_documents(documents)

    # ``openai_api_key`` is the field name of OpenAIEmbeddings; the shorter
    # ``api_key`` spelling is presumably only accepted by newer releases.
    embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory=persist_directory,
    )
    vectordb.persist()
    print('All done!')


if __name__ == '__main__':
    main(WEBSITE_URL, OPENAI_API_KEY, APIFY_API_TOKEN)