import os

from apify_client import ApifyClient
from langchain.docstore.document import Document
from langchain.document_loaders import ApifyDatasetLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Access variables and secrets as environment variables
WEBSITE_URL = os.environ.get('WEBSITE_URL')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')

if __name__ == '__main__':
    apify_client = ApifyClient(APIFY_API_TOKEN)

    # Crawl the website with the Website Content Crawler Actor;
    # this call blocks until the Actor run finishes.
    print(f'Extracting data from "{WEBSITE_URL}". Please wait...')
    actor_run_info = apify_client.actor('apify/website-content-crawler').call(
        run_input={'startUrls': [{'url': WEBSITE_URL}]}
    )

    # Load the crawled pages from the run's default dataset,
    # mapping each dataset item to a LangChain Document.
    print('Saving data into the vector database. Please wait...')
    loader = ApifyDatasetLoader(
        dataset_id=actor_run_info['defaultDatasetId'],
        dataset_mapping_function=lambda item: Document(
            page_content=item['text'] or '', metadata={'source': item['url']}
        ),
    )
    documents = loader.load()

    # Split the documents into overlapping chunks so each one fits
    # comfortably within the embedding model's context.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    # Embed the chunks and persist them to a local Chroma database.
    # Note: OpenAIEmbeddings takes the key as `openai_api_key`, not `api_key`.
    embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory='db2',
    )
    vectordb.persist()
    print('All done!')
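
    # A minimal sanity check (an illustrative addition, not part of the
    # script above): reopen the persisted database and run a similarity
    # search against it. This assumes the same embedding model and the
    # 'db2' directory created above; the query string is made up.
    vectordb = Chroma(persist_directory='db2', embedding_function=embedding)
    results = vectordb.similarity_search('What is this website about?', k=3)
    for doc in results:
        print(doc.metadata['source'], '->', doc.page_content[:100])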