# "Spaces: Sleeping" — page-status text captured during extraction (UI chrome,
# not source code); preserved here as a comment so the file parses.
import os

from apify_client import ApifyClient
from langchain.document_loaders import ApifyDatasetLoader
from langchain.document_loaders.base import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Configuration comes in as environment variables (e.g. Apify Actor
# variables/secrets). Any of these is None when unset — the entry point
# below validates them before doing any work.
WEBSITE_URL = os.environ.get('WEBSITE_URL')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
APIFY_API_TOKEN = os.environ.get('APIFY_API_TOKEN')
if __name__ == '__main__':
    # Pipeline: crawl WEBSITE_URL with an Apify Actor, load the scraped text,
    # split it into chunks, embed with OpenAI, and persist into a local
    # Chroma vector store.

    # Fail fast with a clear message when configuration is missing —
    # otherwise the Actor call / embedding calls fail later with opaque
    # remote-API errors.
    for var_name, var_value in (
        ('WEBSITE_URL', WEBSITE_URL),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('APIFY_API_TOKEN', APIFY_API_TOKEN),
    ):
        if not var_value:
            raise SystemExit(f'Missing required environment variable: {var_name}')

    apify_client = ApifyClient(APIFY_API_TOKEN)

    print(f'Extracting data from "{WEBSITE_URL}". Please wait...')
    # .call() blocks until the crawler run finishes; the scraped pages land
    # in the run's default dataset.
    actor_run_info = apify_client.actor('apify/website-content-crawler').call(
        run_input={'startUrls': [{'url': WEBSITE_URL}]}
    )

    print('Saving data into the vector database. Please wait...')
    loader = ApifyDatasetLoader(
        dataset_id=actor_run_info['defaultDatasetId'],
        # Map each dataset item to a LangChain Document. 'text' may be None
        # (or absent) for pages with no extractable content, so fall back to
        # an empty string instead of raising.
        dataset_mapping_function=lambda item: Document(
            page_content=item.get('text') or '', metadata={'source': item['url']}
        ),
    )
    documents = loader.load()

    # Chunk the documents for embedding/retrieval; overlap keeps context
    # across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    # NOTE(review): older langchain releases spell this keyword
    # `openai_api_key` rather than `api_key` — confirm against the pinned
    # langchain version before deploying.
    embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory='db2',
    )
    # Flush the store to disk so the 'db2' directory survives this process.
    vectordb.persist()
    print('All done!')