# Spaces: Sleeping
import os
import sys
import time

from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Build a FAISS vector index from 'data.txt' using Gemini embeddings and save
# it to the 'faiss_index' directory. The process exits with a non-zero status
# when there is nothing to index or when building the index fails, so callers
# (shells, CI jobs) can detect the failure.

load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    # Warn rather than abort: whether the embeddings client can obtain
    # credentials some other way is not visible from this script.
    print(
        "Warning: GEMINI_API_KEY is not set; embedding requests may fail.",
        file=sys.stderr,
    )

# --- 1. Load Document ---
print("Loading document...")
loader = TextLoader("data.txt")
documents = loader.load()

# --- 2. Split Document ---
print("Splitting text...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

if not texts:
    # Nothing to embed means no index can be produced; report it as a failure.
    print("No text found in 'data.txt'. Exiting.", file=sys.stderr)
    sys.exit(1)

# --- 3. Create Embeddings & Vector Store ---
print("Initializing embeddings...")
embeddings = GoogleGenerativeAIEmbeddings(
    model="gemini-embedding-001",
    google_api_key=api_key,
)

# Number of chunks sent per call. Kept below the 100-item request limit cited
# by the original author for Google's API (TODO: confirm against current docs).
batch_size = 90

db = None
try:
    # The index has to be created from a first batch before more can be added.
    first_batch = texts[:batch_size]
    print(f"Creating vector store with initial batch (0 to {len(first_batch)})...")
    db = FAISS.from_documents(first_batch, embeddings)

    # Append the remaining chunks to the existing index, one batch at a time.
    for i in range(batch_size, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        print(f"Adding batch {i} to {i + len(batch)}...")
        db.add_documents(batch)
        # If rate-limit errors still occur, pause briefly between batches
        # here (the `time` module is already imported for that purpose).
except Exception as e:
    # Top-level boundary of the script: report the failure and exit non-zero
    # instead of falling through as if the run had succeeded.
    print("\n--- AN ERROR OCCURRED ---", file=sys.stderr)
    print(e, file=sys.stderr)
    print("\nThis was likely a network timeout or API issue.", file=sys.stderr)
    print(
        "Please check your firewall/VPN settings and try running the script again.",
        file=sys.stderr,
    )
    sys.exit(1)

# --- 4. Save the final vector store ---
# Done outside the try block so that a failure while saving is not
# misreported as a network/API problem.
db.save_local("faiss_index")
print("\nDone. Vector store saved as 'faiss_index'")