MathWizard1729 committed
Commit 9ce288c · verified · 1 parent: db62c2d

Delete indexer.py

Files changed (1)
  1. indexer.py +0 -145
indexer.py DELETED
@@ -1,145 +0,0 @@
-
-import os
-import logging
-from dotenv import load_dotenv
-from PyPDF2 import PdfReader
-from langchain.docstore.document import Document
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import BedrockEmbeddings
-from langchain_chroma import Chroma
-from botocore.exceptions import ClientError
-
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def load_environment():
-    """Load environment variables from .env file or system environment."""
-    load_dotenv()
-    required_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION']
-    for var in required_vars:
-        if not os.getenv(var):
-            logger.error(f"Missing environment variable: {var}")
-            raise ValueError(f"Missing environment variable: {var}")
-    logger.info("Environment variables loaded successfully")
-
-def load_uploaded_pdfs(uploaded_files):
-    """Load and extract text from uploaded PDF files."""
-    documents = []
-    pdf_count = 0
-    try:
-        for file_path in uploaded_files:
-            pdf_count += 1
-            file_name = os.path.basename(file_path)
-            logger.info(f"Loading uploaded PDF: {file_name}")
-            # Open the file from the provided path
-            with open(file_path, 'rb') as pdf_file:
-                pdf_reader = PdfReader(pdf_file)
-                # Extract text from each page
-                text = ""
-                for page_num, page in enumerate(pdf_reader.pages):
-                    page_text = page.extract_text() or ""
-                    text += page_text
-                    # Create a LangChain Document for each page
-                    documents.append(Document(
-                        page_content=page_text,
-                        metadata={"source": file_name, "page": page_num + 1}
-                    ))
-                if not text.strip():
-                    logger.warning(f"No text extracted from {file_name}")
-        if not documents:
-            logger.warning("No PDF files provided or no text extracted")
-        else:
-            logger.info(f"Loaded {len(documents)} pages from {pdf_count} PDFs")
-        return documents, pdf_count
-    except Exception as e:
-        logger.error(f"Error loading PDFs: {str(e)}")
-        raise
-
-def split_documents(documents):
-    """Split documents into chunks for embedding."""
-    try:
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=200,
-            length_function=len
-        )
-        chunks = text_splitter.split_documents(documents)
-        logger.info(f"Split documents into {len(chunks)} chunks")
-        return chunks
-    except Exception as e:
-        logger.error(f"Error splitting documents: {str(e)}")
-        raise
-
-def initialize_embeddings():
-    """Initialize Amazon Bedrock embeddings."""
-    try:
-        embeddings = BedrockEmbeddings(
-            model_id="amazon.titan-embed-text-v1",
-            region_name=os.getenv("AWS_REGION")
-        )
-        logger.info("Initialized Bedrock embeddings")
-        return embeddings
-    except ClientError as e:
-        logger.error(f"Error initializing Bedrock embeddings: {str(e)}")
-        raise
-
-def store_in_chroma(chunks, embeddings, db_directory="./chroma_db", collection_name="pdf_rag"):
-    """Store document chunks and embeddings in Chroma vector database."""
-    try:
-        # Clear existing Chroma database if it exists
-        if os.path.exists(db_directory):
-            import shutil
-            shutil.rmtree(db_directory)
-            logger.info(f"Cleared existing Chroma database at {db_directory}")
-        os.makedirs(db_directory, exist_ok=True)
-
-        vector_store = Chroma.from_documents(
-            documents=chunks,
-            embedding=embeddings,
-            collection_name=collection_name,
-            persist_directory=db_directory
-        )
-        logger.info(f"Stored {len(chunks)} chunks in Chroma vector database at {db_directory}")
-        return vector_store
-    except Exception as e:
-        logger.error(f"Error storing in Chroma: {str(e)}")
-        raise
-
-def index_uploaded_pdfs(uploaded_files, db_directory="./chroma_db"):
-    """Index uploaded PDF files and return vector store and summary."""
-    try:
-        # Load environment variables
-        load_environment()
-
-        # Load and process PDFs
-        documents, pdf_count = load_uploaded_pdfs(uploaded_files)
-        if not documents:
-            return None, {"pdf_count": 0, "page_count": 0, "chunk_count": 0, "db_location": db_directory}
-
-        # Split documents into chunks
-        chunks = split_documents(documents)
-
-        # Initialize embeddings
-        embeddings = initialize_embeddings()
-
-        # Store in Chroma
-        vector_store = store_in_chroma(chunks, embeddings, db_directory)
-
-        # Summary
-        summary = {
-            "pdf_count": pdf_count,
-            "page_count": len(documents),
-            "chunk_count": len(chunks),
-            "db_location": db_directory
-        }
-        logger.info("Indexing Summary:")
-        logger.info(f"  Number of PDFs processed: {summary['pdf_count']}")
-        logger.info(f"  Total pages loaded: {summary['page_count']}")
-        logger.info(f"  Total chunks created: {summary['chunk_count']}")
-        logger.info(f"  Chroma database location: {summary['db_location']}")
-        return vector_store, summary
-
-    except Exception as e:
-        logger.error(f"Indexing failed: {str(e)}")
-        raise
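
For reference, a minimal sketch of how the removed entry point, index_uploaded_pdfs, could have been driven before this deletion. The PDF path and query string below are hypothetical placeholders, and the call assumes the AWS credentials that load_environment checks for are set in the environment:

# Hypothetical driver for the deleted module; "sample.pdf" and the query are
# placeholders, and AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION
# must be set (e.g. via a .env file) for load_environment to pass.
from indexer import index_uploaded_pdfs

vector_store, summary = index_uploaded_pdfs(["sample.pdf"], db_directory="./chroma_db")
print(summary)  # pdf_count / page_count / chunk_count / db_location

if vector_store is not None:
    # The return value is a langchain_chroma Chroma instance, so a standard
    # similarity search runs against the freshly indexed chunks.
    for doc in vector_store.similarity_search("example query", k=3):
        print(doc.metadata["source"], doc.metadata["page"], doc.page_content[:80])

Note that store_in_chroma wipes any existing database at db_directory before writing, so each call re-indexes from scratch and the store only ever reflects the most recent batch of uploads.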