File size: 21,699 Bytes
872c2a9
3f61806
 
 
 
 
 
 
 
 
 
 
 
 
872c2a9
3f61806
24ae72d
 
3f61806
 
7361b6f
 
 
 
5bda5ed
b9ccd0b
 
 
7361b6f
5bda5ed
b9ccd0b
 
7361b6f
 
5bda5ed
 
b9ccd0b
 
7361b6f
b9ccd0b
 
 
 
 
 
 
 
 
 
 
872c2a9
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
872c2a9
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f61806
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7361b6f
b9ccd0b
 
 
 
 
 
 
 
872c2a9
 
b9ccd0b
 
 
 
 
 
 
 
7361b6f
b9ccd0b
 
 
 
 
 
 
 
 
 
872c2a9
7361b6f
b9ccd0b
 
 
872c2a9
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bda5ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7361b6f
b9ccd0b
 
 
 
 
 
5bda5ed
 
3f61806
 
5bda5ed
 
 
 
 
 
b9ccd0b
5bda5ed
 
 
 
 
 
b9ccd0b
5bda5ed
 
 
 
b9ccd0b
5bda5ed
 
 
 
 
 
 
 
 
 
b9ccd0b
5bda5ed
 
 
b9ccd0b
5bda5ed
 
 
 
b9ccd0b
 
5bda5ed
 
 
b9ccd0b
5bda5ed
 
7361b6f
5bda5ed
 
 
 
 
 
 
 
b9ccd0b
5bda5ed
 
 
 
 
 
 
 
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
7361b6f
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
872c2a9
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f61806
 
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
7361b6f
 
 
 
 
 
 
b9ccd0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7361b6f
b9ccd0b
7361b6f
5bda5ed
b9ccd0b
7361b6f
 
5bda5ed
b9ccd0b
3f61806
b9ccd0b
 
872c2a9
3f61806
872c2a9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
# /home/bk_anupam/code/LLM_agents/RAG_BOT/bot.py
import telebot
import sys
from telebot.types import Message, Update
from datetime import datetime
import re
import os
from flask import Flask, request, jsonify

# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)

from config import Config
from RAG_BOT.logger import logger
from vector_store import VectorStore
# Updated import for build_agent
from RAG_BOT.agent.graph_builder import build_agent
from langchain_core.messages import HumanMessage
from message_handler import MessageHandler
from RAG_BOT.utils import detect_document_language
from RAG_BOT.file_manager import FileManager 
from RAG_BOT.document_indexer import DocumentIndexer 
from RAG_BOT.pdf_processor import PdfProcessor
from RAG_BOT.htm_processor import HtmProcessor # Added import


class TelegramBotApp:
    def __init__(self, config: Config, vector_store_instance: VectorStore, agent, 
                 handler: MessageHandler, pdf_processor: PdfProcessor = None, htm_processor: HtmProcessor = None):
        """
        Build the Flask app and the Telegram bot, and register all handlers.

        Args:
            config: Application configuration. TELEGRAM_BOT_TOKEN and DATA_PATH
                are read here.
            vector_store_instance: Vector store wrapper; the result of its
                get_vectordb() call is kept as ``self.vectordb``.
            agent: Agent object, stored as ``self.agent``.
            handler: Message handler, stored as ``self.handler``.
            pdf_processor: PDF loader; a new PdfProcessor is created when omitted.
            htm_processor: HTML loader; a new HtmProcessor is created when omitted.

        Raises:
            SystemExit: With status 1 when the bot token is missing, or when
                creating the bot, the webhook route or the handler
                registrations fails.
        """
        # Initialize Flask app
        self.app = Flask(__name__)
        self.config = config
        self.vector_store_instance = vector_store_instance
        self.pdf_processor = pdf_processor or PdfProcessor()
        self.htm_processor = htm_processor or HtmProcessor()

        # Use injected dependencies
        self.vectordb = vector_store_instance.get_vectordb()
        self.agent = agent
        self.handler = handler

        self.DATA_DIRECTORY = self.config.DATA_PATH
        logger.info(f"Data directory set to: {self.DATA_DIRECTORY}")

        if not self.config.TELEGRAM_BOT_TOKEN:
            logger.error("TELEGRAM_BOT_TOKEN is not set. Please set it in your environment variables.")
            # sys.exit rather than the site-provided exit(): the latter is an
            # interactive-shell helper and is not guaranteed to exist.
            sys.exit(1)

        try:
            # Create Telegram bot instance
            self.bot = telebot.TeleBot(self.config.TELEGRAM_BOT_TOKEN)
            logger.info("Telegram bot initialized successfully")

            # The webhook route needs both self.bot and self.config, so it is
            # installed only after the bot exists.
            self._setup_webhook_route()
            logger.info("Webhook route set up successfully")

            # Command and document handlers first, the catch-all last.
            # NOTE(review): presumably telebot tries handlers in registration
            # order, which is why the catch-all must stay last - confirm.
            self.bot.register_message_handler(self.send_welcome, commands=['start'])
            self.bot.register_message_handler(self.send_help, commands=['help'])
            self.bot.register_message_handler(self.handle_language_command, commands=['language'])
            self.bot.register_message_handler(self.handle_document, content_types=['document'])
            self.bot.register_message_handler(self.handle_all_messages, func=lambda message: True)
            logger.info("Message handlers registered successfully")

        except Exception as e:
            logger.critical(f"Failed during application startup: {str(e)}", exc_info=True)
            sys.exit(1)


    def _setup_webhook_route(self):
        """
        Register the Flask POST route that receives Telegram webhook updates.

        The route path is the bot token (``/<TELEGRAM_BOT_TOKEN>``). A JSON
        request is parsed into a telebot ``Update`` and passed to
        ``self.bot.process_new_updates``.

        Responses produced by the route:
            200 ``{"status": "ok"}`` when the update was processed.
            400 when the request media type is not ``application/json``.
            500 when parsing or processing the update raised.
        """
        @self.app.route(f'/{self.config.TELEGRAM_BOT_TOKEN}', methods=['POST'])
        def webhook():
            """Handle incoming webhook requests from Telegram"""
            # request.mimetype is the Content-Type without parameters and in
            # lower case, so "application/json; charset=utf-8" is accepted too.
            if request.mimetype != 'application/json':
                logger.warning(f"Received invalid content type for webhook: {request.headers.get('content-type')}")
                return jsonify({"status": "error", "message": "Invalid content type"}), 400

            logger.info("Received webhook request")
            try:
                json_data = request.get_json()
                update = Update.de_json(json_data)
                self.bot.process_new_updates([update])
                return jsonify({"status": "ok"})
            except Exception as e:
                logger.error(f"Error processing webhook update: {e}", exc_info=True)
                return jsonify({"status": "error", "message": "Internal server error"}), 500


    def send_response(self, message, user_id, response_text):
        """
        Deliver ``response_text`` to the user, split into Telegram-sized pieces.

        An empty response is replaced by a fixed apology. The first piece is
        sent as a reply to ``message``; the remaining pieces are sent as plain
        messages to the same chat. Send failures are logged and not re-raised.
        """
        if not response_text:
            logger.warning(f"Attempted to send empty response to user {user_id}")
            response_text = "Sorry, I could not generate a response."

        # Maximum allowed message length in Telegram (adjust if needed)
        max_telegram_length = 4096
        chunks = []
        for start in range(0, len(response_text), max_telegram_length):
            chunks.append(response_text[start:start + max_telegram_length])

        try:
            if chunks:
                head = chunks[0]
                tail = chunks[1:]
                logger.info(f"Sending response to user {user_id}: {head[:100]}...")
                self.bot.reply_to(message, head)
                for piece in tail:
                    self.bot.send_message(message.chat.id, piece)
        except telebot.apihelper.ApiException as e:
            logger.error(f"Error sending message chunk to user {user_id} in chat {message.chat.id}: {str(e)}")
            # Best effort: tell the user that the answer could not be delivered.
            try:
                self.bot.reply_to(message, "Sorry, there was an error sending the full response.")
            except Exception:
                logger.error(f"Failed even to send error notification to user {user_id}")
        except Exception as e:
            logger.error(f"Unexpected error in send_response for user {user_id}: {e}", exc_info=True)


    # Telegram message handlers
    @property
    def message_handlers(self):
        """Returns a list of message handlers for the bot."""
        return [
            self.send_welcome,
            self.send_help,
            self.handle_language_command,
            self.handle_document,
            self.handle_all_messages,
        ]


    def send_welcome(self, message):
        """Reply to the /start command with the welcome text."""
        sender_id = message.from_user.id
        logger.info(f"Received /start command from user {sender_id}")
        greeting = "Welcome to the spiritual chatbot! Ask me questions about the indexed documents, or use /help for commands."
        self.bot.reply_to(message, greeting)


    def send_help(self, message):
        """Reply to the /help command with the list of available commands."""
        logger.info(f"Received /help command from user {message.from_user.id}")
        # Built line by line so that the source-code indentation of a
        # triple-quoted literal does not end up in the message sent to the user.
        help_text = (
            "Available Commands:\n"
            "/start - Show welcome message.\n"
            "/help - Show this help message.\n"
            "/language <lang> - Set bot language (english or hindi). Example: /language hindi\n"
            "/query <your question> [date:YYYY-MM-DD] - Ask a question about the documents. Optionally filter by date.\n"
            "You can also just type your question directly."
        )
        self.bot.reply_to(message, help_text)


    def handle_language_command(self, message: Message):
        """Handles the /language command to set user preference."""
        user_id = message.from_user.id
        parts = message.text.split(maxsplit=1)

        if len(parts) < 2:
            # Fetch usage help from config
            usage_text = self.config.get_user_message('language_usage_help',
                                                      "Usage: /language <language>\nSupported languages: english, hindi")
            self.bot.reply_to(message, usage_text)
            return

        lang_input = parts[1].strip().lower()
        lang_code = None
        if lang_input == 'english':
            lang_code = 'en'
        elif lang_input == 'hindi':
            lang_code = 'hi'
        else:
            unsupported_text = self.config.get_user_message('language_unsupported',
                                                            "Unsupported language. Please use 'english' or 'hindi'.")
            self.bot.reply_to(message, unsupported_text)
            return

        # Initialize session for the user if it doesn't exist
        self.config.USER_SESSIONS.setdefault(user_id, {})
        # Store the language preference
        self.config.USER_SESSIONS[user_id]['language'] = lang_code
        logger.info(f"Set language preference for user {user_id} to '{lang_code}'")

        # Get confirmation message in the selected language (fetch from prompts or use defaults)
        confirmation_prompt_key = f"language_set_{lang_code}"
        # Define defaults just in case the keys are missing from prompts.yaml
        default_confirmations = {'en': "Language set to English.", 'hi': "भाषा हिंदी में सेट कर दी गई है।"}
        # Use the new config method to get the message
        reply_text = self.config.get_user_message(confirmation_prompt_key, default_confirmations[lang_code])

        self.bot.reply_to(message, reply_text)


    def _cleanup_uploaded_file(self, file_path, processed_successfully):
        """Handles cleanup of uploaded files after processing."""
        if processed_successfully and os.path.exists(file_path):
            try:
                os.remove(file_path)
                logger.info(f"Successfully processed and removed '{file_path}' from uploads directory.")
            except OSError as e:
                logger.error(f"Error removing processed file '{file_path}' from uploads: {e}")
        elif not processed_successfully and os.path.exists(file_path):
            logger.info(f"File '{file_path}' was not successfully processed/indexed and will remain in the uploads directory.")
        elif not os.path.exists(file_path) and processed_successfully:
            logger.warning(f"Attempted to remove '{file_path}', but it was already deleted (or never saved properly).")

    def _determine_file_name(self, message, file_ext, default_doc_name):
        """Determines the correct file name for the uploaded document."""
        original_file_name = message.document.file_name
        file_name = original_file_name or default_doc_name
        # Ensure the filename has the correct extension if it was defaulted
        if not file_name.lower().endswith(file_ext) and original_file_name is None:
            file_name = os.path.splitext(file_name)[0] + file_ext
        return file_name

    def _process_document_metadata(self, message: Message):
        """
        Determines file extension, default name, and processing mime type
        based on the uploaded document's mime type and filename.
        Returns a tuple: (file_ext, default_doc_name, processing_mime_type)
        Raises ValueError if the file type is unsupported.
        """
        mime_type = message.document.mime_type
        file_id = message.document.file_id
        original_file_name = message.document.file_name

        file_ext = None
        processing_mime_type = mime_type # Default to original mime type

        if mime_type == 'application/pdf':
            file_ext = ".pdf"
            default_doc_name = f"doc_{file_id}.pdf"
        elif mime_type in ['text/html', 'application/xhtml+xml']:
            file_ext = ".htm"
            default_doc_name = f"doc_{file_id}.htm"
        elif mime_type == 'application/octet-stream':
             # If generic binary, try to determine type from file name
             if original_file_name:
                 name, ext = os.path.splitext(original_file_name)
                 if ext.lower() in ['.htm', '.html']:
                     file_ext = ".htm"
                     default_doc_name = original_file_name
                     processing_mime_type = 'text/html' # Treat as html for processing
                 elif ext.lower() == '.pdf':
                     file_ext = ".pdf"
                     default_doc_name = original_file_name
                     processing_mime_type = 'application/pdf' # Treat as pdf for processing
             
             if file_ext is None: # If still no specific type determined
                 raise ValueError(f"Unsupported file type or unable to determine type from '{original_file_name or 'uploaded file'}'.")

        else: # Handle other explicit unsupported mime types
            raise ValueError(f"Unsupported file type ({mime_type}).")

        return file_ext, default_doc_name, processing_mime_type


    # --- Document Upload Handling (Consider if needed with startup indexing) ---
    def handle_document(self, message: Message):
        """
        Handle an uploaded PDF or HTML document: save, load, tag and index it.

        Steps:
            1. Classify the upload; unsupported types are reported to the user
               and nothing is written to disk.
            2. Download the file into ``<project_root>/uploads``.
            3. Load it with the PDF or HTML processor.
            4. Detect the language, store it in each document's metadata and
               index the documents in the vector store.

        The saved file is deleted only after successful indexing; otherwise it
        is left in the uploads directory. Every outcome is reported to the user
        with a reply; unexpected errors are logged and answered with a generic
        message.
        """
        user_id = message.from_user.id
        if not message.document:
            self.bot.reply_to(message, "No document provided.")
            return

        file_id = message.document.file_id
        mime_type = message.document.mime_type # Original mime_type, kept for logging
        logger.info(f"Received document from user mime_type: {mime_type} (file_id: {file_id})")
        # Bound before the try so the except/finally clauses can always read them.
        file_name = None
        file_path = None
        documents = []
        processed_successfully = False
        try:
            # Only the type check may answer with "unsupported file type".
            # A ValueError raised later (loading, indexing) is an internal
            # error and must reach the generic handler below.
            try:
                file_ext, default_doc_name, processing_mime_type = self._process_document_metadata(message)
            except ValueError as ve:
                logger.warning(f"Unsupported file type for user {user_id}: {ve}")
                self.bot.reply_to(message, str(ve))
                # Nothing was saved yet, so there is nothing to clean up.
                return

            file_name = self._determine_file_name(message, file_ext, default_doc_name)
            # The name originates from the sender: keep only its final
            # component so the file cannot be written outside the uploads dir.
            file_name = os.path.basename(file_name) or os.path.basename(default_doc_name)
            logger.info(f"User {user_id} uploaded {mime_type} (processed as {processing_mime_type}): {file_name}")

            upload_dir = os.path.join(project_root, "uploads")
            os.makedirs(upload_dir, exist_ok=True)
            file_path = os.path.join(upload_dir, file_name)
            file_info = self.bot.get_file(file_id)
            downloaded_file = self.bot.download_file(file_info.file_path)
            with open(file_path, 'wb') as new_file:
                new_file.write(downloaded_file)
            logger.info(f"Document saved to: {file_path}")

            # Pick the loader from the processing mime type.
            if processing_mime_type == 'application/pdf':
                documents = self.pdf_processor.load_pdf(file_path)
            elif processing_mime_type in ['text/html', 'application/xhtml+xml']:
                # HtmProcessor.load_htm returns a single Document or None
                doc = self.htm_processor.load_htm(file_path)
                if doc:
                    documents.append(doc)

            if not documents:
                logger.warning(f"No documents loaded from: {file_path}. Skipping indexing.")
                self.bot.reply_to(message, f"Could not load content from '{file_name}'.")
                # File remains in uploads dir if loading fails
                return

            language = detect_document_language(documents, file_name_for_logging=file_name)
            for doc in documents:
                doc.metadata['language'] = language
            logger.info(f"Added language metadata '{language}' to uploaded document: {file_name}")

            was_indexed = self.vector_store_instance.index_document(documents, semantic_chunk=self.config.SEMANTIC_CHUNKING)
            if was_indexed:
                self.bot.reply_to(message, f"Document '{file_name}' uploaded and indexed successfully.")
                processed_successfully = True
            else:
                # File remains in uploads dir if indexing fails or it's a duplicate
                self.bot.reply_to(message, f"Document '{file_name}' was not indexed (possibly already exists or an error occurred).")

        except Exception as e:
            logger.error(f"Error handling document upload from user {user_id} for {file_name or '<unknown>'}: {str(e)}", exc_info=True)
            self.bot.reply_to(message, "Sorry, I encountered an error processing your document.")
        finally:
            # Delete the file from upload_dir ONLY if processed and indexed
            # successfully; file_path is None when nothing was saved.
            if file_path:
                self._cleanup_uploaded_file(file_path, processed_successfully)

    # --- End Document Upload Handling ---


    def handle_all_messages(self, message: Message):
        """
        Answer a message that no other handler claimed.

        The user's stored language (``'en'`` when none is stored) is passed
        with the message to ``self.handler.process_message``; the returned text
        is delivered through ``send_response``. Any error is logged and
        answered with a generic apology.
        """
        user_id = message.from_user.id
        session = self.config.USER_SESSIONS.get(user_id, {})
        user_lang = session.get('language', 'en')
        preview = message.text[:100]
        logger.info(f"Received message from user {user_id}: '{preview}...'")
        try:
            answer = self.handler.process_message(message, user_lang)
            self.send_response(message, user_id, answer)
        except Exception as e:
            logger.error(f"Error processing message from user {user_id}: {str(e)}", exc_info=True)
            self.bot.reply_to(message, "Sorry, I encountered an error processing your request.")


    # Setup and webhook configuration functions
    def setup_webhook(self, url):
        """
        Point the Telegram webhook at ``<url>/<TELEGRAM_BOT_TOKEN>``.

        Any existing webhook is removed first.

        Args:
            url: Public base URL of this service; a trailing slash is ignored.

        Returns:
            True when the webhook was set; False when ``url`` is empty, the
            bot reported failure, or an exception occurred (it is logged).
        """
        if not url:
            logger.error("WEBHOOK_URL is not configured. Cannot set webhook.")
            return False # Indicate failure
        try:
            base_url = url.rstrip('/')
            webhook_url = f"{base_url}/{self.config.TELEGRAM_BOT_TOKEN}"
            logger.info("Removing existing webhook (if any)...")
            self.bot.remove_webhook()
            # The path component is the bot token, which is a credential:
            # never write the full URL to the log.
            logger.info(f"Setting webhook to: {base_url}/<redacted-token>")
            success = self.bot.set_webhook(url=webhook_url)
            if success:
                logger.info("Webhook set successfully.")
                return True
            else:
                logger.error("Failed to set webhook.")
                return False
        except Exception as e:
            logger.error(f"Error setting up webhook: {e}", exc_info=True)
            return False


    def run(self):
        """
        Register the webhook and start the Flask server.

        Reads ``WEBHOOK_URL`` and ``PORT`` from the config. The server listens
        on all interfaces (``0.0.0.0``) with debug mode off.

        Raises:
            SystemExit: With status 1 when ``WEBHOOK_URL`` is not set or the
                webhook could not be registered.
        """
        webhook_url = self.config.WEBHOOK_URL
        if not webhook_url:
            logger.error("WEBHOOK_URL is not set in config. Cannot start Flask server with webhook.")
            # sys.exit instead of the interactive-only exit() helper.
            sys.exit(1)

        if not self.setup_webhook(webhook_url):
            logger.critical("Failed to set up webhook. Aborting Flask server start.")
            sys.exit(1)

        logger.info(f"Starting Flask server on port {self.config.PORT}")
        self.app.run(host='0.0.0.0', port=self.config.PORT, debug=False)


if __name__ == "__main__":
    # Script entry point: build the dependencies, index the data directory,
    # create the agent and start the webhook server. Any unexpected error
    # during startup is logged and terminates the process with status 1.
    try:
        # Initialize dependencies
        config = Config()

        DATA_DIRECTORY = config.DATA_PATH
        logger.info(f"Data directory set to: {DATA_DIRECTORY}")

        # Instantiate the vector store instance
        logger.info("Initializing VectorStore...")
        vector_store_instance = VectorStore(config.VECTOR_STORE_PATH)
        vectordb = vector_store_instance.get_vectordb() # Get the db instance after init
        logger.info("VectorStore initialized.")

        # --- Index data directory on startup ---
        file_manager_instance = FileManager()
        document_indexer_instance = DocumentIndexer(vector_store_instance=vector_store_instance, file_manager_instance=file_manager_instance)
        document_indexer_instance.index_directory(DATA_DIRECTORY)
        # --- End Indexing ---

        # Log the final state of indexed metadata after potential indexing
        logger.info("Logging final indexed metadata...")
        vector_store_instance.log_all_indexed_metadata()

        # Create rag agent instance
        logger.info("Initializing RAG agent...")
        # The agent cannot be built without a vector DB handle.
        if vectordb is None:
            logger.error("VectorDB instance is None after initialization and indexing. Cannot build agent.")
            # sys.exit instead of the interactive-only exit() helper; the
            # SystemExit it raises is not caught by `except Exception` below.
            sys.exit(1)
        agent = build_agent(vectordb=vectordb, model_name=config.LLM_MODEL_NAME)
        logger.info("RAG agent initialized successfully")

        # Initialize message handler (for non-command messages)
        handler = MessageHandler(agent=agent, config=config)
        pdf_processor = PdfProcessor() # Initialize PDF processor
        htm_processor = HtmProcessor() # Initialize HTM processor

        # Create an instance of the TelegramBotApp and run it
        bot_app = TelegramBotApp(config=config, vector_store_instance=vector_store_instance, agent=agent,
                                 handler=handler, pdf_processor=pdf_processor, htm_processor=htm_processor)
        bot_app.run()

    except Exception as e:
        logger.critical(f"Failed during application startup: {str(e)}", exc_info=True)
        sys.exit(1)

# Keep start_bot for potential polling mode if needed, but it's not used with webhook
# def start_bot():
#    logger.info("Starting bot in polling mode...")
#    bot.infinity_polling()