# /home/bk_anupam/code/LLM_agents/RAG_BOT/utils.py from typing import Optional import re import json from RAG_BOT.logger import logger import os from langdetect import detect, LangDetectException from langchain_core.documents import Document # Added for type hinting from typing import List # Added for type hinting def parse_json_answer(content: str) -> Optional[dict]: """ Extracts and parses a JSON object embedded within a markdown code block. Args: content: The raw string output from the LLM, potentially containing ```json ... ```. Returns: The parsed dictionary if successful, None otherwise. """ if not content: logger.warning("Attempted to parse empty LLM output.") return None json_str = content.strip() # Regex to find content within ```json ... ``` or ``` ... ```, handling potential variations # This pattern accounts for optional whitespace around the JSON content match = re.search(r"```(?:json)?\s*(.*?)\s*```", json_str, re.DOTALL | re.IGNORECASE) if match: json_str = match.group(1).strip() # Extract the content and strip whitespace logger.debug("Extracted JSON string from markdown block.") else: # If no markdown block is found, assume the content *might* be raw JSON # We still proceed, but log a warning if it doesn't look like JSON if not json_str.startswith('{') and not json_str.startswith('['): logger.warning("LLM output did not contain a markdown JSON block and doesn't start with { or [. Attempting direct parse anyway.") else: logger.debug("No markdown block found, attempting to parse content directly as JSON.") try: # Attempt to parse the extracted (or original) string parsed_json = json.loads(json_str, strict=False) if isinstance(parsed_json, dict): # Optionally, validate if the dict has the expected 'answer' key if "answer" in parsed_json: logger.debug("Successfully parsed JSON answer with 'answer' key.") return parsed_json else: logger.warning("Parsed JSON is a dictionary but does not contain 'answer' key. Returning dict anyway.") # Decide if you want to return the dict even without 'answer' # return parsed_json # Or return None if 'answer' is mandatory return None else: logger.warning(f"Parsed JSON is not a dictionary: {type(parsed_json)}. Content: {json_str[:100]}...") return None except json.JSONDecodeError as e: # Log the specific error and the problematic string portion error_context_start = max(0, e.pos - 30) error_context_end = min(len(json_str), e.pos + 30) error_snippet = json_str[error_context_start:error_context_end] # Replace newline characters in the snippet for cleaner logging error_snippet_oneline = error_snippet.replace('\n', '\\n') logger.error(f"Failed to parse JSON: {e}. Near char {e.pos}: '{error_snippet_oneline}'") # Log the full content only at DEBUG level to avoid flooding logs logger.error(f"Full content that failed parsing:\n{content}") return None except Exception as e: logger.error(f"An unexpected error occurred during JSON parsing: {e}", exc_info=True) logger.error(f"Full content during unexpected error:\n{content}") return None def detect_document_language(documents: List[Document], file_name_for_logging: str = "uploaded document", default_lang: str = 'en') -> str: """ Detects the language of the content within a list of Document objects. Args: documents: A list of Langchain Document objects. file_name_for_logging: The name of the file (for logging purposes). default_lang: The language code to return if detection fails. Returns: The detected language code (e.g., 'en', 'hi') or the default language. """ logger.debug(f"Attempting to detect language for: {file_name_for_logging}") try: if not documents: logger.warning(f"No documents provided for '{file_name_for_logging}'. Cannot detect language. Defaulting to '{default_lang}'.") return default_lang # Concatenate content from first few documents/pages for detection # Using page_content attribute of Langchain Document sample_text = " ".join([doc.page_content for doc in documents[:5]]).strip() if not sample_text: logger.warning(f"Document(s) '{file_name_for_logging}' contain no text to detect language from. Defaulting to '{default_lang}'.") return default_lang detected_lang = detect(sample_text) logger.info(f"Detected language '{detected_lang}' for: {file_name_for_logging}") return detected_lang except LangDetectException as lang_err: logger.warning(f"Could not detect language for '{file_name_for_logging}': {lang_err}. Defaulting to '{default_lang}'.") return default_lang except Exception as e: logger.error(f"Error during language detection for '{file_name_for_logging}': {e}", exc_info=True) return default_lang # Example usage (can be removed or kept for testing) if __name__ == '__main__': test_cases = [ '```json\n{\n "answer": "This is a valid answer."\n}\n```', '```json \n { "answer": " Another valid answer with space. " } \n ```', '```\n{\n "answer": "No language specified."\n}\n```', '{\n "answer": "Raw JSON string."\n}', ' {\n "answer": "Raw JSON with leading/trailing whitespace."\n} ', '```json\n{\n "answer": "Invalid JSON structure"\n', # Missing closing brace '```json\n{\n "answer": "Contains\x00invalid control char."\n}\n```', # Example invalid char 'Some text before ```json\n{\n "answer": "Text around JSON."\n}\n``` and after.', 'Plain text response, not JSON.', '```json\n{\n "other_key": "No answer key."\n}\n```', '' # Empty string ] print("--- Testing parse_json_answer ---") for i, case in enumerate(test_cases): print(f"\nTest Case {i+1}:") print(f"Input:\n{case}") result = parse_json_answer(case) print(f"Output: {result}") print("-" * 20)