# Author: bk-anupam
# feat: Enhance document processing with HTML support and improve language detection.
#       Documents indexed from 1976 to 1980 in both English and Hindi. (commit 5bda5ed)
# /home/bk_anupam/code/LLM_agents/RAG_BOT/utils.py
from typing import Optional
import re
import json
from RAG_BOT.logger import logger
import os
from langdetect import detect, LangDetectException
from langchain_core.documents import Document # Added for type hinting
from typing import List # Added for type hinting
def parse_json_answer(content: str) -> Optional[dict]:
    """
    Extracts and parses a JSON object embedded within a markdown code block.

    Handles three input shapes: a fenced ```json ... ``` (or bare ```) block,
    raw JSON with optional surrounding whitespace, or arbitrary text wrapping
    a fenced block. Parsing uses ``strict=False`` so control characters inside
    string values (common in raw LLM output) do not abort the parse.

    Args:
        content: The raw string output from the LLM, potentially containing ```json ... ```.

    Returns:
        The parsed dictionary if it is a dict containing an 'answer' key, None otherwise
        (including when parsing succeeds but yields a list or a dict without 'answer').
    """
    if not content:
        logger.warning("Attempted to parse empty LLM output.")
        return None
    json_str = content.strip()
    # Regex to find content within ```json ... ``` or ``` ... ```, handling potential variations.
    # DOTALL lets the payload span multiple lines; IGNORECASE accepts ```JSON fences.
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", json_str, re.DOTALL | re.IGNORECASE)
    if match:
        json_str = match.group(1).strip()  # Extract the fenced payload and strip whitespace
        logger.debug("Extracted JSON string from markdown block.")
    else:
        # No markdown block found: the content *might* still be raw JSON.
        # We proceed either way, but flag output that doesn't even look like JSON.
        if not json_str.startswith(('{', '[')):
            logger.warning("LLM output did not contain a markdown JSON block and doesn't start with { or [. Attempting direct parse anyway.")
        else:
            logger.debug("No markdown block found, attempting to parse content directly as JSON.")
    try:
        # Attempt to parse the extracted (or original) string
        parsed_json = json.loads(json_str, strict=False)
        if isinstance(parsed_json, dict):
            if "answer" in parsed_json:
                logger.debug("Successfully parsed JSON answer with 'answer' key.")
                return parsed_json
            # 'answer' is mandatory for callers of this helper; a dict without it
            # is treated as a failed parse.
            logger.warning("Parsed JSON is a dictionary but does not contain 'answer' key. Returning None.")
            return None
        logger.warning(f"Parsed JSON is not a dictionary: {type(parsed_json)}. Content: {json_str[:100]}...")
        return None
    except json.JSONDecodeError as e:
        # Log the specific error and a short window around the failure position
        error_context_start = max(0, e.pos - 30)
        error_context_end = min(len(json_str), e.pos + 30)
        error_snippet = json_str[error_context_start:error_context_end]
        # Replace newline characters in the snippet for cleaner logging
        error_snippet_oneline = error_snippet.replace('\n', '\\n')
        logger.error(f"Failed to parse JSON: {e}. Near char {e.pos}: '{error_snippet_oneline}'")
        # Log the full content only at DEBUG level to avoid flooding logs
        logger.debug(f"Full content that failed parsing:\n{content}")
        return None
    except Exception as e:
        logger.error(f"An unexpected error occurred during JSON parsing: {e}", exc_info=True)
        logger.error(f"Full content during unexpected error:\n{content}")
        return None
def detect_document_language(documents: List[Document], file_name_for_logging: str = "uploaded document",
                             default_lang: str = 'en') -> str:
    """
    Detects the language of the content within a list of Document objects.

    Only the first five documents are sampled, which keeps detection cheap on
    large uploads while still giving langdetect enough text to work with.

    Args:
        documents: A list of Langchain Document objects.
        file_name_for_logging: The name of the file (for logging purposes).
        default_lang: The language code to return if detection fails.

    Returns:
        The detected language code (e.g., 'en', 'hi') or the default language.
    """
    logger.debug(f"Attempting to detect language for: {file_name_for_logging}")
    try:
        # Guard: nothing to inspect at all.
        if not documents:
            logger.warning(f"No documents provided for '{file_name_for_logging}'. Cannot detect language. Defaulting to '{default_lang}'.")
            return default_lang
        # Build a detection sample from the leading pages' page_content.
        sample_text = " ".join(page.page_content for page in documents[:5]).strip()
        # Guard: documents exist but carry no usable text.
        if not sample_text:
            logger.warning(f"Document(s) '{file_name_for_logging}' contain no text to detect language from. Defaulting to '{default_lang}'.")
            return default_lang
        detected_lang = detect(sample_text)
        logger.info(f"Detected language '{detected_lang}' for: {file_name_for_logging}")
        return detected_lang
    except LangDetectException as lang_err:
        # langdetect raises when the sample is too short/ambiguous; fall back quietly.
        logger.warning(f"Could not detect language for '{file_name_for_logging}': {lang_err}. Defaulting to '{default_lang}'.")
        return default_lang
    except Exception as e:
        # Any other failure is unexpected; log with traceback but never propagate.
        logger.error(f"Error during language detection for '{file_name_for_logging}': {e}", exc_info=True)
        return default_lang
# Example usage (can be removed or kept for testing)
if __name__ == '__main__':
    test_cases = [
        '```json\n{\n "answer": "This is a valid answer."\n}\n```',
        '```json \n { "answer": " Another valid answer with space. " } \n ```',
        '```\n{\n "answer": "No language specified."\n}\n```',
        '{\n "answer": "Raw JSON string."\n}',
        ' {\n "answer": "Raw JSON with leading/trailing whitespace."\n} ',
        '```json\n{\n "answer": "Invalid JSON structure"\n',  # Missing closing brace
        '```json\n{\n "answer": "Contains\x00invalid control char."\n}\n```',  # Example invalid char
        'Some text before ```json\n{\n "answer": "Text around JSON."\n}\n``` and after.',
        'Plain text response, not JSON.',
        '```json\n{\n "other_key": "No answer key."\n}\n```',
        '',  # Empty string
    ]
    print("--- Testing parse_json_answer ---")
    # Run every case through the parser, printing input and parsed output side by side.
    for case_num, raw_output in enumerate(test_cases, start=1):
        print(f"\nTest Case {case_num}:")
        print(f"Input:\n{raw_output}")
        print(f"Output: {parse_json_answer(raw_output)}")
        print("-" * 20)