Spaces:
Building
Building
bk-anupam
feat: Enhance document processing with HTML support and improve language detection. Documents indexed from 1976 to 1980 in both English and Hindi.
5bda5ed
# /home/bk_anupam/code/LLM_agents/RAG_BOT/utils.py | |
from typing import Optional | |
import re | |
import json | |
from RAG_BOT.logger import logger | |
import os | |
from langdetect import detect, LangDetectException | |
from langchain_core.documents import Document # Added for type hinting | |
from typing import List # Added for type hinting | |
def parse_json_answer(content: str) -> Optional[dict]:
    """
    Extracts and parses a JSON object embedded within a markdown code block.

    The first fenced block (```json ... ``` or ``` ... ```) found in the text is
    parsed. When no fenced block is present, the stripped text itself is parsed
    as JSON. Only a JSON object that contains an 'answer' key is accepted.

    Args:
        content: The raw string output from the LLM, potentially containing ```json ... ```.
    Returns:
        The parsed dictionary if it is a JSON object with an 'answer' key,
        None otherwise (empty input, invalid JSON, non-object JSON, or a
        missing 'answer' key).
    """
    if not content:
        logger.warning("Attempted to parse empty LLM output.")
        return None

    json_str = content.strip()
    # Non-greedy capture between the first pair of fences; the optional "json"
    # language tag and the whitespace around the payload are discarded.
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", json_str, re.DOTALL | re.IGNORECASE)
    if match:
        json_str = match.group(1).strip()
        logger.debug("Extracted JSON string from markdown block.")
    elif not json_str.startswith(('{', '[')):
        # No fence and the text does not look like JSON: still try, but warn.
        logger.warning("LLM output did not contain a markdown JSON block and doesn't start with { or [. Attempting direct parse anyway.")
    else:
        logger.debug("No markdown block found, attempting to parse content directly as JSON.")

    try:
        # strict=False lets raw control characters (e.g. literal newlines)
        # appear inside JSON string values.
        parsed_json = json.loads(json_str, strict=False)
    except json.JSONDecodeError as e:
        # Show a short window around the failure position, flattened to one line.
        error_context_start = max(0, e.pos - 30)
        error_context_end = min(len(json_str), e.pos + 30)
        error_snippet_oneline = json_str[error_context_start:error_context_end].replace('\n', '\\n')
        logger.error(f"Failed to parse JSON: {e}. Near char {e.pos}: '{error_snippet_oneline}'")
        # The full content is logged at DEBUG level only, to avoid flooding the logs.
        logger.debug(f"Full content that failed parsing:\n{content}")
        return None
    except Exception as e:
        # Best-effort parser: any other failure is reported and mapped to None.
        logger.error(f"An unexpected error occurred during JSON parsing: {e}", exc_info=True)
        logger.error(f"Full content during unexpected error:\n{content}")
        return None

    if not isinstance(parsed_json, dict):
        logger.warning(f"Parsed JSON is not a dictionary: {type(parsed_json)}. Content: {json_str[:100]}...")
        return None

    # The 'answer' key is mandatory; a dictionary without it is rejected.
    if "answer" not in parsed_json:
        logger.warning("Parsed JSON is a dictionary but does not contain 'answer' key. Returning None.")
        return None

    logger.debug("Successfully parsed JSON answer with 'answer' key.")
    return parsed_json
def detect_document_language(documents: List[Document], file_name_for_logging: str = "uploaded document",
                             default_lang: str = 'en', sample_size: int = 5) -> str:
    """
    Detects the language of the content within a list of Document objects.

    The text of the first `sample_size` documents is joined with spaces and
    passed to langdetect's `detect`. Any failure falls back to `default_lang`.

    Args:
        documents: A list of Langchain Document objects.
        file_name_for_logging: The name of the file (for logging purposes).
        default_lang: The language code to return if detection fails.
        sample_size: How many leading documents to sample for detection.
            Values below zero are treated as zero.
    Returns:
        The detected language code (e.g., 'en', 'hi') or the default language.
    """
    logger.debug(f"Attempting to detect language for: {file_name_for_logging}")
    try:
        if not documents:
            logger.warning(f"No documents provided for '{file_name_for_logging}'. Cannot detect language. Defaulting to '{default_lang}'.")
            return default_lang

        # Only the leading documents are sampled. A negative size is clamped so
        # the slice cannot turn into "everything except the last N".
        sampled_documents = documents[:max(sample_size, 0)]
        # A None page_content is treated as empty text rather than breaking join().
        sample_text = " ".join(doc.page_content or "" for doc in sampled_documents).strip()
        if not sample_text:
            logger.warning(f"Document(s) '{file_name_for_logging}' contain no text to detect language from. Defaulting to '{default_lang}'.")
            return default_lang

        detected_lang = detect(sample_text)
        logger.info(f"Detected language '{detected_lang}' for: {file_name_for_logging}")
        return detected_lang
    except LangDetectException as lang_err:
        logger.warning(f"Could not detect language for '{file_name_for_logging}': {lang_err}. Defaulting to '{default_lang}'.")
        return default_lang
    except Exception as e:
        # Deliberate best-effort: language detection must never abort the caller.
        logger.error(f"Error during language detection for '{file_name_for_logging}': {e}", exc_info=True)
        return default_lang
# Manual smoke test for parse_json_answer; runs only when this file is executed directly.
if __name__ == '__main__':
    # Raw LLM outputs covering fenced, unfenced, malformed and empty inputs.
    sample_outputs = (
        '```json\n{\n "answer": "This is a valid answer."\n}\n```',
        '```json \n { "answer": " Another valid answer with space. " } \n ```',
        '```\n{\n "answer": "No language specified."\n}\n```',
        '{\n "answer": "Raw JSON string."\n}',
        ' {\n "answer": "Raw JSON with leading/trailing whitespace."\n} ',
        # Opening fence without a closing fence, and no closing brace.
        '```json\n{\n "answer": "Invalid JSON structure"\n',
        # NUL control character inside the string value.
        '```json\n{\n "answer": "Contains\x00invalid control char."\n}\n```',
        'Some text before ```json\n{\n "answer": "Text around JSON."\n}\n``` and after.',
        'Plain text response, not JSON.',
        '```json\n{\n "other_key": "No answer key."\n}\n```',
        # Empty string.
        '',
    )

    separator = "-" * 20
    print("--- Testing parse_json_answer ---")
    for case_number, raw_output in enumerate(sample_outputs, start=1):
        print(f"\nTest Case {case_number}:")
        print(f"Input:\n{raw_output}")
        parsed = parse_json_answer(raw_output)
        print(f"Output: {parsed}")
        print(separator)