Spaces:
Building
Building
File size: 6,428 Bytes
b9ccd0b 24ae72d b9ccd0b 5bda5ed b9ccd0b 24ae72d b9ccd0b 24ae72d b9ccd0b 24ae72d b9ccd0b 7361b6f b9ccd0b 24ae72d b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b 5bda5ed b9ccd0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# /home/bk_anupam/code/LLM_agents/RAG_BOT/utils.py
from typing import Optional
import re
import json
from RAG_BOT.logger import logger
import os
from langdetect import detect, LangDetectException
from langchain_core.documents import Document # Added for type hinting
from typing import List # Added for type hinting
def parse_json_answer(content: str) -> Optional[dict]:
"""
Extracts and parses a JSON object embedded within a markdown code block.
Args:
content: The raw string output from the LLM, potentially containing ```json ... ```.
Returns:
The parsed dictionary if successful, None otherwise.
"""
if not content:
logger.warning("Attempted to parse empty LLM output.")
return None
json_str = content.strip()
# Regex to find content within ```json ... ``` or ``` ... ```, handling potential variations
# This pattern accounts for optional whitespace around the JSON content
match = re.search(r"```(?:json)?\s*(.*?)\s*```", json_str, re.DOTALL | re.IGNORECASE)
if match:
json_str = match.group(1).strip() # Extract the content and strip whitespace
logger.debug("Extracted JSON string from markdown block.")
else:
# If no markdown block is found, assume the content *might* be raw JSON
# We still proceed, but log a warning if it doesn't look like JSON
if not json_str.startswith('{') and not json_str.startswith('['):
logger.warning("LLM output did not contain a markdown JSON block and doesn't start with { or [. Attempting direct parse anyway.")
else:
logger.debug("No markdown block found, attempting to parse content directly as JSON.")
try:
# Attempt to parse the extracted (or original) string
parsed_json = json.loads(json_str, strict=False)
if isinstance(parsed_json, dict):
# Optionally, validate if the dict has the expected 'answer' key
if "answer" in parsed_json:
logger.debug("Successfully parsed JSON answer with 'answer' key.")
return parsed_json
else:
logger.warning("Parsed JSON is a dictionary but does not contain 'answer' key. Returning dict anyway.")
# Decide if you want to return the dict even without 'answer'
# return parsed_json
# Or return None if 'answer' is mandatory
return None
else:
logger.warning(f"Parsed JSON is not a dictionary: {type(parsed_json)}. Content: {json_str[:100]}...")
return None
except json.JSONDecodeError as e:
# Log the specific error and the problematic string portion
error_context_start = max(0, e.pos - 30)
error_context_end = min(len(json_str), e.pos + 30)
error_snippet = json_str[error_context_start:error_context_end]
# Replace newline characters in the snippet for cleaner logging
error_snippet_oneline = error_snippet.replace('\n', '\\n')
logger.error(f"Failed to parse JSON: {e}. Near char {e.pos}: '{error_snippet_oneline}'")
# Log the full content only at DEBUG level to avoid flooding logs
logger.error(f"Full content that failed parsing:\n{content}")
return None
except Exception as e:
logger.error(f"An unexpected error occurred during JSON parsing: {e}", exc_info=True)
logger.error(f"Full content during unexpected error:\n{content}")
return None
def detect_document_language(documents: List[Document], file_name_for_logging: str = "uploaded document",
default_lang: str = 'en') -> str:
"""
Detects the language of the content within a list of Document objects.
Args:
documents: A list of Langchain Document objects.
file_name_for_logging: The name of the file (for logging purposes).
default_lang: The language code to return if detection fails.
Returns:
The detected language code (e.g., 'en', 'hi') or the default language.
"""
logger.debug(f"Attempting to detect language for: {file_name_for_logging}")
try:
if not documents:
logger.warning(f"No documents provided for '{file_name_for_logging}'. Cannot detect language. Defaulting to '{default_lang}'.")
return default_lang
# Concatenate content from first few documents/pages for detection
# Using page_content attribute of Langchain Document
sample_text = " ".join([doc.page_content for doc in documents[:5]]).strip()
if not sample_text:
logger.warning(f"Document(s) '{file_name_for_logging}' contain no text to detect language from. Defaulting to '{default_lang}'.")
return default_lang
detected_lang = detect(sample_text)
logger.info(f"Detected language '{detected_lang}' for: {file_name_for_logging}")
return detected_lang
except LangDetectException as lang_err:
logger.warning(f"Could not detect language for '{file_name_for_logging}': {lang_err}. Defaulting to '{default_lang}'.")
return default_lang
except Exception as e:
logger.error(f"Error during language detection for '{file_name_for_logging}': {e}", exc_info=True)
return default_lang
# Example usage (can be removed or kept for testing)
if __name__ == '__main__':
test_cases = [
'```json\n{\n "answer": "This is a valid answer."\n}\n```',
'```json \n { "answer": " Another valid answer with space. " } \n ```',
'```\n{\n "answer": "No language specified."\n}\n```',
'{\n "answer": "Raw JSON string."\n}',
' {\n "answer": "Raw JSON with leading/trailing whitespace."\n} ',
'```json\n{\n "answer": "Invalid JSON structure"\n', # Missing closing brace
'```json\n{\n "answer": "Contains\x00invalid control char."\n}\n```', # Example invalid char
'Some text before ```json\n{\n "answer": "Text around JSON."\n}\n``` and after.',
'Plain text response, not JSON.',
'```json\n{\n "other_key": "No answer key."\n}\n```',
'' # Empty string
]
print("--- Testing parse_json_answer ---")
for i, case in enumerate(test_cases):
print(f"\nTest Case {i+1}:")
print(f"Input:\n{case}")
result = parse_json_answer(case)
print(f"Output: {result}")
print("-" * 20)
|