import logging
from typing import Dict, Any
from config import settings
import xml.etree.ElementTree as ET
from openai import OpenAI
import base64
import re
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def _extract_xml_block(text: str, tag_name: str) -> str:
"""
Extracts the last complete XML block from a string, ignoring surrounding text.
"""
# This regex finds all occurrences of the specified XML block
matches = re.findall(f"<{tag_name}.*?{tag_name}>", text, re.DOTALL)
if matches:
# Return the last match, which should be the assistant's response
return matches[-1]
logger.error(f"Could not find <{tag_name}> block in text: {text}")
return ""
def _encode_pdf_base64(pdf_file: Any) -> str:
    """Return the PDF content as a base64 string, from bytes or a binary file object."""
    try:
        encoded = base64.b64encode(pdf_file)
    except TypeError:
        # Not a bytes-like object: fall back to treating it as a file-like
        # object. Anything without a callable read() re-raises the TypeError.
        read = getattr(pdf_file, "read", None)
        if not callable(read):
            raise
        encoded = base64.b64encode(read())
    return encoded.decode("utf-8")


def _parse_compliance_report(report_xml: str) -> Dict[str, Any]:
    """Convert a compliance-report XML string into summary, issues and recommendations."""
    root = ET.fromstring(report_xml)

    summary: Dict[str, Any] = {}
    summary_node = root.find("summary")
    if summary_node is not None:
        summary = {
            "overall_assessment": summary_node.findtext("overall_assessment", "No assessment available."),
            "total_issues": summary_node.findtext("total_issues", "N/A"),
            "critical_issues": summary_node.findtext("critical_issues", "N/A"),
            "warning_issues": summary_node.findtext("warning_issues", "N/A"),
        }

    # <issue> and <recommendation> elements are collected at any depth.
    issues = [
        {
            "severity": issue_node.get("severity"),
            "message": issue_node.findtext("message"),
            "location": issue_node.findtext("location"),
            "suggestion": issue_node.findtext("suggestion"),
        }
        for issue_node in root.findall(".//issue")
    ]
    recommendations = [rec.text for rec in root.findall(".//recommendation")]

    return {
        "raw_xml": report_xml,
        "summary": summary,
        "issues": issues,
        "recommendations": recommendations,
    }


def analyze_with_llm(
    pdf_file: Any,
    metadata: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Perform compliance analysis with an LLM using a single, unified prompt.

    The prompt (formatting rules plus document metadata) and the PDF, encoded
    as a base64 data URL, are sent in one chat-completion request. The
    ``<compliance_report>`` block of the reply is then parsed.

    Args:
        pdf_file: The PDF content, either as a bytes-like object or as a
            binary file-like object exposing ``read()``.
        metadata: Document metadata; its ``str()`` form is embedded verbatim
            in the prompt.

    Returns:
        On success, a dict with keys ``raw_xml``, ``summary``, ``issues`` and
        ``recommendations``. On failure, a dict with an ``error`` message;
        when the XML cannot be parsed it also carries the unparsed model
        response under ``raw_xml``.

    Raises:
        TypeError: If ``pdf_file`` is neither bytes-like nor readable.
        Exceptions raised by ``get_formatting_rules()`` propagate unchanged.
    """
    logger.info("Performing compliance analysis with LLM.")
    # Create a unified prompt
    # NOTE(review): the prompt promises "the following XML format" but no
    # template appears in this string; presumably the rules file supplies
    # it -- confirm, since the parser below expects <summary>, <issue> and
    # <recommendation> elements inside <compliance_report>.
    unified_prompt = f"""
You are an expert in academic document formatting and citation. Your goal is to analyze the user's document for compliance with the Graduate Center's formatting rules and generate a comprehensive compliance report in XML format.
Your response MUST be in the following XML format. Do not include any other text or explanations outside of the XML structure.
**Formatting Rules to Enforce**
{get_formatting_rules()}
**Document Metadata**
{metadata}
**Instructions**
Please analyze the attached PDF document and generate the compliance report.
**Important Considerations for Analysis:**
* **Citation Style and Department:** Determine the citation style (e.g., APA, MLA, Chicago) and the author's department from the document's content. The document should follow the style manual for its discipline.
* **Page Numbering:** When reporting the location of an issue, use the page number exactly as it is written in the document (e.g., 'vii', '12'). Do not use the PDF reader's page count (unless necessary to clarify).
* **Visual Formatting:** When assessing visual properties like line spacing, margins, or font size from a PDF, be aware that text extraction can be imperfect. Base your findings on clear and consistent evidence throughout the document. Do not flag minor variations that could be due to PDF rendering. For example, only flag a line spacing issue if it is consistently incorrect across multiple pages and sections. Assume line spacing is correct unless it is obviously and consistently wrong.
* **Rule Interpretation:** Apply the formatting rules strictly but fairly. If a rule is ambiguous, note the ambiguity in your assessment.
* **Completeness:** Ensure that you check every rule against the document and that your report is complete.
"""
    # Initialize the OpenAI client
    client = OpenAI(
        base_url=settings.llm_base_url,
        api_key=settings.openrouter_api_key,
    )
    # Read the PDF and encode it as base64
    base64_pdf = _encode_pdf_base64(pdf_file)
    try:
        completion = client.chat.completions.create(
            model=settings.llm_model_name,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": unified_prompt},
                        {
                            "type": "file",
                            "file": {
                                "file_data": f"data:application/pdf;base64,{base64_pdf}"
                            }
                        }
                    ],
                }
            ],
        )
        # The message content may be None (e.g. a refusal or an empty
        # completion); normalise it to "" so the extraction step reports a
        # missing block instead of raising TypeError inside re.findall.
        raw_response = completion.choices[0].message.content or ""
    except Exception as e:
        # Boundary with an external service: log with traceback and report a
        # generic error to the caller rather than propagating.
        logger.error("An error occurred: %s", e, exc_info=True)
        return {"error": "An error occurred while communicating with the LLM."}

    clean_xml = _extract_xml_block(raw_response, "compliance_report")
    if not clean_xml:
        logger.error("Could not extract XML block from the response.")
        return {"error": "Could not extract XML block from the response."}
    logger.info("Final assembled report:\n%s", clean_xml)

    # Parse the final XML output
    try:
        return _parse_compliance_report(clean_xml)
    except ET.ParseError as e:
        logger.error("Failed to parse final LLM output: %s", e, exc_info=True)
        return {
            "raw_xml": raw_response,
            "error": "Failed to parse final LLM output."
        }
def get_formatting_rules() -> str:
    """
    Load the formatting rules from the markdown file.

    Returns:
        The full text of the file at ``settings.formatting_rules_path``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Pin the encoding: without it open() decodes with the platform's locale
    # encoding, so the same rules file could read differently (or fail) on
    # another machine.
    with open(settings.formatting_rules_path, "r", encoding="utf-8") as f:
        return f.read()