FormatReview / doc_analyzer.py
import logging
import base64
import re
import xml.etree.ElementTree as ET
from typing import Dict, Any, Union
from config import settings
from openai import OpenAI
logger = logging.getLogger(__name__)
def _extract_xml_block(text: str, tag_name: str) -> str:
"""
Extracts the last complete XML block from a string, ignoring surrounding text.
"""
# This regex finds all occurrences of the specified XML block
matches = re.findall(f"<{tag_name}.*?</{tag_name}>", text, re.DOTALL)
if matches:
# Return the last match, which should be the assistant's response
return matches[-1]
logger.error(f"Could not find <{tag_name}> block in text: {text}")
return ""
def analyze_document(uploaded_file, rules: str) -> Dict[str, Any]:
"""
Analyzes a document against formatting rules using an LLM.
Args:
uploaded_file: The uploaded file (PDF or DOCX)
rules: The formatting rules as a string (in markdown format)
Returns:
Dict containing analysis results
"""
logger.info("Analyzing document against formatting rules")
# The rules are already in markdown format, so we can use them directly
formatted_rules = rules
try:
# Read the file bytes
file_bytes = uploaded_file.getvalue()
# Create a unified prompt
unified_prompt = f"""
You are an expert in academic document formatting and citation. Your goal is to analyze the user's document for compliance with the journal's formatting rules and generate a comprehensive compliance report in XML format.
Your response MUST be in the following XML format. Do not include any other text or explanations outside of the XML structure.
<compliance_report>
  <summary>
    <overall_assessment></overall_assessment>
    <total_issues></total_issues>
    <critical_issues></critical_issues>
    <warning_issues></warning_issues>
  </summary>
  <recommendations>
    <recommendation></recommendation>
  </recommendations>
  <issues>
    <issue severity="critical/warning/info">
      <message></message>
      <location></location>
      <suggestion></suggestion>
    </issue>
  </issues>
</compliance_report>
**Formatting Rules to Enforce**
{formatted_rules}
**Instructions**
Please analyze the attached document and generate the compliance report.
**Important Considerations for Analysis:**
* **Citation Style:** Determine the citation style (e.g., APA, MLA, Chicago) from the document's content and the journal's requirements. The document should follow the style specified in the formatting rules.
* **Page Numbering:** When reporting the location of an issue, use the page number exactly as it is written in the document (e.g., 'vii', '12'). Do not use the PDF reader's page count (unless necessary to clarify).
* **Visual Formatting:** When assessing visual properties like line spacing, margins, or font size from a PDF, be aware that text extraction can be imperfect. Base your findings on clear and consistent evidence throughout the document. Do not flag minor variations that could be due to PDF rendering. For example, only flag a line spacing issue if it is consistently incorrect across multiple pages and sections. Assume line spacing is correct unless it is obviously and consistently wrong.
* **Rule Interpretation:** Apply the formatting rules strictly but fairly. If a rule is ambiguous, note the ambiguity in your assessment.
* **Completeness:** Ensure that you check every rule against the document and that your report is complete.
"""
        # Initialize the OpenAI client
        client = OpenAI(
            base_url=settings.llm_base_url,
            api_key=settings.openrouter_api_key,
        )

        # Encode the file as base64
        base64_file = base64.b64encode(file_bytes).decode('utf-8')

        # Determine file type
        file_extension = uploaded_file.name.split('.')[-1].lower()
        mime_type = (
            "application/pdf"
            if file_extension == "pdf"
            else "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )

        try:
            # Call the LLM API
            completion = client.chat.completions.create(
                model=settings.llm_model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": unified_prompt},
                            {
                                "type": "file",
                                "file": {
                                    "file_data": f"data:{mime_type};base64,{base64_file}"
                                }
                            }
                        ],
                    }
                ],
            )
            raw_response = completion.choices[0].message.content
        except Exception as e:
            logger.error(f"An error occurred during LLM API call: {e}")
            return {"error": f"An error occurred during LLM API call: {e}"}
        # Extract the XML block
        clean_xml = _extract_xml_block(raw_response, "compliance_report")
        if not clean_xml:
            logger.error("Could not extract <compliance_report> XML block from the response.")
            return {"error": "Could not extract <compliance_report> XML block from the response."}

        logger.info(f"Extracted compliance report:\n{clean_xml}")
        # Parse the final XML output
        try:
            root = ET.fromstring(clean_xml)

            summary_node = root.find("summary")
            summary = {
                "overall_assessment": summary_node.findtext("overall_assessment", "No assessment available."),
                "total_issues": summary_node.findtext("total_issues", "N/A"),
                "critical_issues": summary_node.findtext("critical_issues", "N/A"),
                "warning_issues": summary_node.findtext("warning_issues", "N/A"),
            } if summary_node is not None else {}

            issues = []
            for issue_node in root.findall(".//issue"):
                issues.append({
                    "severity": issue_node.get("severity"),
                    "message": issue_node.findtext("message", ""),
                    "location": issue_node.findtext("location", ""),
                    "suggestion": issue_node.findtext("suggestion", ""),
                })

            # Skip empty <recommendation> elements so callers never see None entries
            recommendations = [rec.text for rec in root.findall(".//recommendation") if rec.text]

            return {
                "raw_xml": clean_xml,
                "summary": summary,
                "issues": issues,
                "recommendations": recommendations,
            }
        except ET.ParseError as e:
            logger.error(f"Failed to parse final LLM output: {e}", exc_info=True)
            return {
                "raw_xml": raw_response,
                "error": "Failed to parse final LLM output."
            }
    except Exception as e:
        logger.error(f"Error analyzing document: {str(e)}")
        return {"error": f"Error analyzing document: {str(e)}"}