FormatReview / doc_analyzer.py
import logging
import base64
import re
import xml.etree.ElementTree as ET
from typing import Dict, Any, Union
from config import settings
from openai import OpenAI
logger = logging.getLogger(__name__)
def _extract_xml_block(text: str, tag_name: str) -> str:
"""
Extracts the last complete XML block from a string, ignoring surrounding text.
"""
# This regex finds all occurrences of the specified XML block
matches = re.findall(f"<{tag_name}.*?</{tag_name}>", text, re.DOTALL)
if matches:
# Return the last match, which should be the assistant's response
return matches[-1]
logger.error(f"Could not find <{tag_name}> block in text: {text}")
return ""
def analyze_document(uploaded_file, rules: str) -> Dict[str, Any]:
"""
Analyzes a document against formatting rules using an LLM.
Args:
uploaded_file: The uploaded file (PDF or DOCX)
rules: The formatting rules as a string (in markdown format)
Returns:
Dict containing analysis results
"""
logger.info("Analyzing document against formatting rules")
# The rules are already in markdown format, so we can use them directly
formatted_rules = rules
try:
# Read the file bytes
file_bytes = uploaded_file.getvalue()
# Create a unified prompt
unified_prompt = f"""
You are an expert in academic document formatting and citation. Your goal is to analyze the user's document for compliance with the journal's formatting rules and generate a comprehensive compliance report in XML format.
Your response MUST be in the following XML format. Do not include any other text or explanations outside of the XML structure.
<compliance_report>
  <summary>
    <overall_assessment></overall_assessment>
    <total_issues></total_issues>
    <critical_issues></critical_issues>
    <warning_issues></warning_issues>
  </summary>
  <recommendations>
    <recommendation></recommendation>
  </recommendations>
  <issues>
    <issue severity="critical/warning/info">
      <message></message>
      <location></location>
      <suggestion></suggestion>
    </issue>
  </issues>
</compliance_report>
**Formatting Rules to Enforce**
{formatted_rules}
**Instructions**
Please analyze the attached document and generate the compliance report.
**Important Considerations for Analysis:**
* **Citation Style:** Determine the citation style (e.g., APA, MLA, Chicago) from the document's content and the journal's requirements. The document should follow the style specified in the formatting rules.
* **Page Numbering:** When reporting the location of an issue, use the page number exactly as it is written in the document (e.g., 'vii', '12'). Do not use the PDF reader's page count (unless necessary to clarify).
* **Visual Formatting:** When assessing visual properties like line spacing, margins, or font size from a PDF, be aware that text extraction can be imperfect. Base your findings on clear and consistent evidence throughout the document. Do not flag minor variations that could be due to PDF rendering. For example, only flag a line spacing issue if it is consistently incorrect across multiple pages and sections. Assume line spacing is correct unless it is obviously and consistently wrong.
* **Rule Interpretation:** Apply the formatting rules strictly but fairly. If a rule is ambiguous, note the ambiguity in your assessment.
* **Completeness:** Ensure that you check every rule against the document and that your report is complete.
"""
        # Initialize the OpenAI client
        client = OpenAI(
            base_url=settings.llm_base_url,
            api_key=settings.openrouter_api_key,
        )

        # Encode the file as base64
        base64_file = base64.b64encode(file_bytes).decode('utf-8')

        # Determine file type
        file_extension = uploaded_file.name.split('.')[-1].lower()
        mime_type = (
            "application/pdf"
            if file_extension == "pdf"
            else "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )

        try:
            # Call the LLM API
            completion = client.chat.completions.create(
                model=settings.llm_model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": unified_prompt},
                            {
                                "type": "file",
                                "file": {
                                    "file_data": f"data:{mime_type};base64,{base64_file}"
                                }
                            }
                        ],
                    }
                ],
            )
            raw_response = completion.choices[0].message.content
        except Exception as e:
            logger.error(f"An error occurred during LLM API call: {e}")
            return {"error": f"An error occurred during LLM API call: {e}"}
        # Extract the XML block
        clean_xml = _extract_xml_block(raw_response, "compliance_report")
        if not clean_xml:
            logger.error("Could not extract <compliance_report> XML block from the response.")
            return {"error": "Could not extract <compliance_report> XML block from the response."}

        logger.info(f"Extracted compliance report:\n{clean_xml}")
        # Parse the final XML output
        try:
            root = ET.fromstring(clean_xml)

            summary_node = root.find("summary")
            summary = {
                "overall_assessment": summary_node.findtext("overall_assessment", "No assessment available."),
                "total_issues": summary_node.findtext("total_issues", "N/A"),
                "critical_issues": summary_node.findtext("critical_issues", "N/A"),
                "warning_issues": summary_node.findtext("warning_issues", "N/A"),
            } if summary_node is not None else {}

            issues = []
            for issue_node in root.findall(".//issue"):
                issues.append({
                    "severity": issue_node.get("severity"),
                    "message": issue_node.findtext("message", ""),
                    "location": issue_node.findtext("location", ""),
                    "suggestion": issue_node.findtext("suggestion", ""),
                })

            # Skip empty <recommendation> elements so callers never see None entries
            recommendations = [rec.text for rec in root.findall(".//recommendation") if rec.text]

            return {
                "raw_xml": clean_xml,
                "summary": summary,
                "issues": issues,
                "recommendations": recommendations,
            }
        except ET.ParseError as e:
            logger.error(f"Failed to parse final LLM output: {e}", exc_info=True)
            return {
                "raw_xml": raw_response,
                "error": "Failed to parse final LLM output."
            }
    except Exception as e:
        logger.error(f"Error analyzing document: {str(e)}")
        return {"error": f"Error analyzing document: {str(e)}"}