Stephen Zweibel committed
Commit 5d74609 · 0 Parent(s)

Update app for Hugging Face
.gitignore ADDED
@@ -0,0 +1,27 @@
+ # Environment variables
+ .env
+
+ # Log files
+ *.log
+
+ # Python cache
+ __pycache__/
+ *.pyc
+
+ # Virtual environments
+ .venv/
+ venv/
+ env/
+
+ # Test documents
+ test_docs/
+
+ # IDE settings
+ .vscode/
+
+ # Streamlit secrets
+ .streamlit/secrets.toml
+
+ # Exclude binary files
+ *.pdf
+ *.docx
README.md ADDED
@@ -0,0 +1,11 @@
+
+ ---
+ title: dissistant
+ emoji: 🚀
+ colorFrom: blue
+ colorTo: green
+ sdk: streamlit
+ sdk_version: 1.29.0
+ python_version: 3.9
+ app_file: app.py
+ ---
app.py ADDED
@@ -0,0 +1,207 @@
+ import streamlit as st
+ import os
+ import tempfile
+ from pathlib import Path
+ import time
+ import logging
+ import asyncio
+
+ # Import modules
+ from modules.llm_interface import analyze_with_llm
+ from modules.report_generator import generate_report
+
+ # Import configuration
+ from config import settings
+
+ # Get logger
+ logger = logging.getLogger(__name__)
+
+ # --- Password Authentication Function ---
+ def check_authentication():
+     """Returns `True` if the user is authenticated."""
+     expected_password = st.secrets.get("APP_PASSWORD")
+
+     if not expected_password:
+         st.session_state.authenticated = True
+         return True
+
+     if "authenticated" not in st.session_state:
+         st.session_state.authenticated = False
+
+     if st.session_state.authenticated:
+         return True
+
+     with st.form(key="password_form"):
+         st.subheader("Enter Password to Access")
+         password_attempt = st.text_input("Password", type="password", key="password_input_field")
+         login_button = st.form_submit_button("Login")
+
+         if login_button:
+             if password_attempt == expected_password:
+                 st.session_state.authenticated = True
+                 st.rerun()
+             else:
+                 st.error("Incorrect password. Please try again.")
+                 return False
+     return False
+
+ # Set page configuration
+ st.set_page_config(
+     page_title="Graduate Center Dissertation Review Tool",
+     page_icon="📚",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Custom CSS
+ st.markdown("""
+ <style>
+     .main .block-container {
+         padding-top: 2rem;
+         padding-bottom: 2rem;
+     }
+     h1, h2, h3 {
+         margin-bottom: 1rem;
+     }
+     .stProgress > div > div > div {
+         background-color: #4CAF50;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Initialize session state
+ if "analysis_results" not in st.session_state:
+     st.session_state.analysis_results = None
+ if "report_generated" not in st.session_state:
+     st.session_state.report_generated = False
+
+ # Title and description
+ st.title("Graduate Center Dissertation Review Tool")
+ st.markdown("""
+ This tool automatically checks dissertations and theses for formatting and citation rules.
+ Upload your document to receive a detailed report.
+ """)
+
+ # Sidebar for configuration
+ with st.sidebar:
+     st.header("Configuration")
+
+     # Reset button
+     if st.button("Start Over"):
+         for key in list(st.session_state.keys()):
+             del st.session_state[key]
+         st.rerun()
+
+ if check_authentication():
+     # Main content area
+     tab1, tab2 = st.tabs(["Document Upload", "Review Report"])
+
+     with tab1:
+         st.header("Upload Your Document")
+
+         uploaded_file = st.file_uploader("Choose a PDF or Word document", type=["pdf", "docx"])
+
+         if uploaded_file is not None:
+             # Display file info
+             file_details = {
+                 "Filename": uploaded_file.name,
+                 "File size": f"{uploaded_file.size / 1024:.2f} KB",
+                 "File type": uploaded_file.type
+             }
+
+             st.write("File Details:")
+             for key, value in file_details.items():
+                 st.write(f"- {key}: {value}")
+
+             # Process button
+             if st.button("Process Document"):
+                 logger.info(f"Processing document: {uploaded_file.name}")
+                 try:
+                     with st.spinner("Processing document..."):
+                         # Process document
+                         progress_bar = st.progress(0)
+
+                         # Step 1: Read file bytes
+                         pdf_bytes = uploaded_file.getvalue()
+                         progress_bar.progress(25)
+                         time.sleep(0.5)  # Simulate processing time
+
+                         # Step 2: Metadata extraction
+                         logger.info("Extracting metadata...")
+                         st.write("Extracting metadata...")
+                         metadata = {"title": uploaded_file.name}  # Dummy metadata
+                         progress_bar.progress(50)
+                         time.sleep(0.5)  # Simulate processing time
+
+                         # Step 3: LLM analysis
+                         logger.info("Performing analysis with LLM...")
+                         st.write("Performing analysis with LLM...")
+                         analysis_results = analyze_with_llm(
+                             pdf_file=pdf_bytes,
+                             metadata=metadata
+                         )
+                         st.session_state.analysis_results = analysis_results
+                         progress_bar.progress(100)
+
+                         # Generate report
+                         logger.info("Generating report...")
+                         st.session_state.report = generate_report(analysis_results)
+
+                         st.session_state.report_generated = True
+
+                         # Switch to report tab
+                         st.success("Document processed successfully! View the compliance report in the next tab.")
+
+                 except Exception as e:
+                     logger.error(f"An error occurred during processing: {str(e)}", exc_info=True)
+                     st.error(f"An error occurred during processing: {str(e)}")
+
+     with tab2:
+         st.header("Review Report")
+
+         if not st.session_state.report_generated:
+             st.info("Upload and process a document to generate a review report.")
+         else:
+             # Display summary
+             st.subheader("Summary")
+             summary = st.session_state.analysis_results.get("summary", {})
+             st.write(f"**Overall Assessment**: {summary.get('overall_assessment', 'N/A')}")
+             st.write(f"**Total Issues**: {summary.get('total_issues', 'N/A')}")
+             st.write(f"**Critical Issues**: {summary.get('critical_issues', 'N/A')}")
+             st.write(f"**Warning Issues**: {summary.get('warning_issues', 'N/A')}")
+
+             # Display recommendations
+             st.subheader("Recommendations")
+             recommendations = st.session_state.analysis_results.get("recommendations", [])
+             if recommendations:
+                 for rec in recommendations:
+                     st.write(f"- {rec}")
+             else:
+                 st.write("No recommendations.")
+
+             # Display detailed report
+             st.subheader("Detailed Report")
+             issues = st.session_state.analysis_results.get("issues", [])
+             if issues:
+                 for issue in issues:
+                     severity = issue.get('severity', 'N/A').lower()
+                     message = f"**{issue.get('severity', 'N/A').upper()}**: {issue.get('message', 'N/A')}"
+
+                     if severity == 'critical':
+                         st.error(message)
+                     elif severity == 'warning':
+                         st.warning(message)
+                     elif severity == 'info':
+                         st.info(message)
+                     else:
+                         st.success(message)
+
+                     st.write(f"**Location**: {issue.get('location', 'N/A')}")
+                     st.write(f"**Suggestion**: {issue.get('suggestion', 'N/A')}")
+                     st.divider()
+             else:
+                 st.success("No issues found.")
+
+ # Footer
+ st.markdown("---")
+ st.markdown("© Graduate Center, CUNY. Developed to assist with dissertation and thesis review.")
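The report dictionary that `generate_report` returns (see `modules/report_generator.py` below) includes a `pdf_content` byte string that `app.py` stores in `st.session_state.report` but does not yet surface in the UI. A minimal sketch of how it could be offered for download, a hypothetical addition rather than part of this commit:

# Sketch only: offer the generated PDF inside the "Review Report" tab.
# Assumes st.session_state.report was populated by generate_report(), as in app.py above.
import streamlit as st

report = st.session_state.get("report")
if report and report.get("pdf_content"):
    st.download_button(
        label="Download PDF report",
        data=report["pdf_content"],  # bytes produced by report_generator
        file_name="dissertation_review_report.pdf",
        mime="application/pdf",
    )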
config.py ADDED
@@ -0,0 +1,181 @@
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ import os
+ import logging
+ from pathlib import Path
+ from typing import Dict, List, Optional
+ from pydantic import BaseModel, Field
+
+ # Logging configuration
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+     handlers=[
+         logging.FileHandler("dissistant.log"),
+         logging.StreamHandler()
+     ]
+ )
+
+ # Base directory
+ BASE_DIR = Path(__file__).resolve().parent
+
+ class Settings(BaseModel):
+     """Application settings"""
+     # Application settings
+     app_name: str = "Graduate Center Dissertation Compliance Assistant"
+     description: str = "A tool to check dissertations and theses for compliance with Graduate Center formatting and citation rules."
+     version: str = "0.1.0"
+     debug: bool = os.getenv("DEBUG", "False").lower() == "true"  # Default to False if not set
+
+     # Paths
+     rules_dir: Path = BASE_DIR / "rules"
+     formatting_rules_path: Path = rules_dir / "formatting_rules.md"
+     citation_rules_path: Path = rules_dir / "citation_rules.md"
+     metadata_rules_path: Path = rules_dir / "metadata_rules.md"
+
+     # LLM settings
+     llm_provider: str = os.getenv("LLM_PROVIDER", "openrouter").lower()  # 'local', 'openai', or 'openrouter'
+     llm_model_name: str = os.getenv("LLM_MODEL_NAME", "google/gemini-2.5-pro")
+     llm_base_url: str = os.getenv("LLM_API_BASE", "https://openrouter.ai/api/v1")
+     llm_api_key: str = os.getenv("LLM_API_KEY", "lm-studio")  # Default for local LM Studio
+
+     # OpenAI specific settings
+     openai_api_key: Optional[str] = os.getenv("OPENAI_API_KEY")
+     openai_model: str = os.getenv("OPENAI_MODEL", "gpt-4")
+
+     # OpenRouter specific settings
+     openrouter_api_key: Optional[str] = os.getenv("OPENROUTER_API_KEY")
+
+     # Document processing settings
+     max_file_size_mb: int = 50  # Maximum file size in MB
+     supported_file_types: List[str] = ["pdf", "docx"]
+
+     # Citation styles
+     citation_styles: List[str] = ["APA", "MLA", "Chicago", "Custom"]
+     default_citation_style: str = "APA"
+
+     # Department-specific settings
+     departments: List[str] = [
+         "General",
+         "English",
+         "History",
+         "Psychology",
+         "Computer Science",
+         "Other"
+     ]
+
+     # LLM prompt templates
+     formatting_analysis_template: str = """
+     You are an expert in academic document formatting. Analyze the following document excerpt for compliance with the institutional formatting rules.
+
+     FORMATTING RULES:
+     {formatting_rules}
+
+     DOCUMENT METADATA:
+     {document_metadata}
+
+     DOCUMENT EXCERPT:
+     {document_excerpt}
+
+     Identify any formatting issues in the document. For each issue, provide:
+     1. A description of the issue
+     2. The location in the document
+     3. The specific rule that is violated
+     4. A suggestion for how to fix the issue
+     5. The severity of the issue (critical, warning, or info)
+
+     Format your response as a JSON array of issues, with each issue having the following fields:
+     - "message": A clear description of the issue
+     - "location": Where in the document the issue occurs
+     - "rule": The specific rule that is violated
+     - "suggestion": How to fix the issue
+     - "severity": The severity level (critical, warning, or info)
+
+     If no issues are found, return an empty array.
+     """
+
+     citation_analysis_template: str = """
+     You are an expert in academic citation styles. Analyze the following document excerpt for compliance with the specified citation style.
+
+     CITATION STYLE: {citation_style}
+
+     CITATION STYLE GUIDELINES:
+     {citation_guidelines}
+
+     DOCUMENT EXCERPT:
+     {document_excerpt}
+
+     Identify any citation issues in the document. For each issue, provide:
+     1. A description of the issue
+     2. The problematic citation
+     3. The page or location where it appears
+     4. A suggestion for how to fix the issue
+     5. The severity of the issue (critical, warning, or info)
+
+     Format your response as a JSON array of issues, with each issue having the following fields:
+     - "message": A clear description of the issue
+     - "citation": The problematic citation
+     - "page": The page or location where it appears
+     - "suggestion": How to fix the issue
+     - "severity": The severity level (critical, warning, or info)
+
+     If no issues are found, return an empty array.
+     """
+
+     metadata_analysis_template: str = """
+     You are an expert in academic document structure. Analyze the following document front matter for compliance with the institutional metadata requirements.
+
+     METADATA REQUIREMENTS:
+     {metadata_requirements}
+
+     DOCUMENT FRONT MATTER:
+     {front_matter}
+
+     Identify any metadata or front matter issues in the document. For each issue, provide:
+     1. A description of the issue
+     2. The specific element that is problematic
+     3. A suggestion for how to fix the issue
+     4. The severity of the issue (critical, warning, or info)
+
+     Format your response as a JSON array of issues, with each issue having the following fields:
+     - "message": A clear description of the issue
+     - "element": The specific element that is problematic
+     - "suggestion": How to fix the issue
+     - "severity": The severity level (critical, warning, or info)
+
+     If no issues are found, return an empty array.
+     """
+
+     overall_analysis_template: str = """
+     You are an expert in academic document formatting and citation. Review the following analysis results and provide an overall assessment of the document's compliance with institutional requirements.
+
+     FORMATTING ISSUES:
+     {formatting_issues}
+
+     CITATION ISSUES:
+     {citation_issues}
+
+     METADATA ISSUES:
+     {metadata_issues}
+
+     Provide:
+     1. An overall assessment of the document's compliance
+     2. A list of key recommendations for improving the document
+
+     Format your response as a JSON object with the following fields:
+     - "overall_assessment": A paragraph summarizing the document's compliance status
+     - "recommendations": An array of specific recommendations for improving the document
+
+     Be constructive and helpful in your assessment and recommendations.
+     """
+
+ # Instantiate settings
+ settings = Settings()
+
+ if __name__ == "__main__":
+     # Print out the settings for verification if run directly
+     print("Application Settings:")
+     for field_name, value in settings.model_dump().items():
+         if not isinstance(value, str) or len(value) < 100:  # Skip printing long strings like templates
+             print(f" {field_name}: {value}")
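Settings above are resolved from environment variables (or a `.env` file via `load_dotenv()`) at import time. A minimal sketch of checking what `config.settings` picks up; the variable names come from `config.py`, and the values shown here are placeholders:

# Sketch only: set a few of the environment variables config.py reads, then import it.
# load_dotenv() does not override variables that are already set in the environment.
import os

os.environ["LLM_PROVIDER"] = "openrouter"  # placeholder values
os.environ["LLM_MODEL_NAME"] = "google/gemini-2.5-pro"
os.environ["DEBUG"] = "true"

from config import settings  # logging setup and Settings() instantiation run on import

print(settings.llm_provider)    # "openrouter"
print(settings.llm_model_name)  # "google/gemini-2.5-pro"
print(settings.debug)           # True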
dissistant.pid ADDED
@@ -0,0 +1 @@
+ 7465
modules/llm_interface.py ADDED
@@ -0,0 +1,158 @@
+ import logging
+ from typing import Dict, Any
+ from config import settings
+ import xml.etree.ElementTree as ET
+ from openai import OpenAI
+ import base64
+ import re
+
+ logger = logging.getLogger(__name__)
+
+ def _extract_xml_block(text: str, tag_name: str) -> str:
+     """
+     Extracts the last complete XML block from a string, ignoring surrounding text.
+     """
+     # This regex finds all occurrences of the specified XML block
+     matches = re.findall(f"<{tag_name}.*?</{tag_name}>", text, re.DOTALL)
+     if matches:
+         # Return the last match, which should be the assistant's response
+         return matches[-1]
+     logger.error(f"Could not find <{tag_name}> block in text: {text}")
+     return ""
+
+ def analyze_with_llm(
+     pdf_file: Any,
+     metadata: Dict[str, Any]
+ ) -> Dict[str, Any]:
+     """
+     Perform compliance analysis with an LLM using a single, unified prompt.
+     """
+     logger.info("Performing compliance analysis with LLM.")
+
+     # Create a unified prompt
+     unified_prompt = f"""
+     You are an expert in academic document formatting and citation. Your goal is to analyze the user's document for compliance with the Graduate Center's formatting rules and generate a comprehensive compliance report in XML format.
+
+     Your response MUST be in the following XML format. Do not include any other text or explanations outside of the XML structure.
+
+     <compliance_report>
+     <summary>
+     <overall_assessment></overall_assessment>
+     <total_issues></total_issues>
+     <critical_issues></critical_issues>
+     <warning_issues></warning_issues>
+     </summary>
+     <recommendations>
+     <recommendation></recommendation>
+     </recommendations>
+     <issues>
+     <issue severity="critical/warning/info">
+     <message></message>
+     <location></location>
+     <suggestion></suggestion>
+     </issue>
+     </issues>
+     </compliance_report>
+
+     **Formatting Rules to Enforce**
+
+     {get_formatting_rules()}
+
+     **Document Metadata**
+
+     {metadata}
+
+     **Instructions**
+
+     Please analyze the attached PDF document and generate the compliance report.
+
+     **Important Considerations for Analysis:**
+     * **Citation Style and Department:** Determine the citation style (e.g., APA, MLA, Chicago) and the author's department from the document's content. The document should follow the style manual for its discipline.
+     * **Page Numbering:** When reporting the location of an issue, use the page number exactly as it is written in the document (e.g., 'vii', '12'). Do not use the PDF reader's page count (unless necessary to clarify).
+     * **Visual Formatting:** When assessing visual properties like line spacing, margins, or font size from a PDF, be aware that text extraction can be imperfect. Base your findings on clear and consistent evidence throughout the document. Do not flag minor variations that could be due to PDF rendering. For example, only flag a line spacing issue if it is consistently incorrect across multiple pages and sections. Assume line spacing is correct unless it is obviously and consistently wrong.
+     * **Rule Interpretation:** Apply the formatting rules strictly but fairly. If a rule is ambiguous, note the ambiguity in your assessment.
+     * **Completeness:** Ensure that you check every rule against the document and that your report is complete.
+     """
+
+     # Initialize the OpenAI client
+     client = OpenAI(
+         base_url=settings.llm_base_url,
+         api_key=settings.openrouter_api_key,
+     )
+
+     # Read the PDF and encode it as base64
+     base64_pdf = base64.b64encode(pdf_file).decode('utf-8')
+
+     try:
+         completion = client.chat.completions.create(
+             model=settings.llm_model_name,
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": unified_prompt},
+                         {
+                             "type": "file",
+                             "file": {
+                                 "file_data": f"data:application/pdf;base64,{base64_pdf}"
+                             }
+                         }
+                     ],
+                 }
+             ],
+         )
+         raw_response = completion.choices[0].message.content
+     except Exception as e:
+         logger.error(f"An error occurred: {e}")
+         return {"error": "An error occurred while communicating with the LLM."}
+
+     clean_xml = _extract_xml_block(raw_response, "compliance_report")
+     if not clean_xml:
+         logger.error("Could not extract <compliance_report> XML block from the response.")
+         return {"error": "Could not extract <compliance_report> XML block from the response."}
+
+     logger.info(f"Final assembled report:\n{clean_xml}")
+
+     # Parse the final XML output
+     try:
+         root = ET.fromstring(clean_xml)
+
+         summary_node = root.find("summary")
+         summary = {
+             "overall_assessment": summary_node.findtext("overall_assessment", "No assessment available."),
+             "total_issues": summary_node.findtext("total_issues", "N/A"),
+             "critical_issues": summary_node.findtext("critical_issues", "N/A"),
+             "warning_issues": summary_node.findtext("warning_issues", "N/A"),
+         } if summary_node is not None else {}
+
+         issues = []
+         for issue_node in root.findall(".//issue"):
+             issues.append({
+                 "severity": issue_node.get("severity"),
+                 "message": issue_node.findtext("message"),
+                 "location": issue_node.findtext("location"),
+                 "suggestion": issue_node.findtext("suggestion"),
+             })
+
+         recommendations = [rec.text for rec in root.findall(".//recommendation")]
+
+         return {
+             "raw_xml": clean_xml,
+             "summary": summary,
+             "issues": issues,
+             "recommendations": recommendations,
+         }
+
+     except ET.ParseError as e:
+         logger.error(f"Failed to parse final LLM output: {e}", exc_info=True)
+         return {
+             "raw_xml": raw_response,
+             "error": "Failed to parse final LLM output."
+         }
+
+ def get_formatting_rules() -> str:
+     """
+     Load the formatting rules from the markdown file.
+     """
+     with open(settings.formatting_rules_path, "r") as f:
+         return f.read()
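`analyze_with_llm` takes raw PDF bytes plus a metadata dict and returns either a parsed report dict (`summary`, `issues`, `recommendations`, `raw_xml`) or a dict with an `error` key. A minimal calling sketch, assuming `OPENROUTER_API_KEY` is configured and `rules/formatting_rules.md` exists; `sample.pdf` is a placeholder file name:

# Sketch only: run the analysis outside Streamlit.
# Assumes the OpenRouter key and rules file are in place; "sample.pdf" is a placeholder.
from modules.llm_interface import analyze_with_llm

with open("sample.pdf", "rb") as f:
    pdf_bytes = f.read()

result = analyze_with_llm(pdf_file=pdf_bytes, metadata={"title": "sample.pdf"})

if "error" in result:
    print("Analysis failed:", result["error"])
else:
    print(result["summary"].get("overall_assessment"))
    for issue in result["issues"]:
        print(issue["severity"], "-", issue["message"])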
modules/report_generator.py ADDED
@@ -0,0 +1,72 @@
+ import logging
+ from typing import Dict, Any
+ from fpdf import FPDF
+ import xml.etree.ElementTree as ET
+
+ logger = logging.getLogger(__name__)
+
+ def generate_report(analysis_results: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Generate a review report from the LLM's analysis results.
+     """
+     report = {
+         "summary": {},
+         "issues": [],
+         "recommendations": [],
+         "pdf_content": b""
+     }
+
+     # Parse the XML output
+     raw_xml = analysis_results.get("raw_xml", "").strip()
+     if not raw_xml:
+         logger.error("Received empty or missing XML for report generation.")
+         return report
+
+     try:
+         root = ET.fromstring(raw_xml)
+
+         # Extract summary
+         summary_node = root.find("summary")
+         if summary_node is not None:
+             report["summary"] = {
+                 "overall_assessment": summary_node.findtext("overall_assessment", "N/A"),
+                 "total_issues": summary_node.findtext("total_issues", "N/A"),
+                 "critical_issues": summary_node.findtext("critical_issues", "N/A"),
+                 "warning_issues": summary_node.findtext("warning_issues", "N/A"),
+             }
+
+         # Extract issues
+         for issue_node in root.findall(".//issue"):
+             issue = {
+                 "severity": issue_node.get("severity"),
+                 "message": issue_node.findtext("message"),
+                 "location": issue_node.findtext("location"),
+                 "suggestion": issue_node.findtext("suggestion")
+             }
+             report["issues"].append(issue)
+
+         # Extract recommendations
+         for rec_node in root.findall(".//recommendation"):
+             report["recommendations"].append(rec_node.text)
+
+     except ET.ParseError as e:
+         logger.error(f"Failed to parse XML for report generation: {e}", exc_info=True)
+         # Handle XML parsing errors
+         pass
+
+     # Generate PDF report
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font("Arial", size=12)
+
+     pdf.cell(200, 10, txt="Dissertation Review Report", ln=True, align="C")
+
+     # TODO: Add more details to the PDF report
+
+     report["pdf_content"] = pdf.output(dest='S').encode('latin-1')
+
+     return report
+
+ if __name__ == "__main__":
+     # This module is intended to be imported, not run directly.
+     pass
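Because `generate_report` only needs a dict with a `raw_xml` string in the `<compliance_report>` shape requested by the prompt in `modules/llm_interface.py`, it can be exercised offline. A minimal sketch with a hand-written payload:

# Sketch only: feed generate_report a hand-written <compliance_report> payload
# to see how the XML maps onto the report dict (no LLM call involved).
from modules.report_generator import generate_report

sample_xml = """
<compliance_report>
  <summary>
    <overall_assessment>Mostly compliant.</overall_assessment>
    <total_issues>1</total_issues>
    <critical_issues>0</critical_issues>
    <warning_issues>1</warning_issues>
  </summary>
  <recommendations>
    <recommendation>Double-space the abstract.</recommendation>
  </recommendations>
  <issues>
    <issue severity="warning">
      <message>Abstract appears single-spaced.</message>
      <location>Page iv</location>
      <suggestion>Set the abstract to double spacing.</suggestion>
    </issue>
  </issues>
</compliance_report>
"""

report = generate_report({"raw_xml": sample_xml})
print(report["summary"]["overall_assessment"])   # Mostly compliant.
print(len(report["issues"]), "issue(s) found")   # 1 issue(s) found
print(len(report["pdf_content"]), "bytes of PDF")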
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ streamlit
+ pydantic
+ python-dotenv
+ PyPDF2
+ python-docx
+ pdf2image
+ pytesseract
+ openai
+ guidance
+ fpdf
rules/formatting_rules.md ADDED
@@ -0,0 +1,98 @@
+ # Graduate Center Dissertation Formatting Rules
+
+ ## Page Sequence
+ Adhere to the following page sequence when preparing your manuscript. Note that bibliographic references should be the last section of the document, appearing after all appendices, glossaries, illustrations, or other back matter.
+
+ 1. **Title page** (no pagination appears)*
+ 2. **Copyright page**
+ 3. **Approval page**
+ 4. **Abstract**
+ 5. **Preface, Foreword, and/or Acknowledgments** (optional)
+ 6. **Table of Contents** (required)
+ 7. **Lists of tables, illustrations, charts, figures, diagrams** (if applicable)
+ 8. **Digital Manifest** (required for all digital projects)
+ 9. **A Note on Technical Specifications** (required for all digital projects)
+ 10. **Body of Text** (begins with Arabic numeral 1)
+ 11. **Appendix or Appendices** (including Data Dictionary, if applicable)
+ 12. **Bibliography / References**
+ 13. **Autobiographical Statement** (optional)
+
+ * Use the degree year regardless of deposit date (February 2025 grads should have 2025 on their title page even if the deposit is in 2024).
+
+ ## Text Format Guidelines
+
+ ### Abstracts
+ - There is no word limit for abstracts.
+ - For digital projects, abstracts should describe the project scope and include relevant URLs for associated elements such as videos, websites, or code repositories (e.g., GitHub link); if applicable, describe what data has been collected.
+ - Abstracts will be published in the CUNY Academic Works repository and ProQuest (if applicable) with author, title, and descriptive information, even if the work is embargoed.
+ - An abstract in English is required, even if the text is in a language other than English.
+
+ ### Approval Page
+ - The full title, author, manuscript statement, and month/year that the manuscript was approved are included on this page.
+ - The approval page lists the primary advisor(s), executive officer or program director, and supervisory committee (if applicable) for the work being submitted.
+
+ ### Color
+ - PDF and print reproductions may include color, and ProQuest reproductions will include any color submitted.
+
+ ### Data Dictionary (if applicable)
+ - For projects that involve code or datasets, a Data Dictionary or equivalent must be included.
+ - A Data Dictionary is typically located in the Appendix or in a separate file, and should be formatted according to your disciplinary conventions.
+ - If it is submitted as a separate file, include the file name and format in the Digital Manifest.
+ - Content could detail significant variables and critical functions.
+
+ ### Digital Manifest (Required for Digital Projects)
+ - All dissertations, theses, and capstone projects that contain digital projects must include a “Digital Manifest” in the preliminary pages.
+ - Formatted like a Table of Contents, this section provides a master list of all the components—print and digital—that constitute the project.
+ - For each component, indicate its file type, a brief description, and URL, if applicable.
+
+ ### A Note on Technical Specifications (Required for Digital Projects)
+ - All dissertations, theses, and capstone projects that contain digital projects must include a “Note on Technical Specifications” in the preliminary pages.
+ - This section provides a high-level overview of the project’s components and technical specifications, analogous to a “readme” file.
+ - Include information about components housed outside of the library deposit, such as GitHub repositories, and where to find the latest version of materials.
+
+ ### File Formats
+ - ProQuest requires Adobe PDF (not Word) for text.
+ - Rather than embedding audio and video files in the PDF body of a work, submit as supplemental files.
+
+ ### Font
+ - Any legible TrueType type 1 font is accepted (except script, italic, or ornamental fonts) if equivalent in scale to Arial (10 pt) or Times New Roman (12 pt).
+ - Acceptable fonts and sizes for print and web include: Arial (10 pt), Courier New (10 pt), Georgia (11 pt), Times New Roman (12 pt), Trebuchet MS (10 pt), Verdana (10 pt).
+
+ ### Line-spacing
+ - Double-space abstract, dedication, acknowledgements, table of contents, and body of the manuscript.
+ - Follow your disciplinary style manual for single- or double-spacing block quotes, captions, items in tables, lists, graphs, charts.
+ - Single-space footnotes/endnotes.
+
+ ### Lists of Contents
+ - A table of contents is required.
+ - If illustrations, charts, diagrams, figures or other tables appear in the work, a list of each named element, with corresponding pagination, is required.
+
+ ### Margins
+ - Left: 1"; right: 1"; top and bottom: 1" for all text (except page numbers and headers/footers) and figures, footnotes/endnotes, and images.
+
+ ### Page numbers
+ - Must appear at least ¾" from any edge of the page.
+
+ ### Pagination
+ - Pages preliminary to the body of the text must be numbered with lowercase Roman numerals.
+ - Do not number the title page but count it in the preliminary pagination.
+ - The body of the text is numbered with Arabic numerals beginning with the first page of text and including illustrations, appendix, and bibliography.
+ - Except for the title page, all pages must be numbered.
+ - The numerals may appear in any location on the page (bottom middle, bottom right, upper right), but must be consistent throughout the work.
+
+ ### Quotations
+ - Consult copyright fair use guidelines.
+ - Include permission letters for use of copyrighted materials that exceeds fair use (photographs, charts, tables, etc.).
+ - Submit copyright permission letters as supplemental files as part of your online submission.
+
+ ### References / Bibliography
+ - In your references section, include the platforms, software libraries, and code used in your project. These can be separated from other bibliographic citations included in your manuscript if desired.
+
+ ### Style and Style Manuals
+ - Use the style manual for your discipline except when in conflict with these instructions.
+ - Consult dissertation advisors as necessary.
+
+ ### Title Page
+ - The title page must include the full dissertation title, the complete name of the author, the dissertation statement, and the year of the degree.
+ - Use words to spell out titles including formulas, symbols, superscripts, subscripts, and Greek letters.
+ - While prohibited from the title, symbols may be used throughout the text.
startup_dissistant.sh ADDED
@@ -0,0 +1,93 @@
+ #!/bin/bash
+ # Startup script for Dissistant
+ # This script starts the Streamlit service and exposes it via Tailscale Serve
+
+ # Exit on error
+ set -e
+
+ # --- Configuration for Streamlit App ---
+ STREAMLIT_APP_FILE="app.py"
+ STREAMLIT_PORT="8502"
+ STREAMLIT_PID_FILE="dissistant.pid"
+ STREAMLIT_LOG_FILE="dissistant.log"
+ # --- End Configuration ---
+
+ # Check if UV is installed
+ if ! command -v uv &> /dev/null; then
+     echo "Error: UV is not installed. Please install UV first."
+     echo "You can install UV with: pip install uv"
+     exit 1
+ fi
+
+ # Create virtual environment if it doesn't exist
+ if [ ! -d ".venv" ]; then
+     echo "Creating virtual environment..."
+     uv venv
+ fi
+
+ # Activate virtual environment
+ echo "Activating virtual environment..."
+ source .venv/bin/activate
+
+ # Install dependencies
+ echo "Installing dependencies..."
+ uv pip install -r requirements.txt
+
+ # Kill any existing instances of the Streamlit app
+ echo "Stopping any existing instances of the Streamlit app..."
+ if [ -f "$STREAMLIT_PID_FILE" ]; then
+     OLD_STREAMLIT_PID=$(cat $STREAMLIT_PID_FILE)
+     if ps -p $OLD_STREAMLIT_PID > /dev/null; then
+         kill $OLD_STREAMLIT_PID
+         echo "Killed existing Streamlit app process with PID $OLD_STREAMLIT_PID"
+         sleep 1  # Give it time to shut down
+     else
+         echo "No running Streamlit app process found with PID $OLD_STREAMLIT_PID"
+     fi
+ fi
+ # Also try to kill any other streamlit processes for this specific app file and port
+ pkill -f "streamlit run $STREAMLIT_APP_FILE --server.port $STREAMLIT_PORT" || true
+ sleep 1
+
+ # Start the Streamlit app
+ echo "Starting Streamlit app on port $STREAMLIT_PORT..."
+ nohup streamlit run $STREAMLIT_APP_FILE --server.port $STREAMLIT_PORT --server.headless true > $STREAMLIT_LOG_FILE 2>&1 &
+ echo $! > $STREAMLIT_PID_FILE
+
+ # Check if the Streamlit service started successfully
+ sleep 3  # Give Streamlit a bit more time to start
+ if ! nc -z localhost $STREAMLIT_PORT; then
+     echo "Error: Failed to start Streamlit app on port $STREAMLIT_PORT."
+     cat $STREAMLIT_LOG_FILE  # Output log file for debugging
+     exit 1
+ else
+     echo "Streamlit app started successfully on port $STREAMLIT_PORT."
+ fi
+
+ # Check if Tailscale is installed
+ if ! command -v tailscale &> /dev/null; then
+     echo "Warning: Tailscale is not installed. The app will only be available locally."
+     echo "Install Tailscale to expose the service over your tailnet."
+ else
+     # Expose the service via Tailscale Serve
+     echo "Exposing Streamlit app via Tailscale Serve on port $STREAMLIT_PORT..."
+     echo "Setting up Funnel on port 10000..."
+     tailscale funnel --https=10000 --bg localhost:$STREAMLIT_PORT
+
+     # Get the Tailscale hostname
+     HOSTNAME=$(tailscale status --json | jq -r '.Self.DNSName')
+     if [ -n "$HOSTNAME" ]; then
+         echo "App may be available at a Tailscale URL. Check 'tailscale status' for details."
+         echo "If using a funnel, it might be https://$HOSTNAME/"
+     else
+         echo "App is exposed via Tailscale Serve, but couldn't determine the primary hostname."
+         echo "Check 'tailscale status' for details."
+     fi
+ fi
+
+ echo "Dissistant is now running!"
+ echo "Local URL: http://localhost:$STREAMLIT_PORT"
+ echo "Log file: $STREAMLIT_LOG_FILE"
+ echo "PID file: $STREAMLIT_PID_FILE"
+ echo ""
+ echo "If Tailscale is active, the app should be accessible via a Tailscale funnel URL."
utils/llm_utils.py ADDED
@@ -0,0 +1,23 @@
+ import logging
+ import httpx
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+
+ logger = logging.getLogger(__name__)
+
+ RETRYABLE_EXCEPTIONS = (
+     httpx.TimeoutException,
+     httpx.NetworkError,
+ )
+
+ @retry(
+     stop=stop_after_attempt(3),
+     wait=wait_exponential(multiplier=1, min=4, max=10),
+     retry=retry_if_exception_type(RETRYABLE_EXCEPTIONS),
+     before_sleep=lambda retry_state: logger.info(f"Retrying LLM call due to {retry_state.outcome.exception()}, attempt {retry_state.attempt_number + 1}...")
+ )
+ def call_llm_with_retry(llm, program):
+     """
+     Executes a guidance program with a given LLM, with retry logic.
+     The program is called with the llm, i.e., program(llm).
+     """
+     return program(llm)
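`call_llm_with_retry` simply invokes `program(llm)` and relies on tenacity to retry the listed httpx exceptions up to three times. A minimal sketch of the retry behavior with stand-in objects (the guidance program and LLM are stubbed here, so this does not reflect real call sites):

# Sketch only: demonstrate the retry wrapper with stand-in objects.
# A real caller would pass a guidance program and an LLM; both are stubbed here.
import httpx
from utils.llm_utils import call_llm_with_retry

attempts = {"count": 0}

def flaky_program(llm):
    # Fails twice with a retryable error, then succeeds on the third attempt.
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise httpx.TimeoutException("simulated timeout")
    return f"ok after {attempts['count']} attempts with {llm}"

print(call_llm_with_retry("dummy-llm", flaky_program))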