shayan5422 committed on
Commit
a469ee1 · verified · 1 Parent(s): 3c2310b

Upload 9 files

Files changed (9)
  1. DEPLOYMENT_INSTRUCTIONS.txt +39 -0
  2. Dockerfile +32 -0
  3. README.md +126 -5
  4. app.py +33 -0
  5. converter.py +878 -0
  6. preserve_linebreaks.lua +29 -0
  7. requirements.txt +7 -0
  8. temp/.DS_Store +0 -0
  9. web_api.py +443 -0
DEPLOYMENT_INSTRUCTIONS.txt ADDED
@@ -0,0 +1,39 @@
+
+ # Deployment Guide for Hugging Face Spaces
+
+ ## Step 1: Create a Space
+ 1. Go to https://huggingface.co/spaces
+ 2. Click "Create new Space"
+ 3. Enter the Space name: docx-to-latex
+ 4. Select Docker as the SDK
+ 5. Click "Create Space"
+
+ ## Step 2: Upload the Files
+ Upload all files in this folder to the Space:
+
+ ### Method 1: Git
+ ```bash
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/docx-to-latex
+ cd docx-to-latex
+ cp -r ../huggingface_deployment/* .
+ git add .
+ git commit -m "Add DOCX to LaTeX converter API"
+ git push
+ ```
+
+ ### Method 2: Web Interface
+ Drag and drop the files onto the Space page
+
+ ## Step 3: Test
+ After deployment, the API will be available at:
+ https://YOUR_USERNAME-docx-to-latex.hf.space/api/health
+
+ ## Copied Files:
+ - app.py
+ - web_api.py
+ - converter.py
+ - requirements.txt
+ - README.md
+ - Dockerfile
+ - .gitignore
+ - preserve_linebreaks.lua
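Once the Space has built, the quickest sanity check is the health endpoint from Step 3. A minimal sketch using `requests` (assumed to be available on your machine; YOUR_USERNAME is a placeholder for your Hugging Face username):

```python
import requests

# Placeholder URL: substitute your own username.
BASE_URL = "https://YOUR_USERNAME-docx-to-latex.hf.space"

resp = requests.get(f"{BASE_URL}/api/health", timeout=30)
resp.raise_for_status()
# Expected payload: {'status': 'healthy', 'message': 'DOCX to LaTeX API is running'}
print(resp.json())
```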
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM python:3.9-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     pandoc \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy core application files
+ COPY app.py .
+ COPY web_api.py .
+ COPY converter.py .
+ COPY preserve_linebreaks.lua .
+
+ # Create necessary directories
+ RUN mkdir -p temp/uploads temp/outputs
+
+ # Expose port
+ EXPOSE 7860
+
+ # Set environment variables
+ ENV PYTHONPATH=/app
+ ENV PORT=7860
+
+ # Run the application
+ CMD ["python", "app.py"]
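To smoke-test this image locally before pushing, one option is to build and run it (for example `docker build -t docx-to-latex .` followed by `docker run -p 7860:7860 docx-to-latex`) and then poll the exposed health endpoint from the host. A rough sketch, assuming `requests` is installed on the host:

```python
import time

import requests

URL = "http://127.0.0.1:7860/api/health"  # port 7860 is exposed by the Dockerfile

# Poll for up to ~60 seconds while the container starts.
for _ in range(30):
    try:
        resp = requests.get(URL, timeout=2)
        if resp.status_code == 200:
            print("API is up:", resp.json())
            break
    except requests.ConnectionError:
        pass
    time.sleep(2)
else:
    raise SystemExit("API did not become healthy in time")
```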
README.md CHANGED
@@ -1,10 +1,131 @@
  ---
- title: Docx To Latex
- emoji: 🐢
- colorFrom: pink
- colorTo: yellow
  sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: DOCX to LaTeX Converter
+ emoji: 📄
+ colorFrom: blue
+ colorTo: green
  sdk: docker
+ app_port: 7860
  pinned: false
+ license: mit
  ---

+ # 📄 DOCX to LaTeX Converter API
+
+ تبدیل‌کننده حرفه‌ای فایل‌های Word (DOCX) به LaTeX با قابلیت‌های پیشرفته
+
+ A professional DOCX to LaTeX converter with advanced features and a modern web interface.
+
+ ## 🌟 ویژگی‌ها / Features
+
+ ### فارسی
+ - ✅ تبدیل فایل‌های DOCX به LaTeX با کیفیت بالا
+ - ✅ استخراج و حفظ تصاویر
+ - ✅ سازگار با Overleaf
+ - ✅ حفظ فرمت‌ها و استایل‌ها
+ - ✅ تولید فهرست مطالب خودکار
+ - ✅ دانلود فایل کامل در قالب ZIP
+ - ✅ رابط API ساده و قدرتمند
+ - ✅ اجرای رایگان روی Hugging Face Spaces
+
+ ### English
+ - ✅ High-quality DOCX to LaTeX conversion
+ - ✅ Image extraction and preservation
+ - ✅ Overleaf compatibility
+ - ✅ Style and formatting preservation
+ - ✅ Automatic table of contents generation
+ - ✅ Complete ZIP package download
+ - ✅ Simple and powerful API interface
+ - ✅ Free hosting on Hugging Face Spaces
+
+ ## 🚀 استفاده / Usage
+
+ ### API Endpoints
+
+ #### 1. Health Check
+ ```bash
+ GET /api/health
+ ```
+
+ #### 2. Upload File
+ ```bash
+ POST /api/upload
+ Content-Type: multipart/form-data
+ Body: file (DOCX file)
+ ```
+
+ #### 3. Convert Document
+ ```bash
+ POST /api/convert
+ Content-Type: application/json
+ Body: {
+   "task_id": "string",
+   "output_filename": "string",
+   "options": {
+     "generateToc": boolean,
+     "extractMedia": boolean,
+     "overleafCompatible": boolean,
+     "preserveStyles": boolean,
+     "preserveLineBreaks": boolean
+   }
+ }
+ ```
+
+ #### 4. Download Complete Package
+ ```bash
+ GET /api/download-complete/{task_id}
+ ```
+
+ ### مثال استفاده / Example Usage
+
+ ```python
+ import requests
+
+ # Upload file
+ with open('document.docx', 'rb') as f:
+     response = requests.post('https://YOUR_USERNAME-docx-to-latex.hf.space/api/upload',
+                              files={'file': f})
+ task_id = response.json()['task_id']
+
+ # Convert
+ convert_response = requests.post('https://YOUR_USERNAME-docx-to-latex.hf.space/api/convert',
+                                  json={
+                                      'task_id': task_id,
+                                      'options': {
+                                          'generateToc': True,
+                                          'extractMedia': True,
+                                          'overleafCompatible': True
+                                      }
+                                  })
+
+ # Download complete package
+ download_response = requests.get(f'https://YOUR_USERNAME-docx-to-latex.hf.space/api/download-complete/{task_id}')
+ with open('converted_package.zip', 'wb') as f:
+     f.write(download_response.content)
+ ```
+
+ ## 🔧 نصب محلی / Local Installation
+
+ ```bash
+ git clone https://github.com/YOUR_USERNAME/docx-to-latex.git
+ cd docx-to-latex
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ ## 📚 مستندات / Documentation
+
+ این API امکان تبدیل فایل‌های Word به LaTeX با حفظ فرمت‌ها، تصاویر و جداول را فراهم می‌کند. خروجی نهایی شامل فایل LaTeX و پوشه تصاویر در قالب ZIP است که مستقیماً در Overleaf قابل استفاده است.
+
+ This API provides seamless conversion from Word documents to LaTeX while preserving formatting, images, and tables. The final output includes the LaTeX file and the media folder in a ZIP package ready for use in Overleaf.
+
+ ## 🤝 مشارکت / Contributing
+
+ از مشارکت‌ها استقبال می‌شود! لطفاً Issue ایجاد کرده یا Pull Request ارسال کنید.
+
+ Contributions are welcome! Please feel free to submit issues or pull requests.
+
+ ## 📄 مجوز / License
+
+ MIT License - برای جزئیات فایل LICENSE را مشاهده کنید.
+
+ MIT License - see LICENSE file for details.
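One caveat for the Local Installation section above: `pip install -r requirements.txt` only installs the Python packages, while Pandoc itself is a separate system binary (the Dockerfile installs it with apt-get). A small check using the pypandoc package already pinned in requirements.txt:

```python
import pypandoc

# Pandoc is a system dependency; pip only covers the Python side.
# This fails fast if the pandoc binary is missing from PATH.
try:
    print("pandoc version:", pypandoc.get_pandoc_version())
except OSError:
    raise SystemExit("Pandoc not found -- install it (e.g. apt-get install pandoc) and retry")
```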
app.py ADDED
@@ -0,0 +1,33 @@
+ #!/usr/bin/env python3
+ """
+ DOCX to LaTeX Converter API
+ Main entry point for Hugging Face Spaces deployment
+ """
+
+ import os
+ import sys
+
+ # Set up environment for Hugging Face Spaces
+ if 'SPACE_ID' in os.environ:
+     # Running on Hugging Face Spaces
+     PORT = int(os.environ.get('PORT', 7860))
+     HOST = '0.0.0.0'
+ else:
+     # Running locally
+     PORT = 5001
+     HOST = '127.0.0.1'
+
+ # Import the Flask app
+ from web_api import app
+
+ if __name__ == "__main__":
+     print(f"🚀 Starting DOCX to LaTeX Converter API")
+     print(f"🌐 Server running on http://{HOST}:{PORT}")
+     print(f"📖 Health check: http://{HOST}:{PORT}/api/health")
+     print(f"📚 API Documentation: https://huggingface.co/spaces/YOUR_USERNAME/docx-to-latex")
+
+     app.run(
+         host=HOST,
+         port=PORT,
+         debug=False  # Disable debug in production
+     )
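Since app.py only re-exports the Flask app defined in web_api.py, the API can also be exercised in-process with Flask's test client, without binding a port. A minimal sketch:

```python
from web_api import app

# Flask's built-in test client drives the app in-process; no server needed.
with app.test_client() as client:
    resp = client.get("/api/health")
    print(resp.status_code, resp.get_json())
    # expected: 200 {'status': 'healthy', 'message': 'DOCX to LaTeX API is running'}
```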
converter.py ADDED
@@ -0,0 +1,878 @@
1
+ import pypandoc
2
+ import os
3
+ import re
4
+ import tempfile
5
+
6
+ def convert_docx_to_latex(
7
+ docx_path: str,
8
+ latex_path: str,
9
+ generate_toc: bool = False,
10
+ extract_media_to_path: str = None,
11
+ latex_template_path: str = None,
12
+ overleaf_compatible: bool = False,
13
+ preserve_styles: bool = True,
14
+ preserve_linebreaks: bool = True
15
+ ) -> tuple[bool, str]:
16
+ """
17
+ Converts a DOCX file to a LaTeX file using pypandoc with enhanced features.
18
+
19
+ Args:
20
+ docx_path: Path to the input .docx file.
21
+ latex_path: Path to save the output .tex file.
22
+ generate_toc: If True, attempts to generate a Table of Contents.
23
+ extract_media_to_path: If specified, path to extract media to (e.g., "./media").
24
+ latex_template_path: If specified, path to a custom Pandoc LaTeX template file.
25
+ overleaf_compatible: If True, makes images work in Overleaf with relative paths.
26
+ preserve_styles: If True, preserves document styles like centering and alignment.
27
+ preserve_linebreaks: If True, preserves line breaks and proper list formatting.
28
+
29
+ Returns:
30
+ A tuple (success: bool, message: str).
31
+ """
32
+ extra_args = []
33
+
34
+ # Ensure standalone document (not fragment)
35
+ extra_args.append("--standalone")
36
+
37
+ # Basic options
38
+ if generate_toc:
39
+ extra_args.append("--toc")
40
+ if extract_media_to_path:
41
+ extra_args.append(f"--extract-media={extract_media_to_path}")
42
+ if latex_template_path and os.path.isfile(latex_template_path):
43
+ extra_args.append(f"--template={latex_template_path}")
44
+ elif latex_template_path:
45
+ pass # Template not found, Pandoc will handle the error
46
+
47
+ # Enhanced features
48
+ if overleaf_compatible:
49
+ extra_args.extend([
50
+ "--resource-path=./",
51
+ "--default-image-extension=png"
52
+ ])
53
+
54
+ if preserve_styles:
55
+ extra_args.extend([
56
+ "--from=docx+styles",
57
+ "--wrap=preserve",
58
+ "--columns=72",
59
+ "--strip-comments" # Remove comments that might cause highlighting
60
+ ])
61
+
62
+ if preserve_linebreaks:
63
+ extra_args.extend([
64
+ "--preserve-tabs",
65
+ "--wrap=preserve",
66
+ "--reference-doc=" + docx_path # Use original Word doc as reference for formatting
67
+ ])
68
+
69
+ # Create minimal Lua filter that preserves Word's original line breaks
70
+ lua_filter_content = '''
71
+ function Para(elem)
72
+ -- Preserve all line breaks exactly as they appear in Word
73
+ -- This maintains Word's original pagination and formatting
74
+ local new_content = {}
75
+
76
+ for i, item in ipairs(elem.content) do
77
+ if item.t == "SoftBreak" then
78
+ -- Convert all soft breaks to line breaks to match Word's formatting
79
+ table.insert(new_content, pandoc.LineBreak())
80
+ else
81
+ table.insert(new_content, item)
82
+ end
83
+ end
84
+
85
+ elem.content = new_content
86
+ return elem
87
+ end
88
+
89
+ function LineBlock(elem)
90
+ -- Preserve line blocks exactly as they are
91
+ return elem
92
+ end
93
+
94
+ function Span(elem)
95
+ -- Remove unwanted highlighting and formatting
96
+ if elem.attributes and elem.attributes.style then
97
+ -- Remove background colors and highlighting
98
+ local style = elem.attributes.style
99
+ if string.find(style, "background") or string.find(style, "highlight") then
100
+ elem.attributes.style = nil
101
+ end
102
+ end
103
+ return elem
104
+ end
105
+
106
+ function Div(elem)
107
+ -- Remove unwanted div formatting that causes highlighting
108
+ if elem.attributes and elem.attributes.style then
109
+ local style = elem.attributes.style
110
+ if string.find(style, "background") or string.find(style, "highlight") then
111
+ elem.attributes.style = nil
112
+ end
113
+ end
114
+ return elem
115
+ end
116
+
117
+ function RawBlock(elem)
118
+ -- Preserve raw LaTeX blocks
119
+ if elem.format == "latex" then
120
+ return elem
121
+ end
122
+ end
123
+ '''
124
+
125
+ # Create temporary Lua filter file
126
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.lua', delete=False) as f:
127
+ f.write(lua_filter_content)
128
+ lua_filter_path = f.name
129
+
130
+ extra_args.append(f"--lua-filter={lua_filter_path}")
131
+
132
+ try:
133
+ # Perform conversion
134
+ pypandoc.convert_file(docx_path, 'latex', outputfile=latex_path, extra_args=extra_args)
135
+
136
+ # Clean up temporary Lua filter if created
137
+ if 'lua_filter_path' in locals():
138
+ try:
139
+ os.unlink(lua_filter_path)
140
+ except OSError:
141
+ pass
142
+
143
+ # Apply post-processing enhancements (always applied for Unicode conversion)
144
+ _apply_post_processing(latex_path, overleaf_compatible, preserve_styles, preserve_linebreaks, extract_media_to_path)
145
+
146
+ # Generate status message
147
+ enhancements = []
148
+ if overleaf_compatible:
149
+ enhancements.append("Overleaf compatibility")
150
+ if preserve_styles:
151
+ enhancements.append("style preservation")
152
+ if preserve_linebreaks:
153
+ enhancements.append("line break preservation")
154
+
155
+ if enhancements:
156
+ enhancement_msg = f" with {', '.join(enhancements)}"
157
+ else:
158
+ enhancement_msg = ""
159
+
160
+ return True, f"Conversion successful{enhancement_msg}!"
161
+
162
+ except RuntimeError as e:
163
+ # Clean up temporary Lua filter if created
164
+ if 'lua_filter_path' in locals():
165
+ try:
166
+ os.unlink(lua_filter_path)
167
+ except OSError:
168
+ pass
169
+ return False, f"RuntimeError: Could not execute Pandoc. Please ensure Pandoc is installed and in your system's PATH. Error: {e}"
170
+ except Exception as e:
171
+ # Clean up temporary Lua filter if created
172
+ if 'lua_filter_path' in locals():
173
+ try:
174
+ os.unlink(lua_filter_path)
175
+ except OSError:
176
+ pass
177
+ return False, f"Conversion failed: {e}"
178
+
179
+ def _apply_post_processing(latex_path: str, overleaf_compatible: bool, preserve_styles: bool, preserve_linebreaks: bool, extract_media_to_path: str = None):
180
+ """
181
+ Apply post-processing enhancements to the generated LaTeX file.
182
+ """
183
+ try:
184
+ with open(latex_path, 'r', encoding='utf-8') as f:
185
+ content = f.read()
186
+
187
+ # Always inject essential packages for compilation compatibility
188
+ content = _inject_essential_packages(content)
189
+
190
+ # Fix mixed mathematical expressions first to remove duplicated text
191
+ content = _fix_mixed_mathematical_expressions(content)
192
+
193
+ # Convert Unicode mathematical characters to LaTeX equivalents (always applied)
194
+ content = _convert_unicode_math_characters(content)
195
+
196
+ # Apply additional Unicode cleanup as a safety net
197
+ content = _additional_unicode_cleanup(content)
198
+
199
+ # Apply overleaf compatibility fixes
200
+ if overleaf_compatible:
201
+ content = _fix_image_paths_for_overleaf(content, extract_media_to_path)
202
+
203
+ # Apply style preservation enhancements
204
+ if preserve_styles:
205
+ content = _inject_latex_packages(content)
206
+ content = _add_centering_commands(content)
207
+
208
+ # Apply line break preservation fixes
209
+ if preserve_linebreaks:
210
+ content = _fix_line_breaks_and_spacing(content)
211
+
212
+ # Remove unwanted formatting and highlighting
213
+ content = _remove_unwanted_formatting(content)
214
+
215
+ # Fix common LaTeX compilation issues
216
+ content = _fix_compilation_issues(content)
217
+
218
+ # Write back the processed content
219
+ with open(latex_path, 'w', encoding='utf-8') as f:
220
+ f.write(content)
221
+
222
+ except Exception as e:
223
+ # Post-processing failures shouldn't break the conversion
224
+ print(f"Warning: Post-processing failed: {e}")
225
+
226
+ def _inject_essential_packages(content: str) -> str:
227
+ """
228
+ Inject essential packages that are always needed for compilation.
229
+ """
230
+ # Core packages that Pandoc might not include but are often needed
231
+ essential_packages = [
232
+ r'\usepackage[utf8]{inputenc}', # UTF-8 input encoding
233
+ r'\usepackage[T1]{fontenc}', # Font encoding
234
+ r'\usepackage{graphicx}', # For images
235
+ r'\usepackage{longtable}', # For tables
236
+ r'\usepackage{booktabs}', # Better table formatting
237
+ r'\usepackage{hyperref}', # For links (if not already included)
238
+ r'\usepackage{amsmath}', # Mathematical formatting
239
+ r'\usepackage{amssymb}', # Mathematical symbols
240
+ r'\usepackage{textcomp}', # Additional text symbols
241
+ ]
242
+
243
+ documentclass_pattern = r'\\documentclass(?:\[[^\]]*\])?\{[^}]+\}'
244
+ documentclass_match = re.search(documentclass_pattern, content)
245
+
246
+ if documentclass_match:
247
+ insert_pos = documentclass_match.end()
248
+
249
+ packages_to_insert = []
250
+ for package in essential_packages:
251
+ package_name = package.split('{')[1].split('}')[0].split(']')[0] # Extract package name
252
+ if f'usepackage' not in content or package_name not in content:
253
+ packages_to_insert.append(package)
254
+
255
+ if packages_to_insert:
256
+ package_block = '\n% Essential packages for compilation\n' + '\n'.join(packages_to_insert) + '\n'
257
+ content = content[:insert_pos] + package_block + content[insert_pos:]
258
+
259
+ # Add Unicode character definitions to handle any remaining problematic characters
260
+ unicode_definitions = r'''
261
+ % Unicode character definitions for LaTeX compatibility
262
+ \DeclareUnicodeCharacter{2003}{ } % Em space
263
+ \DeclareUnicodeCharacter{2002}{ } % En space
264
+ \DeclareUnicodeCharacter{2009}{ } % Thin space
265
+ \DeclareUnicodeCharacter{200A}{ } % Hair space
266
+ \DeclareUnicodeCharacter{2004}{ } % Three-per-em space
267
+ \DeclareUnicodeCharacter{2005}{ } % Four-per-em space
268
+ \DeclareUnicodeCharacter{2006}{ } % Six-per-em space
269
+ \DeclareUnicodeCharacter{2008}{ } % Punctuation space
270
+ \DeclareUnicodeCharacter{202F}{ } % Narrow no-break space
271
+ \DeclareUnicodeCharacter{2212}{-} % Unicode minus sign
272
+ \DeclareUnicodeCharacter{2010}{-} % Hyphen
273
+ \DeclareUnicodeCharacter{2011}{-} % Non-breaking hyphen
274
+ \DeclareUnicodeCharacter{2013}{--} % En dash
275
+ \DeclareUnicodeCharacter{2014}{---}% Em dash
276
+ '''
277
+
278
+ # Insert Unicode definitions after packages but before \begin{document}
279
+ begin_doc_match = re.search(r'\\begin\{document\}', content)
280
+ if begin_doc_match:
281
+ insert_pos_unicode = begin_doc_match.start()
282
+ content = content[:insert_pos_unicode] + unicode_definitions + '\n' + content[insert_pos_unicode:]
283
+
284
+ return content
285
+
286
+ def _convert_unicode_math_characters(content: str) -> str:
287
+ """
288
+ Convert Unicode mathematical characters to their LaTeX equivalents.
289
+ """
290
+ # Dictionary of Unicode characters to LaTeX commands
291
+ unicode_to_latex = {
292
+ # Mathematical operators
293
+ 'Δ': r'$\Delta$', # U+0394 - Greek capital letter delta
294
+ 'δ': r'$\delta$', # U+03B4 - Greek small letter delta
295
+ '∑': r'$\sum$', # U+2211 - N-ary summation
296
+ '∏': r'$\prod$', # U+220F - N-ary product
297
+ '∫': r'$\int$', # U+222B - Integral
298
+ '∂': r'$\partial$', # U+2202 - Partial differential
299
+ '∇': r'$\nabla$', # U+2207 - Nabla
300
+ '√': r'$\sqrt{}$', # U+221A - Square root
301
+ '∞': r'$\infty$', # U+221E - Infinity
302
+
303
+ # Relations and equality
304
+ '≈': r'$\approx$', # U+2248 - Almost equal to
305
+ '≠': r'$\neq$', # U+2260 - Not equal to
306
+ '≤': r'$\leq$', # U+2264 - Less-than or equal to
307
+ '≥': r'$\geq$', # U+2265 - Greater-than or equal to
308
+ '±': r'$\pm$', # U+00B1 - Plus-minus sign
309
+ '∓': r'$\mp$', # U+2213 - Minus-or-plus sign
310
+ '×': r'$\times$', # U+00D7 - Multiplication sign
311
+ '÷': r'$\div$', # U+00F7 - Division sign
312
+ '⋅': r'$\cdot$', # U+22C5 - Dot operator
313
+
314
+ # Set theory and logic
315
+ '∈': r'$\in$', # U+2208 - Element of
316
+ '∉': r'$\notin$', # U+2209 - Not an element of
317
+ '⊂': r'$\subset$', # U+2282 - Subset of
318
+ '⊃': r'$\supset$', # U+2283 - Superset of
319
+ '⊆': r'$\subseteq$', # U+2286 - Subset of or equal to
320
+ '⊇': r'$\supseteq$', # U+2287 - Superset of or equal to
321
+ '∪': r'$\cup$', # U+222A - Union
322
+ '∩': r'$\cap$', # U+2229 - Intersection
323
+ '∅': r'$\emptyset$', # U+2205 - Empty set
324
+ '∀': r'$\forall$', # U+2200 - For all
325
+ '∃': r'$\exists$', # U+2203 - There exists
326
+
327
+ # Special symbols
328
+ '∣': r'$|$', # U+2223 - Divides
329
+ '∥': r'$\parallel$', # U+2225 - Parallel to
330
+ '⊥': r'$\perp$', # U+22A5 - Up tack (perpendicular)
331
+ '∠': r'$\angle$', # U+2220 - Angle
332
+ '°': r'$^\circ$', # U+00B0 - Degree sign
333
+
334
+ # Arrows
335
+ '→': r'$\rightarrow$', # U+2192 - Rightwards arrow
336
+ '←': r'$\leftarrow$', # U+2190 - Leftwards arrow
337
+ '↔': r'$\leftrightarrow$', # U+2194 - Left right arrow
338
+ '⇒': r'$\Rightarrow$', # U+21D2 - Rightwards double arrow
339
+ '⇐': r'$\Leftarrow$', # U+21D0 - Leftwards double arrow
340
+ '⇔': r'$\Leftrightarrow$', # U+21D4 - Left right double arrow
341
+
342
+ # Accents and diacritics
343
+ 'ˉ': r'$\bar{}$', # U+02C9 - Modifier letter macron
344
+ 'ˆ': r'$\hat{}$', # U+02C6 - Modifier letter circumflex accent
345
+ 'ˇ': r'$\check{}$', # U+02C7 - Caron
346
+ '˜': r'$\tilde{}$', # U+02DC - Small tilde
347
+ '˙': r'$\dot{}$', # U+02D9 - Dot above
348
+ '¨': r'$\ddot{}$', # U+00A8 - Diaeresis
349
+
350
+ # Special minus and spaces - using explicit Unicode escape sequences
351
+ '−': r'-', # U+2212 - Minus sign (convert to regular hyphen)
352
+ '\u2003': r' ', # U+2003 - Em space (convert to regular space)
353
+ '\u2009': r' ', # U+2009 - Thin space (convert to regular space)
354
+ '\u2002': r' ', # U+2002 - En space (convert to regular space)
355
+ '\u2004': r' ', # U+2004 - Three-per-em space
356
+ '\u2005': r' ', # U+2005 - Four-per-em space
357
+ '\u2006': r' ', # U+2006 - Six-per-em space
358
+ '\u2008': r' ', # U+2008 - Punctuation space
359
+ '\u200A': r' ', # U+200A - Hair space
360
+ '\u202F': r' ', # U+202F - Narrow no-break space
361
+
362
+ # Greek letters (commonly used in math)
363
+ 'α': r'$\alpha$', # U+03B1
364
+ 'β': r'$\beta$', # U+03B2
365
+ 'γ': r'$\gamma$', # U+03B3
366
+ 'Γ': r'$\Gamma$', # U+0393
367
+ 'ε': r'$\varepsilon$', # U+03B5
368
+ 'ζ': r'$\zeta$', # U+03B6
369
+ 'η': r'$\eta$', # U+03B7
370
+ 'θ': r'$\theta$', # U+03B8
371
+ 'Θ': r'$\Theta$', # U+0398
372
+ 'ι': r'$\iota$', # U+03B9
373
+ 'κ': r'$\kappa$', # U+03BA
374
+ 'λ': r'$\lambda$', # U+03BB
375
+ 'Λ': r'$\Lambda$', # U+039B
376
+ 'μ': r'$\mu$', # U+03BC
377
+ 'ν': r'$\nu$', # U+03BD
378
+ 'ξ': r'$\xi$', # U+03BE
379
+ 'Ξ': r'$\Xi$', # U+039E
380
+ 'π': r'$\pi$', # U+03C0
381
+ 'Π': r'$\Pi$', # U+03A0
382
+ 'ρ': r'$\rho$', # U+03C1
383
+ 'σ': r'$\sigma$', # U+03C3
384
+ 'Σ': r'$\Sigma$', # U+03A3
385
+ 'τ': r'$\tau$', # U+03C4
386
+ 'υ': r'$\upsilon$', # U+03C5
387
+ 'Υ': r'$\Upsilon$', # U+03A5
388
+ 'φ': r'$\varphi$', # U+03C6
389
+ 'Φ': r'$\Phi$', # U+03A6
390
+ 'χ': r'$\chi$', # U+03C7
391
+ 'ψ': r'$\psi$', # U+03C8
392
+ 'Ψ': r'$\Psi$', # U+03A8
393
+ 'ω': r'$\omega$', # U+03C9
394
+ 'Ω': r'$\Omega$', # U+03A9
395
+ }
396
+
397
+ # Apply conversions
398
+ for unicode_char, latex_cmd in unicode_to_latex.items():
399
+ if unicode_char in content:
400
+ content = content.replace(unicode_char, latex_cmd)
401
+
402
+ # Additional aggressive Unicode space cleanup using regex
403
+ # Handle various Unicode spaces more comprehensively
404
+ content = re.sub(r'[\u2000-\u200F\u2028-\u202F\u205F\u3000]', ' ', content) # All Unicode spaces
405
+
406
+ # Handle specific problematic Unicode characters that might not be in our dictionary
407
+ content = re.sub(r'[\u2010-\u2015]', '-', content) # Various Unicode dashes
408
+ content = re.sub(r'[\u2212]', '-', content) # Unicode minus sign
409
+
410
+ # Handle specific cases where characters might appear in math environments
411
+ # Fix double math mode (e.g., $\alpha$ inside already math mode)
412
+ content = re.sub(r'\$\$([^$]+)\$\$', r'$\1$', content) # Convert display math to inline
413
+ content = re.sub(r'\$\$([^$]*)\$([^$]*)\$\$', r'$\1\2$', content) # Fix broken math
414
+
415
+ # Fix bar notation that might have been broken
416
+ content = re.sub(r'\$\\bar\{\}\$([a-zA-Z])', r'$\\bar{\1}$', content)
417
+ content = re.sub(r'([a-zA-Z])\$\\bar\{\}\$', r'$\\bar{\1}$', content)
418
+
419
+ return content
420
+
421
+ def _additional_unicode_cleanup(content: str) -> str:
422
+ """
423
+ Additional aggressive Unicode cleanup to handle any characters that slip through.
424
+ """
425
+ # Convert all common problematic Unicode spaces to regular spaces
426
+ # This covers a wider range than the dictionary approach
427
+ unicode_spaces = [
428
+ '\u00A0', # Non-breaking space
429
+ '\u1680', # Ogham space mark
430
+ '\u2000', # En quad
431
+ '\u2001', # Em quad
432
+ '\u2002', # En space
433
+ '\u2003', # Em space
434
+ '\u2004', # Three-per-em space
435
+ '\u2005', # Four-per-em space
436
+ '\u2006', # Six-per-em space
437
+ '\u2007', # Figure space
438
+ '\u2008', # Punctuation space
439
+ '\u2009', # Thin space
440
+ '\u200A', # Hair space
441
+ '\u200B', # Zero width space
442
+ '\u202F', # Narrow no-break space
443
+ '\u205F', # Medium mathematical space
444
+ '\u3000', # Ideographic space
445
+ ]
446
+
447
+ for unicode_space in unicode_spaces:
448
+ content = content.replace(unicode_space, ' ')
449
+
450
+ # Convert Unicode dashes
451
+ unicode_dashes = [
452
+ '\u2010', # Hyphen
453
+ '\u2011', # Non-breaking hyphen
454
+ '\u2012', # Figure dash
455
+ '\u2013', # En dash
456
+ '\u2014', # Em dash
457
+ '\u2015', # Horizontal bar
458
+ '\u2212', # Minus sign
459
+ ]
460
+
461
+ for unicode_dash in unicode_dashes:
462
+ if unicode_dash in ['\u2013', '\u2014']: # En and Em dashes
463
+ content = content.replace(unicode_dash, '--')
464
+ else:
465
+ content = content.replace(unicode_dash, '-')
466
+
467
+ # Use regex for any remaining problematic characters
468
+ # Remove or replace any remaining Unicode characters that commonly cause issues
469
+ content = re.sub(r'[\u2000-\u200F\u2028-\u202F\u205F\u3000]', ' ', content)
470
+ content = re.sub(r'[\u2010-\u2015\u2212]', '-', content)
471
+
472
+ return content
473
+
474
+ def _fix_mixed_mathematical_expressions(content: str) -> str:
475
+ """
476
+ Removes duplicated plain-text versions of mathematical expressions
477
+ that Pandoc sometimes generates alongside the LaTeX version by deleting
478
+ the plain text part when it is immediately followed by the LaTeX part.
479
+ """
480
+
481
+ processed_content = content
482
+
483
+ # A list of compiled regex patterns.
484
+ # Each pattern matches a plain-text formula but only if it's followed
485
+ # by its corresponding LaTeX version (using a positive lookahead).
486
+ patterns_to_remove = [
487
+ # Pattern for: hq,k=x[nq,k]...h_{q,k} = x[n_{q,k}]...
488
+ re.compile(r'h[qrs],k=x\[n[qrs],k\](?:,h[qrs],k=x\[n[qrs],k\])*\s*' +
489
+ r'(?=h_{q,k}\s*=\s*x\\\[n_{q,k}\\\],)', re.UNICODE),
490
+
491
+ # Pattern for: ∆hq,r,k=hq,k-hr,k...\Delta h_{q,r,k} = ...
492
+ re.compile(r'(?:∆h[qrs],[qrs],k=h[qrs],k-h[qrs],k\s*)+' +
493
+ r'(?=\\Delta\s*h_{q,r,k})', re.UNICODE),
494
+
495
+ # Pattern for: RRk=tr,k+1-tr,kRR_k = ...
496
+ re.compile(r'RRk=tr,k\+1-tr,k\s*' +
497
+ r'(?=RR_k\s*=\s*t_{r,k\+1})', re.UNICODE),
498
+
499
+ # Pattern for: Tmed=median{RRk}T_{\mathrm{med}}
500
+ re.compile(r'Tmed=median\{RRk\}\s*' +
501
+ r'(?=T_{\\mathrm{med}}\s*=\s*\\mathrm{median}\\{RR_k\\})', re.UNICODE),
502
+
503
+ # Pattern for: Tk=[tr,k-Tmed2, tr,k+Tmed2]\mathcal{T}_k
504
+ re.compile(r'Tk=\[tr,k-Tmed2,.*?tr,k\+Tmed2\]\s*' +
505
+ r'(?=\\mathcal\{T\}_k\s*=\s*\\\[t_{r,k})', re.UNICODE | re.DOTALL),
506
+
507
+ # Pattern for: h¯k=1|Ik|∑n∈Ikx[n]\bar h_k
508
+ re.compile(r'h¯k=1\|Ik\|∑n∈Ikx\[n\]\s*' +
509
+ r'(?=\\bar\s*h_k\s*=\s*\\frac)', re.UNICODE),
510
+
511
+ # Pattern for: Mrs=median{∆hr,s,k}M_{rs}
512
+ re.compile(r'Mrs=median\{∆hr,s,k\}\s*' +
513
+ r'(?=M_{rs}\s*=\s*\\mathrm{median})', re.UNICODE),
514
+
515
+ # Pattern for: ∆h¯k=h¯k-Mrs\Delta\bar h_k
516
+ re.compile(r'∆h¯k=h¯k-Mrs\s*' +
517
+ r'(?=\\Delta\\bar\s*h_k\s*=\s*\\bar\s*h_k)', re.UNICODE),
518
+ ]
519
+
520
+ for pattern in patterns_to_remove:
521
+ processed_content = pattern.sub('', processed_content)
522
+
523
+ return processed_content
524
+
525
+ def _fix_compilation_issues(content: str) -> str:
526
+ """
527
+ Fix common LaTeX compilation issues.
528
+ """
529
+ # Fix \tightlist command if not defined
530
+ if r'\tightlist' in content and r'\providecommand{\tightlist}' not in content:
531
+ tightlist_def = r'''
532
+ % Define \tightlist command for lists
533
+ \providecommand{\tightlist}{%
534
+ \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
535
+ '''
536
+ # Insert after packages but before \begin{document}
537
+ begin_doc_match = re.search(r'\\begin\{document\}', content)
538
+ if begin_doc_match:
539
+ insert_pos = begin_doc_match.start()
540
+ content = content[:insert_pos] + tightlist_def + '\n' + content[insert_pos:]
541
+
542
+ # Fix \euro command if used but not defined
543
+ if r'\euro' in content and r'usepackage{eurosym}' not in content:
544
+ content = re.sub(
545
+ r'(\\usepackage\{[^}]+\}\s*\n)',
546
+ r'\1\\usepackage{eurosym}\n',
547
+ content,
548
+ count=1
549
+ )
550
+
551
+ # Fix undefined references to figures/tables
552
+ content = re.sub(r'\\ref\{fig:([^}]+)\}', r'Figure~\\ref{fig:\1}', content)
553
+ content = re.sub(r'\\ref\{tab:([^}]+)\}', r'Table~\\ref{tab:\1}', content)
554
+
555
+ # Ensure proper figure placement
556
+ if r'\begin{figure}' in content:
557
+ content = re.sub(
558
+ r'\\begin\{figure\}(?!\[)',
559
+ r'\\begin{figure}[htbp]',
560
+ content
561
+ )
562
+
563
+ # Ensure proper table placement
564
+ if r'\begin{table}' in content:
565
+ content = re.sub(
566
+ r'\\begin\{table\}(?!\[)',
567
+ r'\\begin{table}[htbp]',
568
+ content
569
+ )
570
+
571
+ return content
572
+
573
+ def _fix_image_paths_for_overleaf(content: str, extract_media_to_path: str = None) -> str:
574
+ """
575
+ Convert absolute image paths to relative paths for Overleaf compatibility.
576
+ """
577
+ if extract_media_to_path:
578
+ # Extract the media directory name
579
+ media_dir = os.path.basename(extract_media_to_path.rstrip('/'))
580
+
581
+ # Fix paths with task IDs like: task_id_media/media/image.png -> media/image.png
582
+ # Pattern: \includegraphics{any_path/task_id_media/media/image.ext}
583
+ # Replace with: \includegraphics{media/image.ext}
584
+ pattern1 = r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[a-f0-9\-]+_media[/\\]media[/\\]([^{}]+)\}'
585
+ replacement1 = r'\\includegraphics\1{media/\2}'
586
+ content = re.sub(pattern1, replacement1, content)
587
+
588
+ # Fix paths like: task_id_media/media/image.png -> media/image.png (without includegraphics)
589
+ pattern2 = r'[a-f0-9\-]+_media[/\\]media[/\\]'
590
+ replacement2 = r'media/'
591
+ content = re.sub(pattern2, replacement2, content)
592
+
593
+ # Also handle regular media paths: /absolute/path/to/media/image.ext -> media/image.ext
594
+ pattern3 = r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[/\\]' + re.escape(media_dir) + r'[/\\]([^{}]+)\}'
595
+ replacement3 = r'\\includegraphics\1{' + media_dir + r'/\2}'
596
+ content = re.sub(pattern3, replacement3, content)
597
+
598
+ return content
599
+
600
+ def _remove_unwanted_formatting(content: str) -> str:
601
+ """
602
+ Remove unwanted highlighting and formatting that causes visual issues.
603
+ """
604
+ # Remove highlighting commands
605
+ content = re.sub(r'\\colorbox\{[^}]*\}\{([^}]*)\}', r'\1', content)
606
+ content = re.sub(r'\\hl\{([^}]*)\}', r'\1', content)
607
+ content = re.sub(r'\\texthl\{([^}]*)\}', r'\1', content)
608
+ content = re.sub(r'\\hlc\[[^\]]*\]\{([^}]*)\}', r'\1', content)
609
+
610
+ # Remove table cell coloring
611
+ content = re.sub(r'\\cellcolor\{[^}]*\}', '', content)
612
+ content = re.sub(r'\\rowcolor\{[^}]*\}', '', content)
613
+ content = re.sub(r'\\columncolor\{[^}]*\}', '', content)
614
+
615
+ # Remove text background colors
616
+ content = re.sub(r'\\textcolor\{[^}]*\}\{([^}]*)\}', r'\1', content)
617
+ content = re.sub(r'\\color\{[^}]*\}', '', content)
618
+
619
+ # Remove box formatting that might cause highlighting
620
+ content = re.sub(r'\\fcolorbox\{[^}]*\}\{[^}]*\}\{([^}]*)\}', r'\1', content)
621
+ content = re.sub(r'\\framebox\[[^\]]*\]\{([^}]*)\}', r'\1', content)
622
+
623
+ # Remove soul package highlighting
624
+ content = re.sub(r'\\sethlcolor\{[^}]*\}', '', content)
625
+ content = re.sub(r'\\ul\{([^}]*)\}', r'\1', content) # Remove underline if causing issues
626
+
627
+ return content
628
+
629
+ def _inject_latex_packages(content: str) -> str:
630
+ """
631
+ Inject additional LaTeX packages needed for enhanced formatting.
632
+ """
633
+ # Essential packages for enhanced conversion
634
+ essential_packages = [
635
+ r'\usepackage{graphicx}', # For images - ensure it's included
636
+ r'\usepackage{longtable}', # For tables
637
+ r'\usepackage{booktabs}', # Better table formatting
638
+ r'\usepackage{array}', # Enhanced table formatting
639
+ r'\usepackage{calc}', # For calculations
640
+ r'\usepackage{url}', # For URLs
641
+ ]
642
+
643
+ # Style enhancement packages
644
+ style_packages = [
645
+ r'\usepackage{float}', # Better float positioning
646
+ r'\usepackage{adjustbox}', # For centering and scaling
647
+ r'\usepackage{caption}', # Better caption formatting
648
+ r'\usepackage{subcaption}', # For subfigures
649
+ r'\usepackage{tabularx}', # Flexible table widths
650
+ r'\usepackage{enumitem}', # Better list formatting
651
+ r'\usepackage{setspace}', # Line spacing control
652
+ r'\usepackage{ragged2e}', # Better text alignment
653
+ r'\usepackage{amsmath}', # Mathematical formatting
654
+ r'\usepackage{amssymb}', # Mathematical symbols
655
+ r'\usepackage{needspace}', # Prevent orphaned lines and improve page breaks
656
+ ]
657
+
658
+ all_packages = essential_packages + style_packages
659
+
660
+ # Find the position after \documentclass but before any existing \usepackage or \begin{document}
661
+ documentclass_pattern = r'\\documentclass(?:\[[^\]]*\])?\{[^}]+\}'
662
+ documentclass_match = re.search(documentclass_pattern, content)
663
+
664
+ if documentclass_match:
665
+ insert_pos = documentclass_match.end()
666
+
667
+ # Find the next significant LaTeX command to insert before it
668
+ # Look for existing \usepackage, \begin{document}, or other commands
669
+ remaining_content = content[insert_pos:]
670
+ next_command_match = re.search(r'\\(?:usepackage|begin\{document\}|title|author|date)', remaining_content)
671
+
672
+ if next_command_match:
673
+ insert_pos += next_command_match.start()
674
+
675
+ # Check which packages are not already included
676
+ packages_to_insert = []
677
+ for package in all_packages:
678
+ package_name = package.replace(r'\usepackage{', '').replace('}', '')
679
+ if f'usepackage{{{package_name}}}' not in content:
680
+ packages_to_insert.append(package)
681
+
682
+ if packages_to_insert:
683
+ # Add packages with proper spacing
684
+ package_block = '\n% Enhanced conversion packages\n' + '\n'.join(packages_to_insert) + '\n\n'
685
+ content = content[:insert_pos] + package_block + content[insert_pos:]
686
+
687
+ return content
688
+
689
+ def _add_centering_commands(content: str) -> str:
690
+ """
691
+ Add centering commands to figures and tables.
692
+ """
693
+ # Add \centering to figure environments
694
+ content = re.sub(
695
+ r'(\\begin\{figure\}(?:\[[^\]]*\])?)\s*\n',
696
+ r'\1\n\\centering\n',
697
+ content
698
+ )
699
+
700
+ # Add \centering to table environments
701
+ content = re.sub(
702
+ r'(\\begin\{table\}(?:\[[^\]]*\])?)\s*\n',
703
+ r'\1\n\\centering\n',
704
+ content
705
+ )
706
+
707
+ return content
708
+
709
+ def _fix_line_breaks_and_spacing(content: str) -> str:
710
+ """
711
+ Minimal fixes to preserve Word's original formatting and pagination.
712
+ """
713
+ # Remove unwanted highlighting and color commands
714
+ content = re.sub(r'\\colorbox\{[^}]*\}\{([^}]*)\}', r'\1', content)
715
+ content = re.sub(r'\\hl\{([^}]*)\}', r'\1', content)
716
+ content = re.sub(r'\\texthl\{([^}]*)\}', r'\1', content)
717
+ content = re.sub(r'\\cellcolor\{[^}]*\}', '', content)
718
+ content = re.sub(r'\\rowcolor\{[^}]*\}', '', content)
719
+
720
+ # Only fix critical spacing issues that break compilation
721
+ # Preserve Word's original line breaks and spacing as much as possible
722
+
723
+ # Ensure proper spacing around lists but don't change internal spacing
724
+ content = re.sub(r'\n\\begin\{enumerate\}\n\n', r'\n\n\\begin{enumerate}\n', content)
725
+ content = re.sub(r'\n\n\\end\{enumerate\}\n', r'\n\\end{enumerate}\n\n', content)
726
+ content = re.sub(r'\n\\begin\{itemize\}\n\n', r'\n\n\\begin{itemize}\n', content)
727
+ content = re.sub(r'\n\n\\end\{itemize\}\n', r'\n\\end{itemize}\n\n', content)
728
+
729
+ # Minimal section spacing - preserve Word's pagination
730
+ content = re.sub(r'\n(\\(?:sub)*section\{[^}]+\})\n\n', r'\n\n\1\n\n', content)
731
+
732
+ # Only remove excessive spacing (3+ line breaks) but preserve double breaks
733
+ content = re.sub(r'\n\n\n+', r'\n\n', content)
734
+
735
+ # Ensure proper spacing around figures and tables
736
+ content = re.sub(r'\n\\begin\{figure\}', r'\n\n\\begin{figure}', content)
737
+ content = re.sub(r'\\end\{figure\}\n([A-Z])', r'\\end{figure}\n\n\1', content)
738
+ content = re.sub(r'\n\\begin\{table\}', r'\n\n\\begin{table}', content)
739
+ content = re.sub(r'\\end\{table\}\n([A-Z])', r'\\end{table}\n\n\1', content)
740
+
741
+ return content
742
+
743
+ if __name__ == '__main__':
744
+ from docx import Document
745
+ from docx.shared import Inches
746
+ from PIL import Image
747
+ import shutil
748
+
749
+ # --- Helper Functions for DOCX and Template Creation ---
750
+ def create_dummy_image(filename, size=(60, 60), color="red", img_format="PNG"):
751
+ img = Image.new('RGB', size, color=color)
752
+ img.save(filename, img_format)
753
+ print(f"Created dummy image: {filename}")
754
+
755
+ def create_test_docx_with_styles(filename):
756
+ doc = Document()
757
+ doc.add_heading("Document with Enhanced Features", level=1)
758
+
759
+ # Add paragraph with text
760
+ p1 = doc.add_paragraph("This document tests enhanced features including:")
761
+
762
+ # Add numbered list
763
+ doc.add_paragraph("First numbered item", style='List Number')
764
+ doc.add_paragraph("Second numbered item", style='List Number')
765
+ doc.add_paragraph("Third numbered item", style='List Number')
766
+
767
+ # Add some text
768
+ doc.add_paragraph("Here is some regular text between lists.")
769
+
770
+ # Add bullet list
771
+ doc.add_paragraph("First bullet point", style='List Bullet')
772
+ doc.add_paragraph("Second bullet point", style='List Bullet')
773
+
774
+ doc.add_heading("Image Section", level=2)
775
+ doc.add_paragraph("Below is a test image:")
776
+
777
+ doc.save(filename)
778
+ print(f"Created test DOCX with styles: {filename}")
779
+
780
+ def create_complex_docx(filename, img1_path, img2_path):
781
+ doc = Document()
782
+ doc.add_heading("Complex Document Title", level=1)
783
+ doc.add_paragraph("Introduction to the complex document.")
784
+ doc.add_heading("Image Section", level=2)
785
+ doc.add_picture(img1_path, width=Inches(1.0))
786
+ doc.add_paragraph("Some text after the first image.")
787
+ doc.add_picture(img2_path, width=Inches(1.0))
788
+ doc.add_heading("Conclusion Section", level=2)
789
+ doc.add_paragraph("Final remarks.")
790
+ doc.save(filename)
791
+ print(f"Created complex DOCX: {filename}")
792
+
793
+ # --- Test Files ---
794
+ docx_styles = "test_enhanced_styles.docx"
795
+ docx_complex = "test_complex_enhanced.docx"
796
+ img1 = "dummy_img1.png"
797
+ img2 = "dummy_img2.jpg"
798
+
799
+ output_enhanced_test = "output_enhanced_test.tex"
800
+ output_overleaf_test = "output_overleaf_test.tex"
801
+ media_dir = "./media_enhanced"
802
+
803
+ all_test_files = [docx_styles, docx_complex, img1, img2, output_enhanced_test, output_overleaf_test]
804
+ all_test_dirs = [media_dir]
805
+
806
+ # --- Create Test Files ---
807
+ print("--- Setting up enhanced test files ---")
808
+ create_dummy_image(img1, color="blue", img_format="PNG")
809
+ create_dummy_image(img2, color="green", img_format="JPEG")
810
+ create_test_docx_with_styles(docx_styles)
811
+ create_complex_docx(docx_complex, img1, img2)
812
+ print("--- Enhanced test file setup complete ---")
813
+
814
+ # --- Test Enhanced Features ---
815
+ print("\n--- Testing Enhanced Features ---")
816
+
817
+ # Test 1: Style preservation and line breaks
818
+ print("\n--- Test 1: Enhanced Style Preservation ---")
819
+ success, msg = convert_docx_to_latex(
820
+ docx_styles,
821
+ output_enhanced_test,
822
+ generate_toc=True,
823
+ preserve_styles=True,
824
+ preserve_linebreaks=True
825
+ )
826
+ print(f"Enhanced Test: {success}, Msg: {msg}")
827
+
828
+ if success and os.path.exists(output_enhanced_test):
829
+ with open(output_enhanced_test, 'r') as f:
830
+ content = f.read()
831
+ checks = {
832
+ 'packages': any(pkg in content for pkg in ['\\usepackage{float}', '\\usepackage{enumitem}']),
833
+ 'toc': '\\tableofcontents' in content,
834
+ 'sections': '\\section' in content,
835
+ 'lists': '\\begin{enumerate}' in content or '\\begin{itemize}' in content
836
+ }
837
+ print(f"Enhanced verification: {checks}")
838
+
839
+ # Test 2: Overleaf compatibility with images
840
+ print("\n--- Test 2: Overleaf Compatibility ---")
841
+ success, msg = convert_docx_to_latex(
842
+ docx_complex,
843
+ output_overleaf_test,
844
+ extract_media_to_path=media_dir,
845
+ overleaf_compatible=True,
846
+ preserve_styles=True,
847
+ preserve_linebreaks=True
848
+ )
849
+ print(f"Overleaf Test: {success}, Msg: {msg}")
850
+
851
+ if success and os.path.exists(output_overleaf_test):
852
+ with open(output_overleaf_test, 'r') as f:
853
+ content = f.read()
854
+ media_check = 'media/' in content and '\\includegraphics' in content
855
+ print(f"Overleaf compatibility check - relative paths: {media_check}")
856
+
857
+ media_files_exist = os.path.exists(os.path.join(media_dir, 'media'))
858
+ print(f"Media files extracted: {media_files_exist}")
859
+
860
+ # --- Cleanup ---
861
+ print("\n--- Cleaning up enhanced test files ---")
862
+ for f_path in all_test_files:
863
+ if os.path.exists(f_path):
864
+ try:
865
+ os.remove(f_path)
866
+ print(f"Removed: {f_path}")
867
+ except Exception as e:
868
+ print(f"Error removing {f_path}: {e}")
869
+
870
+ for d_path in all_test_dirs:
871
+ if os.path.isdir(d_path):
872
+ try:
873
+ shutil.rmtree(d_path)
874
+ print(f"Removed directory: {d_path}")
875
+ except Exception as e:
876
+ print(f"Error removing {d_path}: {e}")
877
+
878
+ print("--- Enhanced testing completed ---")
preserve_linebreaks.lua ADDED
@@ -0,0 +1,29 @@
+
+ -- preserve_linebreaks.lua
+ -- Filter for better preservation of line breaks and paragraph structure
+
+ function LineBreak(el)
+     return pandoc.RawInline("latex", "\\\\")
+ end
+
+ function SoftBreak(el)
+     return pandoc.RawInline("latex", " ")
+ end
+
+ function Para(el)
+     -- Add proper spacing for numbered lists and paragraph breaks
+     if #el.content > 0 then
+         return pandoc.Para(el.content)
+     end
+ end
+
+ -- Improve list formatting
+ function OrderedList(el)
+     -- Ensure proper spacing in numbered lists
+     return el
+ end
+
+ function BulletList(el)
+     -- Ensure proper spacing in bullet lists
+     return el
+ end
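This standalone filter mirrors the inline filter that converter.py writes to a temporary file, and it can be passed to Pandoc directly. A sketch via pypandoc, assuming Pandoc is installed and a document.docx is present (file names are placeholders):

```python
import pypandoc

# Apply the standalone filter during conversion.
pypandoc.convert_file(
    "document.docx",
    "latex",
    outputfile="document.tex",
    extra_args=["--standalone", "--lua-filter=preserve_linebreaks.lua"],
)
```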
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ flask==2.3.3
+ flask-cors==4.0.0
+ pypandoc==1.13
+ python-docx==0.8.11
+ Pillow==10.0.0
+ werkzeug==2.3.7
+ gunicorn==21.2.0
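After `pip install -r requirements.txt`, a quick way to confirm the pinned distributions resolved is to print their installed versions; a small sketch using the standard library:

```python
from importlib.metadata import version

# Distribution names as published on PyPI (matching the pins above).
for dist in ["Flask", "Flask-Cors", "pypandoc", "python-docx", "Pillow", "Werkzeug", "gunicorn"]:
    print(f"{dist}=={version(dist)}")
```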
temp/.DS_Store ADDED
Binary file (6.15 kB)
 
web_api.py ADDED
@@ -0,0 +1,443 @@
1
+ from flask import Flask, request, jsonify, send_file
2
+ from flask_cors import CORS
3
+ import os
4
+ import tempfile
5
+ import uuid
6
+ from werkzeug.utils import secure_filename
7
+ from converter import convert_docx_to_latex
8
+ import shutil
9
+
10
+ app = Flask(__name__)
11
+ CORS(app) # Enable CORS for all routes
12
+
13
+ # Configuration
14
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max file size
15
+ UPLOAD_FOLDER = 'temp/uploads'
16
+ OUTPUT_FOLDER = 'temp/outputs'
17
+
18
+ # Ensure directories exist
19
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
20
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
21
+
22
+ # Store conversion tasks
23
+ conversion_tasks = {}
24
+
25
+ @app.route('/api/health', methods=['GET'])
26
+ def health_check():
27
+ """Health check endpoint"""
28
+ return jsonify({'status': 'healthy', 'message': 'DOCX to LaTeX API is running'})
29
+
30
+ @app.route('/api/upload', methods=['POST'])
31
+ def upload_file():
32
+ """Handle file upload"""
33
+ try:
34
+ if 'file' not in request.files:
35
+ return jsonify({'error': 'No file provided'}), 400
36
+
37
+ file = request.files['file']
38
+ if file.filename == '':
39
+ return jsonify({'error': 'No file selected'}), 400
40
+
41
+ if not file.filename.lower().endswith('.docx'):
42
+ return jsonify({'error': 'Only DOCX files are allowed'}), 400
43
+
44
+ # Generate unique task ID
45
+ task_id = str(uuid.uuid4())
46
+
47
+ # Save uploaded file
48
+ filename = secure_filename(file.filename)
49
+ file_path = os.path.join(UPLOAD_FOLDER, f"{task_id}_{filename}")
50
+ file.save(file_path)
51
+
52
+ # Store task info
53
+ conversion_tasks[task_id] = {
54
+ 'status': 'uploaded',
55
+ 'original_filename': filename,
56
+ 'file_path': file_path,
57
+ 'output_filename': filename.replace('.docx', '.tex'),
58
+ 'created_at': os.path.getctime(file_path)
59
+ }
60
+
61
+ return jsonify({
62
+ 'task_id': task_id,
63
+ 'filename': filename,
64
+ 'status': 'uploaded',
65
+ 'message': 'File uploaded successfully'
66
+ })
67
+
68
+ except Exception as e:
69
+ return jsonify({'error': f'Upload failed: {str(e)}'}), 500
70
+
71
+ @app.route('/api/convert', methods=['POST'])
72
+ def convert_document():
73
+ """Convert DOCX to LaTeX"""
74
+ try:
75
+ data = request.get_json()
76
+
77
+ if not data or 'task_id' not in data:
78
+ return jsonify({'error': 'Task ID is required'}), 400
79
+
80
+ task_id = data['task_id']
81
+
82
+ if task_id not in conversion_tasks:
83
+ return jsonify({'error': 'Invalid task ID'}), 404
84
+
85
+ task = conversion_tasks[task_id]
86
+
87
+ if task['status'] != 'uploaded':
88
+ return jsonify({'error': 'Task is not ready for conversion'}), 400
89
+
90
+ # Get conversion options
91
+ options = data.get('options', {})
92
+ output_filename = data.get('output_filename', task['output_filename'])
93
+
94
+ # Update task status
95
+ task['status'] = 'converting'
96
+ task['output_filename'] = output_filename
97
+
98
+ # Prepare output paths
99
+ output_path = os.path.join(OUTPUT_FOLDER, f"{task_id}_{output_filename}")
100
+ media_path = os.path.join(OUTPUT_FOLDER, f"{task_id}_media")
101
+
102
+ # Perform conversion
103
+ success, message = convert_docx_to_latex(
104
+ docx_path=task['file_path'],
105
+ latex_path=output_path,
106
+ generate_toc=options.get('generateToc', False),
107
+ extract_media_to_path=media_path if options.get('extractMedia', True) else None,
108
+ latex_template_path=None, # Could be added later for custom templates
109
+ overleaf_compatible=options.get('overleafCompatible', True),
110
+ preserve_styles=options.get('preserveStyles', True),
111
+ preserve_linebreaks=options.get('preserveLineBreaks', True)
112
+ )
113
+
114
+ if success:
115
+ task['status'] = 'completed'
116
+ task['output_path'] = output_path
117
+ task['media_path'] = media_path if os.path.exists(media_path) else None
118
+ task['conversion_message'] = message
119
+
120
+ return jsonify({
121
+ 'task_id': task_id,
122
+ 'status': 'completed',
123
+ 'message': message,
124
+ 'output_filename': output_filename,
125
+ 'has_media': os.path.exists(media_path)
126
+ })
127
+ else:
128
+ task['status'] = 'failed'
129
+ task['error_message'] = message
130
+
131
+ return jsonify({
132
+ 'task_id': task_id,
133
+ 'status': 'failed',
134
+ 'error': message
135
+ }), 500
136
+
137
+ except Exception as e:
138
+ # Update task status if possible
139
+ if 'task_id' in locals() and task_id in conversion_tasks:
140
+ conversion_tasks[task_id]['status'] = 'failed'
141
+ conversion_tasks[task_id]['error_message'] = str(e)
142
+
143
+ return jsonify({'error': f'Conversion failed: {str(e)}'}), 500
144
+
145
+ @app.route('/api/download/<task_id>', methods=['GET'])
146
+ def download_file(task_id):
147
+ """Download converted LaTeX file"""
148
+ try:
149
+ if task_id not in conversion_tasks:
150
+ return jsonify({'error': 'Invalid task ID'}), 404
151
+
152
+ task = conversion_tasks[task_id]
153
+
154
+ if task['status'] != 'completed':
155
+ return jsonify({'error': 'Conversion not completed'}), 400
156
+
157
+ if not os.path.exists(task['output_path']):
158
+ return jsonify({'error': 'Output file not found'}), 404
159
+
160
+ return send_file(
161
+ task['output_path'],
162
+ as_attachment=True,
163
+ download_name=task['output_filename'],
164
+ mimetype='text/plain'
165
+ )
166
+
167
+ except Exception as e:
168
+ return jsonify({'error': f'Download failed: {str(e)}'}), 500
169
+
170
+ @app.route('/api/download-media/<task_id>', methods=['GET'])
171
+ def download_media(task_id):
172
+ """Download media files as a ZIP archive"""
173
+ try:
174
+ if task_id not in conversion_tasks:
175
+ return jsonify({'error': 'Invalid task ID'}), 404
176
+
177
+ task = conversion_tasks[task_id]
178
+
179
+ if task['status'] != 'completed':
180
+ return jsonify({'error': 'Conversion not completed'}), 400
181
+
182
+ if not task.get('media_path') or not os.path.exists(task['media_path']):
183
+ return jsonify({'error': 'No media files found'}), 404
184
+
185
+ # Create a ZIP file of the media directory
186
+ zip_path = task['media_path'] + '.zip'
187
+ shutil.make_archive(task['media_path'], 'zip', task['media_path'])
188
+
189
+ return send_file(
190
+ zip_path,
191
+ as_attachment=True,
192
+ download_name=f"{task['output_filename'].replace('.tex', '')}_media.zip",
193
+ mimetype='application/zip'
194
+ )
195
+
196
+ except Exception as e:
197
+ return jsonify({'error': f'Media download failed: {str(e)}'}), 500
198
+
199
+ @app.route('/api/download-complete/<task_id>', methods=['GET'])
200
+ def download_complete_package(task_id):
201
+ """Download complete package (LaTeX + media) as a ZIP archive"""
202
+ try:
203
+ if task_id not in conversion_tasks:
204
+ return jsonify({'error': 'Invalid task ID'}), 404
205
+
206
+ task = conversion_tasks[task_id]
207
+
208
+ if task['status'] != 'completed':
209
+ return jsonify({'error': 'Conversion not completed'}), 400
210
+
211
+ if not os.path.exists(task['output_path']):
212
+ return jsonify({'error': 'Output file not found'}), 404
213
+
214
+ # Create a temporary directory for the complete package
215
+ import tempfile
216
+ base_name = task['output_filename'].replace('.tex', '')
217
+
218
+ with tempfile.TemporaryDirectory() as temp_dir:
219
+ package_dir = os.path.join(temp_dir, base_name)
220
+ os.makedirs(package_dir, exist_ok=True)
221
+
222
+ # Copy and fix LaTeX file for Overleaf compatibility
223
+ latex_dest = os.path.join(package_dir, task['output_filename'])
224
+
225
+ # Read the original LaTeX file
226
+ with open(task['output_path'], 'r', encoding='utf-8') as f:
227
+ latex_content = f.read()
228
+
229
+ # Fix image paths to use relative paths suitable for Overleaf
230
+ # Convert paths like: task_id_media/media/image.png -> media/image.png
231
+ import re
232
+
233
+ # Fix paths with task IDs
234
+ latex_content = re.sub(
235
+ r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[a-f0-9\-]+_media[/\\]media[/\\]([^{}]+)\}',
236
+ r'\\includegraphics\1{media/\2}',
237
+ latex_content
238
+ )
239
+
240
+ # Fix any remaining absolute paths
241
+ latex_content = re.sub(
242
+ r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[/\\]media[/\\]([^{}]+)\}',
243
+ r'\\includegraphics\1{media/\2}',
244
+ latex_content
245
+ )
246
+
247
+ # Write the fixed LaTeX file
248
+ with open(latex_dest, 'w', encoding='utf-8') as f:
249
+ f.write(latex_content)
250
+
251
+ # Copy media files if they exist
252
+ if task.get('media_path') and os.path.exists(task['media_path']):
253
+ media_dest = os.path.join(package_dir, 'media')
254
+
255
+ # Check if there's a nested media folder structure
256
+ inner_media = os.path.join(task['media_path'], 'media')
257
+ if os.path.exists(inner_media):
258
+ # Copy from the inner media folder to avoid media/media/ nesting
259
+ shutil.copytree(inner_media, media_dest)
260
+ else:
261
+ # Copy the media_path directly if no nesting
262
+ shutil.copytree(task['media_path'], media_dest)
263
+
264
+ # Create README file
265
+ readme_content = f"""# {base_name} - DOCX to LaTeX Conversion
266
+
267
+ ## Package Contents:
268
+
269
+ 1. **{task['output_filename']}** - Main LaTeX file
270
+ 2. **media/** - Images and media files (if any)
271
+
272
+ ## How to Use:
273
+
274
+ ### Compiling LaTeX:
275
+ ```bash
276
+ pdflatex {task['output_filename']}
277
+ ```
278
+
279
+ ### For Overleaf:
280
+ 1. Upload all files to a new Overleaf project
281
+ 2. Set main file: {task['output_filename']}
282
+ 3. Compile the project
283
+
284
+ ### Local Compilation:
285
+ ```bash
286
+ # Basic compilation
287
+ pdflatex {task['output_filename']}
288
+
289
+ # For bibliography and cross-references
290
+ pdflatex {task['output_filename']}
291
+ bibtex {task['output_filename'].replace('.tex', '')}
292
+ pdflatex {task['output_filename']}
293
+ pdflatex {task['output_filename']}
294
+ ```
295
+
296
+ ## Features:
297
+ - Enhanced formatting preservation
298
+ - Overleaf compatibility
299
+ - Automatic image path fixing
300
+ - Unicode character conversion
301
+ - Mathematical expression optimization
302
+
303
+ ## Generated by:
304
+ DOCX to LaTeX Web Converter
305
+ https://github.com/your-username/docx-to-latex
306
+ """
307
+
308
+ readme_path = os.path.join(package_dir, 'README.txt')
309
+ with open(readme_path, 'w', encoding='utf-8') as f:
310
+ f.write(readme_content)
311
+
312
+ # Create ZIP file
313
+ zip_path = os.path.join(temp_dir, f"{base_name}_complete.zip")
314
+ shutil.make_archive(zip_path.replace('.zip', ''), 'zip', package_dir)
315
+
316
+ return send_file(
317
+ zip_path,
318
+ as_attachment=True,
319
+ download_name=f"{base_name}_complete.zip",
320
+ mimetype='application/zip'
321
+ )
322
+
323
+ except Exception as e:
324
+ return jsonify({'error': f'Complete package download failed: {str(e)}'}), 500
325
+
326
+ @app.route('/api/status/<task_id>', methods=['GET'])
327
+ def get_task_status(task_id):
328
+ """Get conversion task status"""
329
+ try:
330
+ if task_id not in conversion_tasks:
331
+ return jsonify({'error': 'Invalid task ID'}), 404
332
+
333
+ task = conversion_tasks[task_id]
334
+
335
+ response_data = {
336
+ 'task_id': task_id,
337
+ 'status': task['status'],
338
+ 'original_filename': task['original_filename'],
339
+ 'output_filename': task.get('output_filename', ''),
340
+ }
341
+
342
+ if task['status'] == 'completed':
343
+ response_data['message'] = task.get('conversion_message', 'Conversion completed successfully')
344
+ response_data['has_media'] = task.get('media_path') and os.path.exists(task['media_path'])
345
+ elif task['status'] == 'failed':
346
+ response_data['error'] = task.get('error_message', 'Conversion failed')
347
+
348
+ return jsonify(response_data)
349
+
350
+ except Exception as e:
351
+ return jsonify({'error': f'Status check failed: {str(e)}'}), 500
352
+
353
+ @app.route('/api/cleanup/<task_id>', methods=['DELETE'])
354
+ def cleanup_task(task_id):
355
+ """Clean up task files"""
356
+ try:
357
+ if task_id not in conversion_tasks:
358
+ return jsonify({'error': 'Invalid task ID'}), 404
359
+
360
+ task = conversion_tasks[task_id]
361
+
362
+ # Remove uploaded file
363
+ if os.path.exists(task['file_path']):
364
+ os.remove(task['file_path'])
365
+
366
+ # Remove output file
367
+ if task.get('output_path') and os.path.exists(task['output_path']):
368
+ os.remove(task['output_path'])
369
+
370
+ # Remove media directory
371
+ if task.get('media_path') and os.path.exists(task['media_path']):
372
+ shutil.rmtree(task['media_path'])
373
+
374
+ # Remove media ZIP if it exists
375
+ media_zip = task.get('media_path', '') + '.zip'
376
+ if os.path.exists(media_zip):
377
+ os.remove(media_zip)
378
+
379
+ # Remove task from memory
380
+ del conversion_tasks[task_id]
381
+
382
+ return jsonify({'message': 'Task cleaned up successfully'})
383
+
384
+ except Exception as e:
385
+ return jsonify({'error': f'Cleanup failed: {str(e)}'}), 500
386
+
387
+ @app.route('/api/tasks', methods=['GET'])
388
+ def list_tasks():
389
+ """List all conversion tasks (for debugging)"""
390
+ try:
391
+ tasks_summary = {}
392
+ for task_id, task in conversion_tasks.items():
393
+ tasks_summary[task_id] = {
394
+ 'status': task['status'],
395
+ 'original_filename': task['original_filename'],
396
+ 'output_filename': task.get('output_filename', ''),
397
+ 'created_at': task.get('created_at', 0)
398
+ }
399
+
400
+ return jsonify(tasks_summary)
401
+
402
+ except Exception as e:
403
+ return jsonify({'error': f'Failed to list tasks: {str(e)}'}), 500
404
+
405
+ # Cleanup old files on startup
406
+ def cleanup_old_files():
407
+ """Remove old temporary files"""
408
+ try:
409
+ import time
410
+ current_time = time.time()
411
+ cutoff_time = current_time - (24 * 60 * 60) # 24 hours ago
412
+
413
+ for folder in [UPLOAD_FOLDER, OUTPUT_FOLDER]:
414
+ if os.path.exists(folder):
415
+ for filename in os.listdir(folder):
416
+ file_path = os.path.join(folder, filename)
417
+ if os.path.isfile(file_path):
418
+ file_time = os.path.getctime(file_path)
419
+ if file_time < cutoff_time:
420
+ os.remove(file_path)
421
+ elif os.path.isdir(file_path):
422
+ dir_time = os.path.getctime(file_path)
423
+ if dir_time < cutoff_time:
424
+ shutil.rmtree(file_path)
425
+ except Exception as e:
426
+ print(f"Warning: Failed to cleanup old files: {e}")
427
+
428
+ if __name__ == '__main__':
429
+ # Cleanup old files on startup
430
+ cleanup_old_files()
431
+
432
+ # Run the Flask app
433
+ print("Starting DOCX to LaTeX API server...")
434
+ print("API endpoints:")
435
+ print(" POST /api/upload - Upload DOCX file")
436
+ print(" POST /api/convert - Convert to LaTeX")
437
+ print(" GET /api/download/<task_id> - Download LaTeX file")
438
+ print(" GET /api/download-media/<task_id> - Download media files")
439
+ print(" GET /api/status/<task_id> - Get conversion status")
440
+ print(" DELETE /api/cleanup/<task_id> - Cleanup task files")
441
+ print(" GET /api/health - Health check")
442
+
443
+ app.run(debug=True, host='0.0.0.0', port=5000)
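End to end, the whole upload/convert/download pipeline can be driven in-process with Flask's test client instead of a live server. A rough sketch, assuming Pandoc is installed locally and a sample.docx (placeholder name) sits next to the code:

```python
import io

from web_api import app

with app.test_client() as client:
    # 1. Upload a DOCX file.
    with open("sample.docx", "rb") as f:
        upload = client.post(
            "/api/upload",
            data={"file": (io.BytesIO(f.read()), "sample.docx")},
            content_type="multipart/form-data",
        )
    task_id = upload.get_json()["task_id"]

    # 2. Convert with a couple of the documented options.
    convert = client.post(
        "/api/convert",
        json={"task_id": task_id, "options": {"generateToc": False, "extractMedia": True}},
    )
    print(convert.get_json())

    # 3. Download the complete ZIP package (LaTeX + media + README).
    package = client.get(f"/api/download-complete/{task_id}")
    with open("converted_package.zip", "wb") as out:
        out.write(package.data)
```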