Upload 9 files

- DEPLOYMENT_INSTRUCTIONS.txt +39 -0
- Dockerfile +32 -0
- README.md +126 -5
- app.py +33 -0
- converter.py +878 -0
- preserve_linebreaks.lua +29 -0
- requirements.txt +7 -0
- temp/.DS_Store +0 -0
- web_api.py +443 -0
DEPLOYMENT_INSTRUCTIONS.txt
ADDED
@@ -0,0 +1,39 @@
# Deployment guide for Hugging Face Spaces

## Step 1: Create the Space
1. Go to https://huggingface.co/spaces
2. Click "Create new Space"
3. Enter the Space name: docx-to-latex
4. Select Docker as the SDK (the README frontmatter sets `sdk: docker` and this repo ships a Dockerfile)
5. Click "Create Space"

## Step 2: Upload the files
Upload every file in this folder to the Space:

### Method 1: Git
```bash
git clone https://huggingface.co/spaces/YOUR_USERNAME/docx-to-latex
cd docx-to-latex
cp -r ../huggingface_deployment/* .
git add .
git commit -m "Add DOCX to LaTeX converter API"
git push
```

### Method 2: Web interface
Drag & drop the files onto the Space page.

## Step 3: Test
After deployment, the API will be available at:
https://YOUR_USERNAME-docx-to-latex.hf.space/api/health

## Copied files:
- app.py
- web_api.py
- converter.py
- requirements.txt
- README.md
- Dockerfile
- .gitignore
- preserve_linebreaks.lua
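The health endpoint in Step 3 also lends itself to a scripted smoke test. A minimal sketch using `requests`, with the same placeholder Space URL as in the instructions:

```python
import requests

# Placeholder URL from Step 3 above; substitute your own username.
BASE_URL = "https://YOUR_USERNAME-docx-to-latex.hf.space"

# /api/health returns JSON once the Space has finished building and started.
response = requests.get(f"{BASE_URL}/api/health", timeout=30)
print(response.status_code)  # expected: 200
print(response.json())       # expected: {'status': 'healthy', 'message': '...'}
```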
Dockerfile
ADDED
@@ -0,0 +1,32 @@
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    pandoc \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy core application files
COPY app.py .
COPY web_api.py .
COPY converter.py .
COPY preserve_linebreaks.lua .

# Create necessary directories
RUN mkdir -p temp/uploads temp/outputs

# Expose port
EXPOSE 7860

# Set environment variables
ENV PYTHONPATH=/app
ENV PORT=7860

# Run the application
CMD ["python", "app.py"]
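The image installs pandoc via apt while the application drives it through pypandoc, so a quick sanity check inside the container confirms the two layers are wired together. A minimal sketch (not one of the committed files):

```python
import pypandoc

# pypandoc looks up the pandoc binary on PATH and raises OSError if it is absent.
print(pypandoc.get_pandoc_version())

# A trivial round-trip exercises the same call path converter.py relies on.
print(pypandoc.convert_text("*hello*", "latex", format="md"))
```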
README.md
CHANGED
@@ -1,10 +1,131 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: DOCX to LaTeX Converter
+emoji: 📄
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
 pinned: false
+license: mit
 ---

# 📄 DOCX to LaTeX Converter API

A professional DOCX to LaTeX converter with advanced features and a modern web interface.

## 🌟 Features

- ✅ High-quality DOCX to LaTeX conversion
- ✅ Image extraction and preservation
- ✅ Overleaf compatibility
- ✅ Style and formatting preservation
- ✅ Automatic table of contents generation
- ✅ Complete ZIP package download
- ✅ Simple and powerful API interface
- ✅ Free hosting on Hugging Face Spaces

## 🚀 Usage

### API Endpoints

#### 1. Health Check
```bash
GET /api/health
```

#### 2. Upload File
```bash
POST /api/upload
Content-Type: multipart/form-data
Body: file (DOCX file)
```

#### 3. Convert Document
```bash
POST /api/convert
Content-Type: application/json
Body: {
  "task_id": "string",
  "output_filename": "string",
  "options": {
    "generateToc": boolean,
    "extractMedia": boolean,
    "overleafCompatible": boolean,
    "preserveStyles": boolean,
    "preserveLineBreaks": boolean
  }
}
```

#### 4. Download Complete Package
```bash
GET /api/download-complete/{task_id}
```

### Example Usage

```python
import requests

# Upload file
with open('document.docx', 'rb') as f:
    response = requests.post('https://YOUR_USERNAME-docx-to-latex.hf.space/api/upload',
                             files={'file': f})
task_id = response.json()['task_id']

# Convert
convert_response = requests.post('https://YOUR_USERNAME-docx-to-latex.hf.space/api/convert',
                                 json={
                                     'task_id': task_id,
                                     'options': {
                                         'generateToc': True,
                                         'extractMedia': True,
                                         'overleafCompatible': True
                                     }
                                 })

# Download complete package
download_response = requests.get(f'https://YOUR_USERNAME-docx-to-latex.hf.space/api/download-complete/{task_id}')
with open('converted_package.zip', 'wb') as f:
    f.write(download_response.content)
```

## 🔧 Local Installation

```bash
git clone https://github.com/YOUR_USERNAME/docx-to-latex.git
cd docx-to-latex
pip install -r requirements.txt
python app.py
```

## 📚 Documentation

This API provides seamless conversion from Word documents to LaTeX while preserving formatting, images, and tables. The final output includes the LaTeX file and media folder in a ZIP package ready for use in Overleaf.

## 🤝 Contributing

Contributions are welcome! Please feel free to submit issues or pull requests.

## 📄 License

MIT License - see LICENSE file for details.
app.py
ADDED
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
"""
DOCX to LaTeX Converter API
Main entry point for Hugging Face Spaces deployment
"""

import os
import sys

# Set up environment for Hugging Face Spaces
if 'SPACE_ID' in os.environ:
    # Running on Hugging Face Spaces
    PORT = int(os.environ.get('PORT', 7860))
    HOST = '0.0.0.0'
else:
    # Running locally
    PORT = 5001
    HOST = '127.0.0.1'

# Import the Flask app
from web_api import app

if __name__ == "__main__":
    print(f"🚀 Starting DOCX to LaTeX Converter API")
    print(f"🌐 Server running on http://{HOST}:{PORT}")
    print(f"📖 Health check: http://{HOST}:{PORT}/api/health")
    print(f"📚 API Documentation: https://huggingface.co/spaces/YOUR_USERNAME/docx-to-latex")

    app.run(
        host=HOST,
        port=PORT,
        debug=False  # Disable debug in production
    )
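Since the host/port selection keys off the `SPACE_ID` environment variable (which Hugging Face Spaces sets at runtime), the Spaces code path can be exercised locally by setting it by hand. A minimal sketch:

```python
import os
import subprocess

# Simulate the Spaces environment: SPACE_ID flips app.py to 0.0.0.0:7860.
env = dict(os.environ, SPACE_ID="local-test", PORT="7860")
subprocess.run(["python", "app.py"], env=env)
```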
converter.py
ADDED
@@ -0,0 +1,878 @@
import pypandoc
import os
import re
import tempfile

def convert_docx_to_latex(
    docx_path: str,
    latex_path: str,
    generate_toc: bool = False,
    extract_media_to_path: str = None,
    latex_template_path: str = None,
    overleaf_compatible: bool = False,
    preserve_styles: bool = True,
    preserve_linebreaks: bool = True
) -> tuple[bool, str]:
    """
    Converts a DOCX file to a LaTeX file using pypandoc with enhanced features.

    Args:
        docx_path: Path to the input .docx file.
        latex_path: Path to save the output .tex file.
        generate_toc: If True, attempts to generate a Table of Contents.
        extract_media_to_path: If specified, path to extract media to (e.g., "./media").
        latex_template_path: If specified, path to a custom Pandoc LaTeX template file.
        overleaf_compatible: If True, makes images work in Overleaf with relative paths.
        preserve_styles: If True, preserves document styles like centering and alignment.
        preserve_linebreaks: If True, preserves line breaks and proper list formatting.

    Returns:
        A tuple (success: bool, message: str).
    """
    extra_args = []

    # Ensure standalone document (not fragment)
    extra_args.append("--standalone")

    # Basic options
    if generate_toc:
        extra_args.append("--toc")
    if extract_media_to_path:
        extra_args.append(f"--extract-media={extract_media_to_path}")
    if latex_template_path and os.path.isfile(latex_template_path):
        extra_args.append(f"--template={latex_template_path}")
    elif latex_template_path:
        pass  # Template not found, Pandoc will handle the error

    # Enhanced features
    if overleaf_compatible:
        extra_args.extend([
            "--resource-path=./",
            "--default-image-extension=png"
        ])

    if preserve_styles:
        extra_args.extend([
            "--from=docx+styles",
            "--wrap=preserve",
            "--columns=72",
            "--strip-comments"  # Remove comments that might cause highlighting
        ])

    if preserve_linebreaks:
        extra_args.extend([
            "--preserve-tabs",
            "--wrap=preserve",
            "--reference-doc=" + docx_path  # Use original Word doc as reference for formatting
        ])

    # Create minimal Lua filter that preserves Word's original line breaks
    lua_filter_content = '''
function Para(elem)
  -- Preserve all line breaks exactly as they appear in Word
  -- This maintains Word's original pagination and formatting
  local new_content = {}

  for i, item in ipairs(elem.content) do
    if item.t == "SoftBreak" then
      -- Convert all soft breaks to line breaks to match Word's formatting
      table.insert(new_content, pandoc.LineBreak())
    else
      table.insert(new_content, item)
    end
  end

  elem.content = new_content
  return elem
end

function LineBlock(elem)
  -- Preserve line blocks exactly as they are
  return elem
end

function Span(elem)
  -- Remove unwanted highlighting and formatting
  if elem.attributes and elem.attributes.style then
    -- Remove background colors and highlighting
    local style = elem.attributes.style
    if string.find(style, "background") or string.find(style, "highlight") then
      elem.attributes.style = nil
    end
  end
  return elem
end

function Div(elem)
  -- Remove unwanted div formatting that causes highlighting
  if elem.attributes and elem.attributes.style then
    local style = elem.attributes.style
    if string.find(style, "background") or string.find(style, "highlight") then
      elem.attributes.style = nil
    end
  end
  return elem
end

function RawBlock(elem)
  -- Preserve raw LaTeX blocks
  if elem.format == "latex" then
    return elem
  end
end
'''

    # Create temporary Lua filter file
    with tempfile.NamedTemporaryFile(mode='w', suffix='.lua', delete=False) as f:
        f.write(lua_filter_content)
        lua_filter_path = f.name

    extra_args.append(f"--lua-filter={lua_filter_path}")

    try:
        # Perform conversion
        pypandoc.convert_file(docx_path, 'latex', outputfile=latex_path, extra_args=extra_args)

        # Clean up temporary Lua filter if created
        if preserve_linebreaks and 'lua_filter_path' in locals():
            try:
                os.unlink(lua_filter_path)
            except OSError:
                pass

        # Apply post-processing enhancements (always applied for Unicode conversion)
        _apply_post_processing(latex_path, overleaf_compatible, preserve_styles, preserve_linebreaks, extract_media_to_path)

        # Generate status message
        enhancements = []
        if overleaf_compatible:
            enhancements.append("Overleaf compatibility")
        if preserve_styles:
            enhancements.append("style preservation")
        if preserve_linebreaks:
            enhancements.append("line break preservation")

        if enhancements:
            enhancement_msg = f" with {', '.join(enhancements)}"
        else:
            enhancement_msg = ""

        return True, f"Conversion successful{enhancement_msg}!"

    except RuntimeError as e:
        # Clean up temporary Lua filter if created
        if preserve_linebreaks and 'lua_filter_path' in locals():
            try:
                os.unlink(lua_filter_path)
            except OSError:
                pass
        return False, f"RuntimeError: Could not execute Pandoc. Please ensure Pandoc is installed and in your system's PATH. Error: {e}"
    except Exception as e:
        # Clean up temporary Lua filter if created
        if preserve_linebreaks and 'lua_filter_path' in locals():
            try:
                os.unlink(lua_filter_path)
            except OSError:
                pass
        return False, f"Conversion failed: {e}"

def _apply_post_processing(latex_path: str, overleaf_compatible: bool, preserve_styles: bool, preserve_linebreaks: bool, extract_media_to_path: str = None):
    """
    Apply post-processing enhancements to the generated LaTeX file.
    """
    try:
        with open(latex_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Always inject essential packages for compilation compatibility
        content = _inject_essential_packages(content)

        # Fix mixed mathematical expressions first to remove duplicated text
        content = _fix_mixed_mathematical_expressions(content)

        # Convert Unicode mathematical characters to LaTeX equivalents (always applied)
        content = _convert_unicode_math_characters(content)

        # Apply additional Unicode cleanup as a safety net
        content = _additional_unicode_cleanup(content)

        # Apply Overleaf compatibility fixes
        if overleaf_compatible:
            content = _fix_image_paths_for_overleaf(content, extract_media_to_path)

        # Apply style preservation enhancements
        if preserve_styles:
            content = _inject_latex_packages(content)
            content = _add_centering_commands(content)

        # Apply line break preservation fixes
        if preserve_linebreaks:
            content = _fix_line_breaks_and_spacing(content)

        # Remove unwanted formatting and highlighting
        content = _remove_unwanted_formatting(content)

        # Fix common LaTeX compilation issues
        content = _fix_compilation_issues(content)

        # Write back the processed content
        with open(latex_path, 'w', encoding='utf-8') as f:
            f.write(content)

    except Exception as e:
        # Post-processing failures shouldn't break the conversion
        print(f"Warning: Post-processing failed: {e}")

def _inject_essential_packages(content: str) -> str:
    """
    Inject essential packages that are always needed for compilation.
    """
    # Core packages that Pandoc might not include but are often needed
    essential_packages = [
        r'\usepackage[utf8]{inputenc}',  # UTF-8 input encoding
        r'\usepackage[T1]{fontenc}',     # Font encoding
        r'\usepackage{graphicx}',        # For images
        r'\usepackage{longtable}',       # For tables
        r'\usepackage{booktabs}',        # Better table formatting
        r'\usepackage{hyperref}',        # For links (if not already included)
        r'\usepackage{amsmath}',         # Mathematical formatting
        r'\usepackage{amssymb}',         # Mathematical symbols
        r'\usepackage{textcomp}',        # Additional text symbols
    ]

    documentclass_pattern = r'\\documentclass(?:\[[^\]]*\])?\{[^}]+\}'
    documentclass_match = re.search(documentclass_pattern, content)

    if documentclass_match:
        insert_pos = documentclass_match.end()

        packages_to_insert = []
        for package in essential_packages:
            package_name = package.split('{')[1].split('}')[0].split(']')[0]  # Extract package name
            if f'usepackage' not in content or package_name not in content:
                packages_to_insert.append(package)

        if packages_to_insert:
            package_block = '\n% Essential packages for compilation\n' + '\n'.join(packages_to_insert) + '\n'
            content = content[:insert_pos] + package_block + content[insert_pos:]

    # Add Unicode character definitions to handle any remaining problematic characters
    unicode_definitions = r'''
% Unicode character definitions for LaTeX compatibility
\DeclareUnicodeCharacter{2003}{ } % Em space
\DeclareUnicodeCharacter{2002}{ } % En space
\DeclareUnicodeCharacter{2009}{ } % Thin space
\DeclareUnicodeCharacter{200A}{ } % Hair space
\DeclareUnicodeCharacter{2004}{ } % Three-per-em space
\DeclareUnicodeCharacter{2005}{ } % Four-per-em space
\DeclareUnicodeCharacter{2006}{ } % Six-per-em space
\DeclareUnicodeCharacter{2008}{ } % Punctuation space
\DeclareUnicodeCharacter{202F}{ } % Narrow no-break space
\DeclareUnicodeCharacter{2212}{-} % Unicode minus sign
\DeclareUnicodeCharacter{2010}{-} % Hyphen
\DeclareUnicodeCharacter{2011}{-} % Non-breaking hyphen
\DeclareUnicodeCharacter{2013}{--} % En dash
\DeclareUnicodeCharacter{2014}{---}% Em dash
'''

    # Insert Unicode definitions after packages but before \begin{document}
    begin_doc_match = re.search(r'\\begin\{document\}', content)
    if begin_doc_match:
        insert_pos_unicode = begin_doc_match.start()
        content = content[:insert_pos_unicode] + unicode_definitions + '\n' + content[insert_pos_unicode:]

    return content

def _convert_unicode_math_characters(content: str) -> str:
    """
    Convert Unicode mathematical characters to their LaTeX equivalents.
    """
    # Dictionary of Unicode characters to LaTeX commands
    unicode_to_latex = {
        # Mathematical operators
        'Δ': r'$\Delta$',      # U+0394 - Greek capital letter delta
        'δ': r'$\delta$',      # U+03B4 - Greek small letter delta
        '∑': r'$\sum$',        # U+2211 - N-ary summation
        '∏': r'$\prod$',       # U+220F - N-ary product
        '∫': r'$\int$',        # U+222B - Integral
        '∂': r'$\partial$',    # U+2202 - Partial differential
        '∇': r'$\nabla$',      # U+2207 - Nabla
        '√': r'$\sqrt{}$',     # U+221A - Square root
        '∞': r'$\infty$',      # U+221E - Infinity

        # Relations and equality
        '≈': r'$\approx$',     # U+2248 - Almost equal to
        '≠': r'$\neq$',        # U+2260 - Not equal to
        '≤': r'$\leq$',        # U+2264 - Less-than or equal to
        '≥': r'$\geq$',        # U+2265 - Greater-than or equal to
        '±': r'$\pm$',         # U+00B1 - Plus-minus sign
        '∓': r'$\mp$',         # U+2213 - Minus-or-plus sign
        '×': r'$\times$',      # U+00D7 - Multiplication sign
        '÷': r'$\div$',        # U+00F7 - Division sign
        '⋅': r'$\cdot$',       # U+22C5 - Dot operator

        # Set theory and logic
        '∈': r'$\in$',         # U+2208 - Element of
        '∉': r'$\notin$',      # U+2209 - Not an element of
        '⊂': r'$\subset$',     # U+2282 - Subset of
        '⊃': r'$\supset$',     # U+2283 - Superset of
        '⊆': r'$\subseteq$',   # U+2286 - Subset of or equal to
        '⊇': r'$\supseteq$',   # U+2287 - Superset of or equal to
        '∪': r'$\cup$',        # U+222A - Union
        '∩': r'$\cap$',        # U+2229 - Intersection
        '∅': r'$\emptyset$',   # U+2205 - Empty set
        '∀': r'$\forall$',     # U+2200 - For all
        '∃': r'$\exists$',     # U+2203 - There exists

        # Special symbols
        '∣': r'$|$',           # U+2223 - Divides
        '∥': r'$\parallel$',   # U+2225 - Parallel to
        '⊥': r'$\perp$',       # U+22A5 - Up tack (perpendicular)
        '∠': r'$\angle$',      # U+2220 - Angle
        '°': r'$^\circ$',      # U+00B0 - Degree sign

        # Arrows
        '→': r'$\rightarrow$',       # U+2192 - Rightwards arrow
        '←': r'$\leftarrow$',        # U+2190 - Leftwards arrow
        '↔': r'$\leftrightarrow$',   # U+2194 - Left right arrow
        '⇒': r'$\Rightarrow$',       # U+21D2 - Rightwards double arrow
        '⇐': r'$\Leftarrow$',        # U+21D0 - Leftwards double arrow
        '⇔': r'$\Leftrightarrow$',   # U+21D4 - Left right double arrow

        # Accents and diacritics
        'ˉ': r'$\bar{}$',      # U+02C9 - Modifier letter macron
        'ˆ': r'$\hat{}$',      # U+02C6 - Modifier letter circumflex accent
        'ˇ': r'$\check{}$',    # U+02C7 - Caron
        '˜': r'$\tilde{}$',    # U+02DC - Small tilde
        '˙': r'$\dot{}$',      # U+02D9 - Dot above
        '¨': r'$\ddot{}$',     # U+00A8 - Diaeresis

        # Special minus and spaces - using explicit Unicode escape sequences
        '−': r'-',             # U+2212 - Minus sign (convert to regular hyphen)
        '\u2003': r' ',        # U+2003 - Em space (convert to regular space)
        '\u2009': r' ',        # U+2009 - Thin space (convert to regular space)
        '\u2002': r' ',        # U+2002 - En space (convert to regular space)
        '\u2004': r' ',        # U+2004 - Three-per-em space
        '\u2005': r' ',        # U+2005 - Four-per-em space
        '\u2006': r' ',        # U+2006 - Six-per-em space
        '\u2008': r' ',        # U+2008 - Punctuation space
        '\u200A': r' ',        # U+200A - Hair space
        '\u202F': r' ',        # U+202F - Narrow no-break space

        # Greek letters (commonly used in math)
        'α': r'$\alpha$',      # U+03B1
        'β': r'$\beta$',       # U+03B2
        'γ': r'$\gamma$',      # U+03B3
        'Γ': r'$\Gamma$',      # U+0393
        'ε': r'$\varepsilon$', # U+03B5
        'ζ': r'$\zeta$',       # U+03B6
        'η': r'$\eta$',        # U+03B7
        'θ': r'$\theta$',      # U+03B8
        'Θ': r'$\Theta$',      # U+0398
        'ι': r'$\iota$',       # U+03B9
        'κ': r'$\kappa$',      # U+03BA
        'λ': r'$\lambda$',     # U+03BB
        'Λ': r'$\Lambda$',     # U+039B
        'μ': r'$\mu$',         # U+03BC
        'ν': r'$\nu$',         # U+03BD
        'ξ': r'$\xi$',         # U+03BE
        'Ξ': r'$\Xi$',         # U+039E
        'π': r'$\pi$',         # U+03C0
        'Π': r'$\Pi$',         # U+03A0
        'ρ': r'$\rho$',        # U+03C1
        'σ': r'$\sigma$',      # U+03C3
        'Σ': r'$\Sigma$',      # U+03A3
        'τ': r'$\tau$',        # U+03C4
        'υ': r'$\upsilon$',    # U+03C5
        'Υ': r'$\Upsilon$',    # U+03A5
        'φ': r'$\varphi$',     # U+03C6
        'Φ': r'$\Phi$',        # U+03A6
        'χ': r'$\chi$',        # U+03C7
        'ψ': r'$\psi$',        # U+03C8
        'Ψ': r'$\Psi$',        # U+03A8
        'ω': r'$\omega$',      # U+03C9
        'Ω': r'$\Omega$',      # U+03A9
    }

    # Apply conversions
    for unicode_char, latex_cmd in unicode_to_latex.items():
        if unicode_char in content:
            content = content.replace(unicode_char, latex_cmd)

    # Additional aggressive Unicode space cleanup using regex
    # Handle various Unicode spaces more comprehensively
    content = re.sub(r'[\u2000-\u200F\u2028-\u202F\u205F\u3000]', ' ', content)  # All Unicode spaces

    # Handle specific problematic Unicode characters that might not be in our dictionary
    content = re.sub(r'[\u2010-\u2015]', '-', content)  # Various Unicode dashes
    content = re.sub(r'[\u2212]', '-', content)         # Unicode minus sign

    # Handle specific cases where characters might appear in math environments
    # Fix double math mode (e.g., $\alpha$ inside already math mode)
    content = re.sub(r'\$\$([^$]+)\$\$', r'$\1$', content)           # Convert display math to inline
    content = re.sub(r'\$\$([^$]*)\$([^$]*)\$\$', r'$\1\2$', content)  # Fix broken math

    # Fix bar notation that might have been broken
    content = re.sub(r'\$\\bar\{\}\$([a-zA-Z])', r'$\\bar{\1}$', content)
    content = re.sub(r'([a-zA-Z])\$\\bar\{\}\$', r'$\\bar{\1}$', content)

    return content

def _additional_unicode_cleanup(content: str) -> str:
    """
    Additional aggressive Unicode cleanup to handle any characters that slip through.
    """
    # Convert all common problematic Unicode spaces to regular spaces
    # This covers a wider range than the dictionary approach
    unicode_spaces = [
        '\u00A0',  # Non-breaking space
        '\u1680',  # Ogham space mark
        '\u2000',  # En quad
        '\u2001',  # Em quad
        '\u2002',  # En space
        '\u2003',  # Em space
        '\u2004',  # Three-per-em space
        '\u2005',  # Four-per-em space
        '\u2006',  # Six-per-em space
        '\u2007',  # Figure space
        '\u2008',  # Punctuation space
        '\u2009',  # Thin space
        '\u200A',  # Hair space
        '\u200B',  # Zero width space
        '\u202F',  # Narrow no-break space
        '\u205F',  # Medium mathematical space
        '\u3000',  # Ideographic space
    ]

    for unicode_space in unicode_spaces:
        content = content.replace(unicode_space, ' ')

    # Convert Unicode dashes
    unicode_dashes = [
        '\u2010',  # Hyphen
        '\u2011',  # Non-breaking hyphen
        '\u2012',  # Figure dash
        '\u2013',  # En dash
        '\u2014',  # Em dash
        '\u2015',  # Horizontal bar
        '\u2212',  # Minus sign
    ]

    for unicode_dash in unicode_dashes:
        if unicode_dash in ['\u2013', '\u2014']:  # En and Em dashes
            content = content.replace(unicode_dash, '--')
        else:
            content = content.replace(unicode_dash, '-')

    # Use regex for any remaining problematic characters
    # Remove or replace any remaining Unicode characters that commonly cause issues
    content = re.sub(r'[\u2000-\u200F\u2028-\u202F\u205F\u3000]', ' ', content)
    content = re.sub(r'[\u2010-\u2015\u2212]', '-', content)

    return content

def _fix_mixed_mathematical_expressions(content: str) -> str:
    """
    Removes duplicated plain-text versions of mathematical expressions
    that Pandoc sometimes generates alongside the LaTeX version by deleting
    the plain text part when it is immediately followed by the LaTeX part.
    """

    processed_content = content

    # A list of compiled regex patterns.
    # Each pattern matches a plain-text formula but only if it's followed
    # by its corresponding LaTeX version (using a positive lookahead).
    patterns_to_remove = [
        # Pattern for: hq,k=x[nq,k]...h_{q,k} = x[n_{q,k}]...
        re.compile(r'h[qrs],k=x\[n[qrs],k\](?:,h[qrs],k=x\[n[qrs],k\])*\s*' +
                   r'(?=h_{q,k}\s*=\s*x\\\[n_{q,k}\\\],)', re.UNICODE),

        # Pattern for: ∆hq,r,k=hq,k-hr,k...\Delta h_{q,r,k} = ...
        re.compile(r'(?:∆h[qrs],[qrs],k=h[qrs],k-h[qrs],k\s*)+' +
                   r'(?=\\Delta\s*h_{q,r,k})', re.UNICODE),

        # Pattern for: RRk=tr,k+1-tr,kRR_k = ...
        re.compile(r'RRk=tr,k\+1-tr,k\s*' +
                   r'(?=RR_k\s*=\s*t_{r,k\+1})', re.UNICODE),

        # Pattern for: Tmed=median{RRk}T_{\mathrm{med}}
        re.compile(r'Tmed=median\{RRk\}\s*' +
                   r'(?=T_{\\mathrm{med}}\s*=\s*\\mathrm{median}\\{RR_k\\})', re.UNICODE),

        # Pattern for: Tk=[tr,k-Tmed2, tr,k+Tmed2]\mathcal{T}_k
        re.compile(r'Tk=\[tr,k-Tmed2,.*?tr,k\+Tmed2\]\s*' +
                   r'(?=\\mathcal\{T\}_k\s*=\s*\\\[t_{r,k})', re.UNICODE | re.DOTALL),

        # Pattern for: h¯k=1|Ik|∑n∈Ikx[n]\bar h_k
        re.compile(r'h¯k=1\|Ik\|∑n∈Ikx\[n\]\s*' +
                   r'(?=\\bar\s*h_k\s*=\s*\\frac)', re.UNICODE),

        # Pattern for: Mrs=median{∆hr,s,k}M_{rs}
        re.compile(r'Mrs=median\{∆hr,s,k\}\s*' +
                   r'(?=M_{rs}\s*=\s*\\mathrm{median})', re.UNICODE),

        # Pattern for: ∆h¯k=h¯k-Mrs\Delta\bar h_k
        re.compile(r'∆h¯k=h¯k-Mrs\s*' +
                   r'(?=\\Delta\\bar\s*h_k\s*=\s*\\bar\s*h_k)', re.UNICODE),
    ]

    for pattern in patterns_to_remove:
        processed_content = pattern.sub('', processed_content)

    return processed_content

def _fix_compilation_issues(content: str) -> str:
    """
    Fix common LaTeX compilation issues.
    """
    # Fix \tightlist command if not defined
    if r'\tightlist' in content and r'\providecommand{\tightlist}' not in content:
        tightlist_def = r'''
% Define \tightlist command for lists
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
'''
        # Insert after packages but before \begin{document}
        begin_doc_match = re.search(r'\\begin\{document\}', content)
        if begin_doc_match:
            insert_pos = begin_doc_match.start()
            content = content[:insert_pos] + tightlist_def + '\n' + content[insert_pos:]

    # Fix \euro command if used but not defined
    if r'\euro' in content and r'usepackage{eurosym}' not in content:
        content = re.sub(
            r'(\\usepackage\{[^}]+\}\s*\n)',
            r'\1\\usepackage{eurosym}\n',
            content,
            count=1
        )

    # Fix undefined references to figures/tables
    content = re.sub(r'\\ref\{fig:([^}]+)\}', r'Figure~\\ref{fig:\1}', content)
    content = re.sub(r'\\ref\{tab:([^}]+)\}', r'Table~\\ref{tab:\1}', content)

    # Ensure proper figure placement
    if r'\begin{figure}' in content:
        content = re.sub(
            r'\\begin\{figure\}(?!\[)',
            r'\\begin{figure}[htbp]',
            content
        )

    # Ensure proper table placement
    if r'\begin{table}' in content:
        content = re.sub(
            r'\\begin\{table\}(?!\[)',
            r'\\begin{table}[htbp]',
            content
        )

    return content

def _fix_image_paths_for_overleaf(content: str, extract_media_to_path: str = None) -> str:
    """
    Convert absolute image paths to relative paths for Overleaf compatibility.
    """
    if extract_media_to_path:
        # Extract the media directory name
        media_dir = os.path.basename(extract_media_to_path.rstrip('/'))

        # Fix paths with task IDs like: task_id_media/media/image.png -> media/image.png
        # Pattern: \includegraphics{any_path/task_id_media/media/image.ext}
        # Replace with: \includegraphics{media/image.ext}
        pattern1 = r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[a-f0-9\-]+_media[/\\]media[/\\]([^{}]+)\}'
        replacement1 = r'\\includegraphics\1{media/\2}'
        content = re.sub(pattern1, replacement1, content)

        # Fix paths like: task_id_media/media/image.png -> media/image.png (without includegraphics)
        pattern2 = r'[a-f0-9\-]+_media[/\\]media[/\\]'
        replacement2 = r'media/'
        content = re.sub(pattern2, replacement2, content)

        # Also handle regular media paths: /absolute/path/to/media/image.ext -> media/image.ext
        pattern3 = r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[/\\]' + re.escape(media_dir) + r'[/\\]([^{}]+)\}'
        replacement3 = r'\\includegraphics\1{' + media_dir + r'/\2}'
        content = re.sub(pattern3, replacement3, content)

    return content

def _remove_unwanted_formatting(content: str) -> str:
    """
    Remove unwanted highlighting and formatting that causes visual issues.
    """
    # Remove highlighting commands
    content = re.sub(r'\\colorbox\{[^}]*\}\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\hl\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\texthl\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\hlc\[[^\]]*\]\{([^}]*)\}', r'\1', content)

    # Remove table cell coloring
    content = re.sub(r'\\cellcolor\{[^}]*\}', '', content)
    content = re.sub(r'\\rowcolor\{[^}]*\}', '', content)
    content = re.sub(r'\\columncolor\{[^}]*\}', '', content)

    # Remove text background colors
    content = re.sub(r'\\textcolor\{[^}]*\}\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\color\{[^}]*\}', '', content)

    # Remove box formatting that might cause highlighting
    content = re.sub(r'\\fcolorbox\{[^}]*\}\{[^}]*\}\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\framebox\[[^\]]*\]\{([^}]*)\}', r'\1', content)

    # Remove soul package highlighting
    content = re.sub(r'\\sethlcolor\{[^}]*\}', '', content)
    content = re.sub(r'\\ul\{([^}]*)\}', r'\1', content)  # Remove underline if causing issues

    return content

def _inject_latex_packages(content: str) -> str:
    """
    Inject additional LaTeX packages needed for enhanced formatting.
    """
    # Essential packages for enhanced conversion
    essential_packages = [
        r'\usepackage{graphicx}',   # For images - ensure it's included
        r'\usepackage{longtable}',  # For tables
        r'\usepackage{booktabs}',   # Better table formatting
        r'\usepackage{array}',      # Enhanced table formatting
        r'\usepackage{calc}',       # For calculations
        r'\usepackage{url}',        # For URLs
    ]

    # Style enhancement packages
    style_packages = [
        r'\usepackage{float}',      # Better float positioning
        r'\usepackage{adjustbox}',  # For centering and scaling
        r'\usepackage{caption}',    # Better caption formatting
        r'\usepackage{subcaption}', # For subfigures
        r'\usepackage{tabularx}',   # Flexible table widths
        r'\usepackage{enumitem}',   # Better list formatting
        r'\usepackage{setspace}',   # Line spacing control
        r'\usepackage{ragged2e}',   # Better text alignment
        r'\usepackage{amsmath}',    # Mathematical formatting
        r'\usepackage{amssymb}',    # Mathematical symbols
        r'\usepackage{needspace}',  # Prevent orphaned lines and improve page breaks
    ]

    all_packages = essential_packages + style_packages

    # Find the position after \documentclass but before any existing \usepackage or \begin{document}
    documentclass_pattern = r'\\documentclass(?:\[[^\]]*\])?\{[^}]+\}'
    documentclass_match = re.search(documentclass_pattern, content)

    if documentclass_match:
        insert_pos = documentclass_match.end()

        # Find the next significant LaTeX command to insert before it
        # Look for existing \usepackage, \begin{document}, or other commands
        remaining_content = content[insert_pos:]
        next_command_match = re.search(r'\\(?:usepackage|begin\{document\}|title|author|date)', remaining_content)

        if next_command_match:
            insert_pos += next_command_match.start()

        # Check which packages are not already included
        packages_to_insert = []
        for package in all_packages:
            package_name = package.replace(r'\usepackage{', '').replace('}', '')
            if f'usepackage{{{package_name}}}' not in content:
                packages_to_insert.append(package)

        if packages_to_insert:
            # Add packages with proper spacing
            package_block = '\n% Enhanced conversion packages\n' + '\n'.join(packages_to_insert) + '\n\n'
            content = content[:insert_pos] + package_block + content[insert_pos:]

    return content

def _add_centering_commands(content: str) -> str:
    """
    Add centering commands to figures and tables.
    """
    # Add \centering to figure environments
    content = re.sub(
        r'(\\begin\{figure\}(?:\[[^\]]*\])?)\s*\n',
        r'\1\n\\centering\n',
        content
    )

    # Add \centering to table environments
    content = re.sub(
        r'(\\begin\{table\}(?:\[[^\]]*\])?)\s*\n',
        r'\1\n\\centering\n',
        content
    )

    return content

def _fix_line_breaks_and_spacing(content: str) -> str:
    """
    Minimal fixes to preserve Word's original formatting and pagination.
    """
    # Remove unwanted highlighting and color commands
    content = re.sub(r'\\colorbox\{[^}]*\}\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\hl\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\texthl\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\cellcolor\{[^}]*\}', '', content)
    content = re.sub(r'\\rowcolor\{[^}]*\}', '', content)

    # Only fix critical spacing issues that break compilation
    # Preserve Word's original line breaks and spacing as much as possible

    # Ensure proper spacing around lists but don't change internal spacing
    content = re.sub(r'\n\\begin\{enumerate\}\n\n', r'\n\n\\begin{enumerate}\n', content)
    content = re.sub(r'\n\n\\end\{enumerate\}\n', r'\n\\end{enumerate}\n\n', content)
    content = re.sub(r'\n\\begin\{itemize\}\n\n', r'\n\n\\begin{itemize}\n', content)
    content = re.sub(r'\n\n\\end\{itemize\}\n', r'\n\\end{itemize}\n\n', content)

    # Minimal section spacing - preserve Word's pagination
    content = re.sub(r'\n(\\(?:sub)*section\{[^}]+\})\n\n', r'\n\n\1\n\n', content)

    # Only remove excessive spacing (3+ line breaks) but preserve double breaks
    content = re.sub(r'\n\n\n+', r'\n\n', content)

    # Ensure proper spacing around figures and tables
    content = re.sub(r'\n\\begin\{figure\}', r'\n\n\\begin{figure}', content)
    content = re.sub(r'\\end\{figure\}\n([A-Z])', r'\\end{figure}\n\n\1', content)
    content = re.sub(r'\n\\begin\{table\}', r'\n\n\\begin{table}', content)
    content = re.sub(r'\\end\{table\}\n([A-Z])', r'\\end{table}\n\n\1', content)

    return content

if __name__ == '__main__':
    from docx import Document
    from docx.shared import Inches
    from PIL import Image
    import shutil

    # --- Helper Functions for DOCX and Template Creation ---
    def create_dummy_image(filename, size=(60, 60), color="red", img_format="PNG"):
        img = Image.new('RGB', size, color=color)
        img.save(filename, img_format)
        print(f"Created dummy image: {filename}")

    def create_test_docx_with_styles(filename):
        doc = Document()
        doc.add_heading("Document with Enhanced Features", level=1)

        # Add paragraph with text
        p1 = doc.add_paragraph("This document tests enhanced features including:")

        # Add numbered list
        doc.add_paragraph("First numbered item", style='List Number')
        doc.add_paragraph("Second numbered item", style='List Number')
        doc.add_paragraph("Third numbered item", style='List Number')

        # Add some text
        doc.add_paragraph("Here is some regular text between lists.")

        # Add bullet list
        doc.add_paragraph("First bullet point", style='List Bullet')
        doc.add_paragraph("Second bullet point", style='List Bullet')

        doc.add_heading("Image Section", level=2)
        doc.add_paragraph("Below is a test image:")

        doc.save(filename)
        print(f"Created test DOCX with styles: {filename}")

    def create_complex_docx(filename, img1_path, img2_path):
        doc = Document()
        doc.add_heading("Complex Document Title", level=1)
        doc.add_paragraph("Introduction to the complex document.")
        doc.add_heading("Image Section", level=2)
        doc.add_picture(img1_path, width=Inches(1.0))
        doc.add_paragraph("Some text after the first image.")
        doc.add_picture(img2_path, width=Inches(1.0))
        doc.add_heading("Conclusion Section", level=2)
        doc.add_paragraph("Final remarks.")
        doc.save(filename)
        print(f"Created complex DOCX: {filename}")

    # --- Test Files ---
    docx_styles = "test_enhanced_styles.docx"
    docx_complex = "test_complex_enhanced.docx"
    img1 = "dummy_img1.png"
    img2 = "dummy_img2.jpg"

    output_enhanced_test = "output_enhanced_test.tex"
    output_overleaf_test = "output_overleaf_test.tex"
    media_dir = "./media_enhanced"

    all_test_files = [docx_styles, docx_complex, img1, img2, output_enhanced_test, output_overleaf_test]
    all_test_dirs = [media_dir]

    # --- Create Test Files ---
    print("--- Setting up enhanced test files ---")
    create_dummy_image(img1, color="blue", img_format="PNG")
    create_dummy_image(img2, color="green", img_format="JPEG")
    create_test_docx_with_styles(docx_styles)
    create_complex_docx(docx_complex, img1, img2)
    print("--- Enhanced test file setup complete ---")

    # --- Test Enhanced Features ---
    print("\n--- Testing Enhanced Features ---")

    # Test 1: Style preservation and line breaks
    print("\n--- Test 1: Enhanced Style Preservation ---")
    success, msg = convert_docx_to_latex(
        docx_styles,
        output_enhanced_test,
        generate_toc=True,
        preserve_styles=True,
        preserve_linebreaks=True
    )
    print(f"Enhanced Test: {success}, Msg: {msg}")

    if success and os.path.exists(output_enhanced_test):
        with open(output_enhanced_test, 'r') as f:
            content = f.read()
        checks = {
            'packages': any(pkg in content for pkg in ['\\usepackage{float}', '\\usepackage{enumitem}']),
            'toc': '\\tableofcontents' in content,
            'sections': '\\section' in content,
            'lists': '\\begin{enumerate}' in content or '\\begin{itemize}' in content
        }
        print(f"Enhanced verification: {checks}")

    # Test 2: Overleaf compatibility with images
    print("\n--- Test 2: Overleaf Compatibility ---")
    success, msg = convert_docx_to_latex(
        docx_complex,
        output_overleaf_test,
        extract_media_to_path=media_dir,
        overleaf_compatible=True,
        preserve_styles=True,
        preserve_linebreaks=True
    )
    print(f"Overleaf Test: {success}, Msg: {msg}")

    if success and os.path.exists(output_overleaf_test):
        with open(output_overleaf_test, 'r') as f:
            content = f.read()
        media_check = 'media/' in content and '\\includegraphics' in content
        print(f"Overleaf compatibility check - relative paths: {media_check}")

        media_files_exist = os.path.exists(os.path.join(media_dir, 'media'))
        print(f"Media files extracted: {media_files_exist}")

    # --- Cleanup ---
    print("\n--- Cleaning up enhanced test files ---")
    for f_path in all_test_files:
        if os.path.exists(f_path):
            try:
                os.remove(f_path)
                print(f"Removed: {f_path}")
            except Exception as e:
                print(f"Error removing {f_path}: {e}")

    for d_path in all_test_dirs:
        if os.path.isdir(d_path):
            try:
                shutil.rmtree(d_path)
                print(f"Removed directory: {d_path}")
            except Exception as e:
                print(f"Error removing {d_path}: {e}")

    print("--- Enhanced testing completed ---")
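The `__main__` block above exercises the converter end to end; for library use the whole flow reduces to a single call. A minimal sketch with placeholder file names:

```python
from converter import convert_docx_to_latex

# Convert a local DOCX, extracting embedded images alongside the .tex output.
success, message = convert_docx_to_latex(
    docx_path="paper.docx",           # placeholder input
    latex_path="paper.tex",           # placeholder output
    generate_toc=True,
    extract_media_to_path="./media",  # pandoc nests extracted images under ./media/media/
    overleaf_compatible=True,
)
print(success, message)
```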
preserve_linebreaks.lua
ADDED
@@ -0,0 +1,29 @@
-- preserve_linebreaks.lua
-- Filter for better preservation of line breaks and paragraph structure

function LineBreak(el)
  return pandoc.RawInline("latex", "\\\\")
end

function SoftBreak(el)
  return pandoc.RawInline("latex", " ")
end

function Para(el)
  -- Add proper spacing for numbered lists and paragraph breaks
  if #el.content > 0 then
    return pandoc.Para(el.content)
  end
end

-- Improve list formatting
function OrderedList(el)
  -- Ensure proper spacing in numbered lists
  return el
end

function BulletList(el)
  -- Ensure proper spacing in bullet lists
  return el
end
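This committed filter is a standalone variant of the inline filter that converter.py writes to a temporary file (here, soft breaks become plain spaces rather than line breaks). It can be wired through pypandoc directly; a minimal sketch with placeholder paths:

```python
import pypandoc

# Run the conversion with the committed filter instead of the generated temp file.
pypandoc.convert_file(
    "input.docx",             # placeholder input
    "latex",
    outputfile="output.tex",  # placeholder output
    extra_args=["--lua-filter=preserve_linebreaks.lua", "--standalone"],
)
```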
requirements.txt
ADDED
@@ -0,0 +1,7 @@
flask==2.3.3
flask-cors==4.0.0
pypandoc==1.13
python-docx==0.8.11
Pillow==10.0.0
werkzeug==2.3.7
gunicorn==21.2.0
temp/.DS_Store
ADDED
Binary file (6.15 kB)
web_api.py
ADDED
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, request, jsonify, send_file
|
2 |
+
from flask_cors import CORS
|
3 |
+
import os
|
4 |
+
import tempfile
|
5 |
+
import uuid
|
6 |
+
from werkzeug.utils import secure_filename
|
7 |
+
from converter import convert_docx_to_latex
|
8 |
+
import shutil
|
9 |
+
|
10 |
+
app = Flask(__name__)
|
11 |
+
CORS(app) # Enable CORS for all routes
|
12 |
+
|
13 |
+
# Configuration
|
14 |
+
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max file size
|
15 |
+
UPLOAD_FOLDER = 'temp/uploads'
|
16 |
+
OUTPUT_FOLDER = 'temp/outputs'
|
17 |
+
|
18 |
+
# Ensure directories exist
|
19 |
+
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
20 |
+
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
|
21 |
+
|
22 |
+
# Store conversion tasks
|
23 |
+
conversion_tasks = {}
|
24 |
+
|
25 |
+
@app.route('/api/health', methods=['GET'])
|
26 |
+
def health_check():
|
27 |
+
"""Health check endpoint"""
|
28 |
+
return jsonify({'status': 'healthy', 'message': 'DOCX to LaTeX API is running'})
|
29 |
+
|
30 |
+
@app.route('/api/upload', methods=['POST'])
|
31 |
+
def upload_file():
|
32 |
+
"""Handle file upload"""
|
33 |
+
try:
|
34 |
+
if 'file' not in request.files:
|
35 |
+
return jsonify({'error': 'No file provided'}), 400
|
36 |
+
|
37 |
+
file = request.files['file']
|
38 |
+
if file.filename == '':
|
39 |
+
return jsonify({'error': 'No file selected'}), 400
|
40 |
+
|
41 |
+
if not file.filename.lower().endswith('.docx'):
|
42 |
+
return jsonify({'error': 'Only DOCX files are allowed'}), 400
|
43 |
+
|
44 |
+
# Generate unique task ID
|
45 |
+
task_id = str(uuid.uuid4())
|
46 |
+
|
47 |
+
# Save uploaded file
|
48 |
+
filename = secure_filename(file.filename)
|
49 |
+
file_path = os.path.join(UPLOAD_FOLDER, f"{task_id}_{filename}")
|
50 |
+
file.save(file_path)
|
51 |
+
|
52 |
+
# Store task info
|
53 |
+
conversion_tasks[task_id] = {
|
54 |
+
'status': 'uploaded',
|
55 |
+
'original_filename': filename,
|
56 |
+
'file_path': file_path,
|
57 |
+
'output_filename': filename.replace('.docx', '.tex'),
|
58 |
+
'created_at': os.path.getctime(file_path)
|
59 |
+
}
|
60 |
+
|
61 |
+
return jsonify({
|
62 |
+
'task_id': task_id,
|
63 |
+
'filename': filename,
|
64 |
+
'status': 'uploaded',
|
65 |
+
'message': 'File uploaded successfully'
|
66 |
+
})
|
67 |
+
|
68 |
+
except Exception as e:
|
69 |
+
return jsonify({'error': f'Upload failed: {str(e)}'}), 500
|
70 |
+
|
@app.route('/api/convert', methods=['POST'])
def convert_document():
    """Convert DOCX to LaTeX"""
    try:
        data = request.get_json()

        if not data or 'task_id' not in data:
            return jsonify({'error': 'Task ID is required'}), 400

        task_id = data['task_id']

        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        if task['status'] != 'uploaded':
            return jsonify({'error': 'Task is not ready for conversion'}), 400

        # Get conversion options
        options = data.get('options', {})
        output_filename = data.get('output_filename', task['output_filename'])

        # Update task status
        task['status'] = 'converting'
        task['output_filename'] = output_filename

        # Prepare output paths
        output_path = os.path.join(OUTPUT_FOLDER, f"{task_id}_{output_filename}")
        media_path = os.path.join(OUTPUT_FOLDER, f"{task_id}_media")

        # Perform conversion
        success, message = convert_docx_to_latex(
            docx_path=task['file_path'],
            latex_path=output_path,
            generate_toc=options.get('generateToc', False),
            extract_media_to_path=media_path if options.get('extractMedia', True) else None,
            latex_template_path=None,  # Could be added later for custom templates
            overleaf_compatible=options.get('overleafCompatible', True),
            preserve_styles=options.get('preserveStyles', True),
            preserve_linebreaks=options.get('preserveLineBreaks', True)
        )

        if success:
            task['status'] = 'completed'
            task['output_path'] = output_path
            task['media_path'] = media_path if os.path.exists(media_path) else None
            task['conversion_message'] = message

            return jsonify({
                'task_id': task_id,
                'status': 'completed',
                'message': message,
                'output_filename': output_filename,
                'has_media': os.path.exists(media_path)
            })
        else:
            task['status'] = 'failed'
            task['error_message'] = message

            return jsonify({
                'task_id': task_id,
                'status': 'failed',
                'error': message
            }), 500

    except Exception as e:
        # Update task status if possible
        if 'task_id' in locals() and task_id in conversion_tasks:
            conversion_tasks[task_id]['status'] = 'failed'
            conversion_tasks[task_id]['error_message'] = str(e)

        return jsonify({'error': f'Conversion failed: {str(e)}'}), 500

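# Example request body for /api/convert; every option is shown with the
# default that is applied above when it is omitted:
#   {
#     "task_id": "<task_id from /api/upload>",
#     "options": {
#       "generateToc": false,
#       "extractMedia": true,
#       "overleafCompatible": true,
#       "preserveStyles": true,
#       "preserveLineBreaks": true
#     }
#   }
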
@app.route('/api/download/<task_id>', methods=['GET'])
def download_file(task_id):
    """Download converted LaTeX file"""
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        if task['status'] != 'completed':
            return jsonify({'error': 'Conversion not completed'}), 400

        if not os.path.exists(task['output_path']):
            return jsonify({'error': 'Output file not found'}), 404

        return send_file(
            task['output_path'],
            as_attachment=True,
            download_name=task['output_filename'],
            mimetype='text/plain'
        )

    except Exception as e:
        return jsonify({'error': f'Download failed: {str(e)}'}), 500

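# Example download; curl's -O -J flags save the file under the name sent in
# the Content-Disposition header set by send_file above:
#   curl -OJ http://localhost:5000/api/download/<task_id>
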
@app.route('/api/download-media/<task_id>', methods=['GET'])
def download_media(task_id):
    """Download media files as a ZIP archive"""
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        if task['status'] != 'completed':
            return jsonify({'error': 'Conversion not completed'}), 400

        if not task.get('media_path') or not os.path.exists(task['media_path']):
            return jsonify({'error': 'No media files found'}), 404

        # Create a ZIP file of the media directory
        zip_path = task['media_path'] + '.zip'
        shutil.make_archive(task['media_path'], 'zip', task['media_path'])

        return send_file(
            zip_path,
            as_attachment=True,
            download_name=f"{task['output_filename'].replace('.tex', '')}_media.zip",
            mimetype='application/zip'
        )

    except Exception as e:
        return jsonify({'error': f'Media download failed: {str(e)}'}), 500

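# Note: the media ZIP is rebuilt on every request to this endpoint; the
# archive (and the media directory) are removed by DELETE /api/cleanup/<task_id>.
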
@app.route('/api/download-complete/<task_id>', methods=['GET'])
def download_complete_package(task_id):
    """Download complete package (LaTeX + media) as a ZIP archive"""
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        if task['status'] != 'completed':
            return jsonify({'error': 'Conversion not completed'}), 400

        if not os.path.exists(task['output_path']):
            return jsonify({'error': 'Output file not found'}), 404

        # Create a temporary directory for the complete package
        base_name = task['output_filename'].replace('.tex', '')

        with tempfile.TemporaryDirectory() as temp_dir:
            package_dir = os.path.join(temp_dir, base_name)
            os.makedirs(package_dir, exist_ok=True)

            # Copy and fix LaTeX file for Overleaf compatibility
            latex_dest = os.path.join(package_dir, task['output_filename'])

            # Read the original LaTeX file
            with open(task['output_path'], 'r', encoding='utf-8') as f:
                latex_content = f.read()

            # Fix image paths to use relative paths suitable for Overleaf.
            # Converts paths like task_id_media/media/image.png -> media/image.png
            import re

            # Fix paths with task IDs
            latex_content = re.sub(
                r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[a-f0-9\-]+_media[/\\]media[/\\]([^{}]+)\}',
                r'\\includegraphics\1{media/\2}',
                latex_content
            )

            # Fix any remaining absolute paths
            latex_content = re.sub(
                r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[/\\]media[/\\]([^{}]+)\}',
                r'\\includegraphics\1{media/\2}',
                latex_content
            )

            # Write the fixed LaTeX file
            with open(latex_dest, 'w', encoding='utf-8') as f:
                f.write(latex_content)

            # Copy media files if they exist
            if task.get('media_path') and os.path.exists(task['media_path']):
                media_dest = os.path.join(package_dir, 'media')

                # Check if there's a nested media folder structure
                inner_media = os.path.join(task['media_path'], 'media')
                if os.path.exists(inner_media):
                    # Copy from the inner media folder to avoid media/media/ nesting
                    shutil.copytree(inner_media, media_dest)
                else:
                    # Copy the media_path directly if no nesting
                    shutil.copytree(task['media_path'], media_dest)

            # Create README file
            readme_content = f"""# {base_name} - DOCX to LaTeX Conversion

## Package Contents:

1. **{task['output_filename']}** - Main LaTeX file
2. **media/** - Images and media files (if any)

## How to Use:

### For Overleaf:
1. Upload all files to a new Overleaf project
2. Set main file: {task['output_filename']}
3. Compile the project

### Local Compilation:
```bash
# Basic compilation
pdflatex {task['output_filename']}

# For bibliography and cross-references
pdflatex {task['output_filename']}
bibtex {task['output_filename'].replace('.tex', '')}
pdflatex {task['output_filename']}
pdflatex {task['output_filename']}
```

## Features:
- Enhanced formatting preservation
- Overleaf compatibility
- Automatic image path fixing
- Unicode character conversion
- Mathematical expression optimization

## Generated by:
DOCX to LaTeX Web Converter
https://github.com/your-username/docx-to-latex
"""

            readme_path = os.path.join(package_dir, 'README.txt')
            with open(readme_path, 'w', encoding='utf-8') as f:
                f.write(readme_content)

            # Create ZIP file
            archive_base = os.path.join(temp_dir, f"{base_name}_complete")
            zip_path = shutil.make_archive(archive_base, 'zip', package_dir)

            # send_file opens the archive before the TemporaryDirectory is
            # cleaned up, so the open handle keeps the data readable while the
            # response streams (POSIX file semantics).
            return send_file(
                zip_path,
                as_attachment=True,
                download_name=f"{base_name}_complete.zip",
                mimetype='application/zip'
            )

    except Exception as e:
        return jsonify({'error': f'Complete package download failed: {str(e)}'}), 500

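# Path-rewrite example for the substitutions above (the task-ID prefix is
# shortened for illustration):
#   \includegraphics[width=\textwidth]{1a2b3c_media/media/image1.png}
#   becomes
#   \includegraphics[width=\textwidth]{media/image1.png}
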
@app.route('/api/status/<task_id>', methods=['GET'])
def get_task_status(task_id):
    """Get conversion task status"""
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        response_data = {
            'task_id': task_id,
            'status': task['status'],
            'original_filename': task['original_filename'],
            'output_filename': task.get('output_filename', ''),
        }

        if task['status'] == 'completed':
            response_data['message'] = task.get('conversion_message', 'Conversion completed successfully')
            response_data['has_media'] = bool(task.get('media_path') and os.path.exists(task['media_path']))
        elif task['status'] == 'failed':
            response_data['error'] = task.get('error_message', 'Conversion failed')

        return jsonify(response_data)

    except Exception as e:
        return jsonify({'error': f'Status check failed: {str(e)}'}), 500

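# Example response for a completed task:
#   {"task_id": "...", "status": "completed", "original_filename": "document.docx",
#    "output_filename": "document.tex", "message": "...", "has_media": true}
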
@app.route('/api/cleanup/<task_id>', methods=['DELETE'])
def cleanup_task(task_id):
    """Clean up task files"""
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        # Remove uploaded file
        if os.path.exists(task['file_path']):
            os.remove(task['file_path'])

        # Remove output file
        if task.get('output_path') and os.path.exists(task['output_path']):
            os.remove(task['output_path'])

        # Remove media directory and its ZIP archive, if they exist
        # (guarding on media_path avoids a TypeError when it is None)
        media_path = task.get('media_path')
        if media_path:
            if os.path.exists(media_path):
                shutil.rmtree(media_path)
            media_zip = media_path + '.zip'
            if os.path.exists(media_zip):
                os.remove(media_zip)

        # Remove task from memory
        del conversion_tasks[task_id]

        return jsonify({'message': 'Task cleaned up successfully'})

    except Exception as e:
        return jsonify({'error': f'Cleanup failed: {str(e)}'}), 500

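# Example cleanup call:
#   curl -X DELETE http://localhost:5000/api/cleanup/<task_id>
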
@app.route('/api/tasks', methods=['GET'])
def list_tasks():
    """List all conversion tasks (for debugging)"""
    try:
        tasks_summary = {}
        for task_id, task in conversion_tasks.items():
            tasks_summary[task_id] = {
                'status': task['status'],
                'original_filename': task['original_filename'],
                'output_filename': task.get('output_filename', ''),
                'created_at': task.get('created_at', 0)
            }

        return jsonify(tasks_summary)

    except Exception as e:
        return jsonify({'error': f'Failed to list tasks: {str(e)}'}), 500

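# Note: as the docstring says, this endpoint is a debugging aid; it is
# unauthenticated and lists metadata for every task held in memory.
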
# Cleanup old files on startup
def cleanup_old_files():
    """Remove old temporary files"""
    try:
        import time
        current_time = time.time()
        cutoff_time = current_time - (24 * 60 * 60)  # 24 hours ago

        for folder in [UPLOAD_FOLDER, OUTPUT_FOLDER]:
            if os.path.exists(folder):
                for filename in os.listdir(folder):
                    file_path = os.path.join(folder, filename)
                    if os.path.isfile(file_path):
                        file_time = os.path.getctime(file_path)
                        if file_time < cutoff_time:
                            os.remove(file_path)
                    elif os.path.isdir(file_path):
                        dir_time = os.path.getctime(file_path)
                        if dir_time < cutoff_time:
                            shutil.rmtree(file_path)
    except Exception as e:
        print(f"Warning: Failed to cleanup old files: {e}")

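# Caveat: on Linux, os.path.getctime() returns the inode change time rather
# than a true creation time, but it still serves as an age proxy here.
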
if __name__ == '__main__':
    # Cleanup old files on startup
    cleanup_old_files()

    # Run the Flask development server. The Docker/Spaces deployment starts
    # the service through app.py on port 7860 instead.
    print("Starting DOCX to LaTeX API server...")
    print("API endpoints:")
    print("  POST   /api/upload                      - Upload DOCX file")
    print("  POST   /api/convert                     - Convert to LaTeX")
    print("  GET    /api/download/<task_id>          - Download LaTeX file")
    print("  GET    /api/download-media/<task_id>    - Download media files")
    print("  GET    /api/download-complete/<task_id> - Download LaTeX + media package")
    print("  GET    /api/status/<task_id>            - Get conversion status")
    print("  GET    /api/tasks                       - List tasks (debugging)")
    print("  DELETE /api/cleanup/<task_id>           - Cleanup task files")
    print("  GET    /api/health                      - Health check")

    app.run(debug=True, host='0.0.0.0', port=5000)
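
For reference, a minimal end-to-end client sketch for the API above. It assumes the development server is running locally on port 5000 and that a `document.docx` exists in the working directory (both assumptions are illustrative); the endpoint paths and JSON fields come from `web_api.py` itself:

```python
# Hypothetical client walkthrough: upload -> convert -> download -> cleanup.
# BASE and document.docx are assumptions for illustration, not part of the API.
import requests

BASE = 'http://localhost:5000/api'

# 1. Upload the DOCX file; the response carries the task ID for later calls.
with open('document.docx', 'rb') as f:
    upload = requests.post(f'{BASE}/upload', files={'file': f}).json()
task_id = upload['task_id']

# 2. Convert with the server-side defaults (pass "options" to override them).
result = requests.post(f'{BASE}/convert', json={'task_id': task_id}).json()

# 3. Fetch the generated LaTeX source if the conversion succeeded.
if result.get('status') == 'completed':
    resp = requests.get(f'{BASE}/download/{task_id}')
    with open(result['output_filename'], 'wb') as out:
        out.write(resp.content)
else:
    print('Conversion failed:', result.get('error'))

# 4. Remove the server-side files and the in-memory task entry.
requests.delete(f'{BASE}/cleanup/{task_id}')
```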