File size: 36,487 Bytes
a469ee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
import pypandoc
import os
import re
import tempfile

# Lua filter written to a temporary file and handed to Pandoc when
# ``preserve_linebreaks`` is enabled.  It turns every SoftBreak inside a
# paragraph into a hard LineBreak and drops ``style`` attributes on Span/Div
# elements whose value mentions "background" or "highlight".
_LINEBREAK_LUA_FILTER = '''
function Para(elem)
  -- Preserve all line breaks exactly as they appear in Word
  -- This maintains Word's original pagination and formatting
  local new_content = {}

  for i, item in ipairs(elem.content) do
    if item.t == "SoftBreak" then
      -- Convert all soft breaks to line breaks to match Word's formatting
      table.insert(new_content, pandoc.LineBreak())
    else
      table.insert(new_content, item)
    end
  end

  elem.content = new_content
  return elem
end

function LineBlock(elem)
  -- Preserve line blocks exactly as they are
  return elem
end

function Span(elem)
  -- Remove unwanted highlighting and formatting
  if elem.attributes and elem.attributes.style then
    -- Remove background colors and highlighting
    local style = elem.attributes.style
    if string.find(style, "background") or string.find(style, "highlight") then
      elem.attributes.style = nil
    end
  end
  return elem
end

function Div(elem)
  -- Remove unwanted div formatting that causes highlighting
  if elem.attributes and elem.attributes.style then
    local style = elem.attributes.style
    if string.find(style, "background") or string.find(style, "highlight") then
      elem.attributes.style = nil
    end
  end
  return elem
end

function RawBlock(elem)
  -- Preserve raw LaTeX blocks
  if elem.format == "latex" then
    return elem
  end
end
'''


def convert_docx_to_latex(
    docx_path: str,
    latex_path: str,
    generate_toc: bool = False,
    extract_media_to_path: str = None,
    latex_template_path: str = None,
    overleaf_compatible: bool = False,
    preserve_styles: bool = True,
    preserve_linebreaks: bool = True
) -> tuple[bool, str]:
    """
    Converts a DOCX file to a LaTeX file using pypandoc with enhanced features.

    The Pandoc command line is assembled from the flags below, the conversion
    is run, and the generated file is then rewritten in place by
    ``_apply_post_processing``.  Failures are reported through the return
    value rather than raised.

    Args:
        docx_path: Path to the input .docx file.
        latex_path: Path to save the output .tex file.
        generate_toc: If True, passes ``--toc`` to Pandoc.
        extract_media_to_path: If specified, path to extract media to (e.g., "./media").
        latex_template_path: If specified, path to a custom Pandoc LaTeX template file.
            A path that is not an existing file is ignored and Pandoc's
            default template is used.
        overleaf_compatible: If True, makes images work in Overleaf with relative paths.
        preserve_styles: If True, preserves document styles like centering and alignment.
        preserve_linebreaks: If True, preserves line breaks and proper list formatting.

    Returns:
        A tuple (success: bool, message: str).
    """
    # Always produce a standalone document (not a fragment).
    extra_args = ["--standalone"]

    # Basic options
    if generate_toc:
        extra_args.append("--toc")
    if extract_media_to_path:
        extra_args.append(f"--extract-media={extract_media_to_path}")
    # Only forward the template when the file exists; otherwise the option is
    # omitted entirely, so Pandoc never sees (or reports) the missing path.
    if latex_template_path and os.path.isfile(latex_template_path):
        extra_args.append(f"--template={latex_template_path}")

    # Enhanced features
    if overleaf_compatible:
        extra_args.extend([
            "--resource-path=./",
            "--default-image-extension=png"
        ])

    if preserve_styles:
        extra_args.extend([
            "--from=docx+styles",
            "--wrap=preserve",
            "--columns=72",
            "--strip-comments"  # Remove comments that might cause highlighting
        ])

    lua_filter_path = None
    if preserve_linebreaks:
        extra_args.extend([
            "--preserve-tabs",
            "--wrap=preserve",
            # NOTE(review): Pandoc documents --reference-doc as a style
            # reference for docx/ODT/pptx *output*; confirm it has any effect
            # when the target format is LaTeX.
            "--reference-doc=" + docx_path
        ])

        # delete=False: the file must outlive this ``with`` so Pandoc can read
        # it by name; it is removed in the ``finally`` clause below.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.lua', delete=False) as f:
            f.write(_LINEBREAK_LUA_FILTER)
            lua_filter_path = f.name

        extra_args.append(f"--lua-filter={lua_filter_path}")

    try:
        # Perform conversion
        pypandoc.convert_file(docx_path, 'latex', outputfile=latex_path, extra_args=extra_args)

        # Apply post-processing enhancements (always applied for Unicode conversion)
        _apply_post_processing(latex_path, overleaf_compatible, preserve_styles, preserve_linebreaks, extract_media_to_path)

        # Generate status message
        enabled_features = (
            (overleaf_compatible, "Overleaf compatibility"),
            (preserve_styles, "style preservation"),
            (preserve_linebreaks, "line break preservation"),
        )
        enhancements = [label for enabled, label in enabled_features if enabled]
        enhancement_msg = f" with {', '.join(enhancements)}" if enhancements else ""

        return True, f"Conversion successful{enhancement_msg}!"

    except RuntimeError as e:
        return False, f"RuntimeError: Could not execute Pandoc. Please ensure Pandoc is installed and in your system's PATH. Error: {e}"
    except Exception as e:
        return False, f"Conversion failed: {e}"
    finally:
        # Single cleanup point for the temporary Lua filter; runs on success,
        # on handled failures and on anything that propagates.
        if lua_filter_path is not None:
            try:
                os.unlink(lua_filter_path)
            except OSError:
                # Best effort: a leftover temp file must not mask the result.
                pass

def _apply_post_processing(latex_path: str, overleaf_compatible: bool, preserve_styles: bool, preserve_linebreaks: bool, extract_media_to_path: str = None):
    """
    Rewrite the LaTeX file at ``latex_path`` in place after running the clean-up passes.

    The file is read as UTF-8, passed through a fixed sequence of string
    transformations (some gated by the boolean flags) and written back.
    Any exception raised along the way is caught and reported with a printed
    warning, so the file on disk is only replaced when every pass succeeded.
    """
    try:
        with open(latex_path, 'r', encoding='utf-8') as source:
            tex = source.read()

        # Passes that run regardless of the flags: package injection, removal
        # of duplicated plain-text formulas, then the two Unicode clean-ups.
        for always_on_pass in (
            _inject_essential_packages,
            _fix_mixed_mathematical_expressions,
            _convert_unicode_math_characters,
            _additional_unicode_cleanup,
        ):
            tex = always_on_pass(tex)

        # Flag-gated passes, in the same order as the flags are documented.
        if overleaf_compatible:
            tex = _fix_image_paths_for_overleaf(tex, extract_media_to_path)

        if preserve_styles:
            tex = _inject_latex_packages(tex)
            tex = _add_centering_commands(tex)

        if preserve_linebreaks:
            tex = _fix_line_breaks_and_spacing(tex)

        # Final unconditional passes: strip highlighting, then patch up
        # common compilation problems.
        for closing_pass in (_remove_unwanted_formatting, _fix_compilation_issues):
            tex = closing_pass(tex)

        with open(latex_path, 'w', encoding='utf-8') as target:
            target.write(tex)

    except Exception as error:
        # Post-processing failures shouldn't break the conversion
        print(f"Warning: Post-processing failed: {error}")

def _inject_essential_packages(content: str) -> str:
    """
    Inject essential packages that are always needed for compilation.
    """
    # Core packages that Pandoc might not include but are often needed
    essential_packages = [
        r'\usepackage[utf8]{inputenc}',  # UTF-8 input encoding
        r'\usepackage[T1]{fontenc}',     # Font encoding
        r'\usepackage{graphicx}',        # For images
        r'\usepackage{longtable}',       # For tables
        r'\usepackage{booktabs}',        # Better table formatting
        r'\usepackage{hyperref}',        # For links (if not already included)
        r'\usepackage{amsmath}',         # Mathematical formatting
        r'\usepackage{amssymb}',         # Mathematical symbols
        r'\usepackage{textcomp}',        # Additional text symbols
    ]
    
    documentclass_pattern = r'\\documentclass(?:\[[^\]]*\])?\{[^}]+\}'
    documentclass_match = re.search(documentclass_pattern, content)
    
    if documentclass_match:
        insert_pos = documentclass_match.end()
        
        packages_to_insert = []
        for package in essential_packages:
            package_name = package.split('{')[1].split('}')[0].split(']')[0]  # Extract package name
            if f'usepackage' not in content or package_name not in content:
                packages_to_insert.append(package)
        
        if packages_to_insert:
            package_block = '\n% Essential packages for compilation\n' + '\n'.join(packages_to_insert) + '\n'
            content = content[:insert_pos] + package_block + content[insert_pos:]
        
        # Add Unicode character definitions to handle any remaining problematic characters
        unicode_definitions = r'''
% Unicode character definitions for LaTeX compatibility
\DeclareUnicodeCharacter{2003}{ }  % Em space
\DeclareUnicodeCharacter{2002}{ }  % En space
\DeclareUnicodeCharacter{2009}{ }  % Thin space
\DeclareUnicodeCharacter{200A}{ }  % Hair space
\DeclareUnicodeCharacter{2004}{ }  % Three-per-em space
\DeclareUnicodeCharacter{2005}{ }  % Four-per-em space
\DeclareUnicodeCharacter{2006}{ }  % Six-per-em space
\DeclareUnicodeCharacter{2008}{ }  % Punctuation space
\DeclareUnicodeCharacter{202F}{ }  % Narrow no-break space
\DeclareUnicodeCharacter{2212}{-}  % Unicode minus sign
\DeclareUnicodeCharacter{2010}{-}  % Hyphen
\DeclareUnicodeCharacter{2011}{-}  % Non-breaking hyphen
\DeclareUnicodeCharacter{2013}{--} % En dash
\DeclareUnicodeCharacter{2014}{---}% Em dash
'''
        
        # Insert Unicode definitions after packages but before \begin{document}
        begin_doc_match = re.search(r'\\begin\{document\}', content)
        if begin_doc_match:
            insert_pos_unicode = begin_doc_match.start()
            content = content[:insert_pos_unicode] + unicode_definitions + '\n' + content[insert_pos_unicode:]
    
    return content

def _convert_unicode_math_characters(content: str) -> str:
    """
    Convert Unicode mathematical characters to their LaTeX equivalents.
    """
    # Dictionary of Unicode characters to LaTeX commands
    unicode_to_latex = {
        # Mathematical operators
        'Δ': r'$\Delta$',           # U+0394 - Greek capital letter delta
        'δ': r'$\delta$',           # U+03B4 - Greek small letter delta
        '∑': r'$\sum$',             # U+2211 - N-ary summation
        '∏': r'$\prod$',            # U+220F - N-ary product
        '∫': r'$\int$',             # U+222B - Integral
        '∂': r'$\partial$',         # U+2202 - Partial differential
        '∇': r'$\nabla$',           # U+2207 - Nabla
        '√': r'$\sqrt{}$',          # U+221A - Square root
        '∞': r'$\infty$',           # U+221E - Infinity
        
        # Relations and equality
        '≈': r'$\approx$',          # U+2248 - Almost equal to
        '≠': r'$\neq$',             # U+2260 - Not equal to
        '≤': r'$\leq$',             # U+2264 - Less-than or equal to
        '≥': r'$\geq$',             # U+2265 - Greater-than or equal to
        '±': r'$\pm$',              # U+00B1 - Plus-minus sign
        '∓': r'$\mp$',              # U+2213 - Minus-or-plus sign
        '×': r'$\times$',           # U+00D7 - Multiplication sign
        '÷': r'$\div$',             # U+00F7 - Division sign
        '⋅': r'$\cdot$',            # U+22C5 - Dot operator
        
        # Set theory and logic
        '∈': r'$\in$',              # U+2208 - Element of
        '∉': r'$\notin$',           # U+2209 - Not an element of
        '⊂': r'$\subset$',          # U+2282 - Subset of
        '⊃': r'$\supset$',          # U+2283 - Superset of
        '⊆': r'$\subseteq$',        # U+2286 - Subset of or equal to
        '⊇': r'$\supseteq$',        # U+2287 - Superset of or equal to
        '∪': r'$\cup$',             # U+222A - Union
        '∩': r'$\cap$',             # U+2229 - Intersection
        '∅': r'$\emptyset$',        # U+2205 - Empty set
        '∀': r'$\forall$',          # U+2200 - For all
        '∃': r'$\exists$',          # U+2203 - There exists
        
        # Special symbols
        '∣': r'$|$',                # U+2223 - Divides
        '∥': r'$\parallel$',        # U+2225 - Parallel to
        '⊥': r'$\perp$',            # U+22A5 - Up tack (perpendicular)
        '∠': r'$\angle$',           # U+2220 - Angle
        '°': r'$^\circ$',           # U+00B0 - Degree sign
        
        # Arrows
        '→': r'$\rightarrow$',      # U+2192 - Rightwards arrow
        '←': r'$\leftarrow$',       # U+2190 - Leftwards arrow
        '↔': r'$\leftrightarrow$',  # U+2194 - Left right arrow
        '⇒': r'$\Rightarrow$',      # U+21D2 - Rightwards double arrow
        '⇐': r'$\Leftarrow$',       # U+21D0 - Leftwards double arrow
        '⇔': r'$\Leftrightarrow$',  # U+21D4 - Left right double arrow
        
        # Accents and diacritics
        'ˉ': r'$\bar{}$',           # U+02C9 - Modifier letter macron
        'ˆ': r'$\hat{}$',           # U+02C6 - Modifier letter circumflex accent
        'ˇ': r'$\check{}$',         # U+02C7 - Caron
        '˜': r'$\tilde{}$',         # U+02DC - Small tilde
        '˙': r'$\dot{}$',           # U+02D9 - Dot above
        '¨': r'$\ddot{}$',          # U+00A8 - Diaeresis
        
        # Special minus and spaces - using explicit Unicode escape sequences
        '−': r'-',                  # U+2212 - Minus sign (convert to regular hyphen)
        '\u2003': r' ',             # U+2003 - Em space (convert to regular space)
        '\u2009': r' ',             # U+2009 - Thin space (convert to regular space)
        '\u2002': r' ',             # U+2002 - En space (convert to regular space)
        '\u2004': r' ',             # U+2004 - Three-per-em space
        '\u2005': r' ',             # U+2005 - Four-per-em space
        '\u2006': r' ',             # U+2006 - Six-per-em space
        '\u2008': r' ',             # U+2008 - Punctuation space
        '\u200A': r' ',             # U+200A - Hair space
        '\u202F': r' ',             # U+202F - Narrow no-break space
        
        # Greek letters (commonly used in math)
        'α': r'$\alpha$',           # U+03B1
        'β': r'$\beta$',            # U+03B2
        'γ': r'$\gamma$',           # U+03B3
        'Γ': r'$\Gamma$',           # U+0393
        'ε': r'$\varepsilon$',      # U+03B5
        'ζ': r'$\zeta$',            # U+03B6
        'η': r'$\eta$',             # U+03B7
        'θ': r'$\theta$',           # U+03B8
        'Θ': r'$\Theta$',           # U+0398
        'ι': r'$\iota$',            # U+03B9
        'κ': r'$\kappa$',           # U+03BA
        'λ': r'$\lambda$',          # U+03BB
        'Λ': r'$\Lambda$',          # U+039B
        'μ': r'$\mu$',              # U+03BC
        'ν': r'$\nu$',              # U+03BD
        'ξ': r'$\xi$',              # U+03BE
        'Ξ': r'$\Xi$',              # U+039E
        'π': r'$\pi$',              # U+03C0
        'Π': r'$\Pi$',              # U+03A0
        'ρ': r'$\rho$',             # U+03C1
        'σ': r'$\sigma$',           # U+03C3
        'Σ': r'$\Sigma$',           # U+03A3
        'τ': r'$\tau$',             # U+03C4
        'υ': r'$\upsilon$',         # U+03C5
        'Υ': r'$\Upsilon$',         # U+03A5
        'φ': r'$\varphi$',          # U+03C6
        'Φ': r'$\Phi$',             # U+03A6
        'χ': r'$\chi$',             # U+03C7
        'ψ': r'$\psi$',             # U+03C8
        'Ψ': r'$\Psi$',             # U+03A8
        'ω': r'$\omega$',           # U+03C9
        'Ω': r'$\Omega$',           # U+03A9
    }
    
    # Apply conversions
    for unicode_char, latex_cmd in unicode_to_latex.items():
        if unicode_char in content:
            content = content.replace(unicode_char, latex_cmd)
    
    # Additional aggressive Unicode space cleanup using regex
    # Handle various Unicode spaces more comprehensively
    content = re.sub(r'[\u2000-\u200F\u2028-\u202F\u205F\u3000]', ' ', content)  # All Unicode spaces
    
    # Handle specific problematic Unicode characters that might not be in our dictionary
    content = re.sub(r'[\u2010-\u2015]', '-', content)  # Various Unicode dashes
    content = re.sub(r'[\u2212]', '-', content)         # Unicode minus sign
    
    # Handle specific cases where characters might appear in math environments
    # Fix double math mode (e.g., $\alpha$ inside already math mode)
    content = re.sub(r'\$\$([^$]+)\$\$', r'$\1$', content)  # Convert display math to inline
    content = re.sub(r'\$\$([^$]*)\$([^$]*)\$\$', r'$\1\2$', content)  # Fix broken math
    
    # Fix bar notation that might have been broken
    content = re.sub(r'\$\\bar\{\}\$([a-zA-Z])', r'$\\bar{\1}$', content)
    content = re.sub(r'([a-zA-Z])\$\\bar\{\}\$', r'$\\bar{\1}$', content)
    
    return content

def _additional_unicode_cleanup(content: str) -> str:
    """
    Additional aggressive Unicode cleanup to handle any characters that slip through.
    """
    # Convert all common problematic Unicode spaces to regular spaces
    # This covers a wider range than the dictionary approach
    unicode_spaces = [
        '\u00A0',  # Non-breaking space
        '\u1680',  # Ogham space mark
        '\u2000',  # En quad
        '\u2001',  # Em quad
        '\u2002',  # En space
        '\u2003',  # Em space
        '\u2004',  # Three-per-em space
        '\u2005',  # Four-per-em space
        '\u2006',  # Six-per-em space
        '\u2007',  # Figure space
        '\u2008',  # Punctuation space
        '\u2009',  # Thin space
        '\u200A',  # Hair space
        '\u200B',  # Zero width space
        '\u202F',  # Narrow no-break space
        '\u205F',  # Medium mathematical space
        '\u3000',  # Ideographic space
    ]
    
    for unicode_space in unicode_spaces:
        content = content.replace(unicode_space, ' ')
    
    # Convert Unicode dashes
    unicode_dashes = [
        '\u2010',  # Hyphen
        '\u2011',  # Non-breaking hyphen
        '\u2012',  # Figure dash
        '\u2013',  # En dash
        '\u2014',  # Em dash
        '\u2015',  # Horizontal bar
        '\u2212',  # Minus sign
    ]
    
    for unicode_dash in unicode_dashes:
        if unicode_dash in ['\u2013', '\u2014']:  # En and Em dashes
            content = content.replace(unicode_dash, '--')
        else:
            content = content.replace(unicode_dash, '-')
    
    # Use regex for any remaining problematic characters
    # Remove or replace any remaining Unicode characters that commonly cause issues
    content = re.sub(r'[\u2000-\u200F\u2028-\u202F\u205F\u3000]', ' ', content)
    content = re.sub(r'[\u2010-\u2015\u2212]', '-', content)
    
    return content

def _fix_mixed_mathematical_expressions(content: str) -> str:
    """
    Removes duplicated plain-text versions of mathematical expressions
    that Pandoc sometimes generates alongside the LaTeX version by deleting
    the plain text part when it is immediately followed by the LaTeX part.
    """
    
    processed_content = content

    # A list of compiled regex patterns.
    # Each pattern matches a plain-text formula but only if it's followed
    # by its corresponding LaTeX version (using a positive lookahead).
    patterns_to_remove = [
        # Pattern for: hq,k=x[nq,k]...h_{q,k} = x[n_{q,k}]...
        re.compile(r'h[qrs],k=x\[n[qrs],k\](?:,h[qrs],k=x\[n[qrs],k\])*\s*' +
                   r'(?=h_{q,k}\s*=\s*x\\\[n_{q,k}\\\],)', re.UNICODE),

        # Pattern for: ∆hq,r,k=hq,k-hr,k...\Delta h_{q,r,k} = ...
        re.compile(r'(?:∆h[qrs],[qrs],k=h[qrs],k-h[qrs],k\s*)+' +
                   r'(?=\\Delta\s*h_{q,r,k})', re.UNICODE),

        # Pattern for: RRk=tr,k+1-tr,kRR_k = ...
        re.compile(r'RRk=tr,k\+1-tr,k\s*' +
                   r'(?=RR_k\s*=\s*t_{r,k\+1})', re.UNICODE),

        # Pattern for: Tmed=median{RRk}T_{\mathrm{med}}
        re.compile(r'Tmed=median\{RRk\}\s*' +
                   r'(?=T_{\\mathrm{med}}\s*=\s*\\mathrm{median}\\{RR_k\\})', re.UNICODE),

        # Pattern for: Tk=[tr,k-Tmed2, tr,k+Tmed2]\mathcal{T}_k
        re.compile(r'Tk=\[tr,k-Tmed2,.*?tr,k\+Tmed2\]\s*' +
                   r'(?=\\mathcal\{T\}_k\s*=\s*\\\[t_{r,k})', re.UNICODE | re.DOTALL),

        # Pattern for: h¯k=1|Ik|∑n∈Ikx[n]\bar h_k
        re.compile(r'h¯k=1\|Ik\|∑n∈Ikx\[n\]\s*' +
                   r'(?=\\bar\s*h_k\s*=\s*\\frac)', re.UNICODE),

        # Pattern for: Mrs=median{∆hr,s,k}M_{rs}
        re.compile(r'Mrs=median\{∆hr,s,k\}\s*' +
                   r'(?=M_{rs}\s*=\s*\\mathrm{median})', re.UNICODE),
        
        # Pattern for: ∆h¯k=h¯k-Mrs\Delta\bar h_k
        re.compile(r'∆h¯k=h¯k-Mrs\s*' +
                   r'(?=\\Delta\\bar\s*h_k\s*=\s*\\bar\s*h_k)', re.UNICODE),
    ]

    for pattern in patterns_to_remove:
        processed_content = pattern.sub('', processed_content)
    
    return processed_content

def _fix_compilation_issues(content: str) -> str:
    """
    Fix common LaTeX compilation issues.
    """
    # Fix \tightlist command if not defined
    if r'\tightlist' in content and r'\providecommand{\tightlist}' not in content:
        tightlist_def = r'''
% Define \tightlist command for lists
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
'''
        # Insert after packages but before \begin{document}
        begin_doc_match = re.search(r'\\begin\{document\}', content)
        if begin_doc_match:
            insert_pos = begin_doc_match.start()
            content = content[:insert_pos] + tightlist_def + '\n' + content[insert_pos:]
    
    # Fix \euro command if used but not defined
    if r'\euro' in content and r'usepackage{eurosym}' not in content:
        content = re.sub(
            r'(\\usepackage\{[^}]+\}\s*\n)',
            r'\1\\usepackage{eurosym}\n',
            content,
            count=1
        )
    
    # Fix undefined references to figures/tables
    content = re.sub(r'\\ref\{fig:([^}]+)\}', r'Figure~\\ref{fig:\1}', content)
    content = re.sub(r'\\ref\{tab:([^}]+)\}', r'Table~\\ref{tab:\1}', content)
    
    # Ensure proper figure placement
    if r'\begin{figure}' in content:
        content = re.sub(
            r'\\begin\{figure\}(?!\[)',
            r'\\begin{figure}[htbp]',
            content
        )
    
    # Ensure proper table placement  
    if r'\begin{table}' in content:
        content = re.sub(
            r'\\begin\{table\}(?!\[)',
            r'\\begin{table}[htbp]',
            content
        )
    
    return content

def _fix_image_paths_for_overleaf(content: str, extract_media_to_path: str = None) -> str:
    """
    Convert absolute image paths to relative paths for Overleaf compatibility.
    """
    if extract_media_to_path:
        # Extract the media directory name
        media_dir = os.path.basename(extract_media_to_path.rstrip('/'))
        
        # Fix paths with task IDs like: task_id_media/media/image.png -> media/image.png
        # Pattern: \includegraphics{any_path/task_id_media/media/image.ext}
        # Replace with: \includegraphics{media/image.ext}
        pattern1 = r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[a-f0-9\-]+_media[/\\]media[/\\]([^{}]+)\}'
        replacement1 = r'\\includegraphics\1{media/\2}'
        content = re.sub(pattern1, replacement1, content)
        
        # Fix paths like: task_id_media/media/image.png -> media/image.png (without includegraphics)
        pattern2 = r'[a-f0-9\-]+_media[/\\]media[/\\]'
        replacement2 = r'media/'
        content = re.sub(pattern2, replacement2, content)
        
        # Also handle regular media paths: /absolute/path/to/media/image.ext -> media/image.ext
        pattern3 = r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[/\\]' + re.escape(media_dir) + r'[/\\]([^{}]+)\}'
        replacement3 = r'\\includegraphics\1{' + media_dir + r'/\2}'
        content = re.sub(pattern3, replacement3, content)
    
    return content

def _remove_unwanted_formatting(content: str) -> str:
    """
    Remove unwanted highlighting and formatting that causes visual issues.
    """
    # Remove highlighting commands
    content = re.sub(r'\\colorbox\{[^}]*\}\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\hl\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\texthl\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\hlc\[[^\]]*\]\{([^}]*)\}', r'\1', content)
    
    # Remove table cell coloring
    content = re.sub(r'\\cellcolor\{[^}]*\}', '', content)
    content = re.sub(r'\\rowcolor\{[^}]*\}', '', content)
    content = re.sub(r'\\columncolor\{[^}]*\}', '', content)
    
    # Remove text background colors
    content = re.sub(r'\\textcolor\{[^}]*\}\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\color\{[^}]*\}', '', content)
    
    # Remove box formatting that might cause highlighting
    content = re.sub(r'\\fcolorbox\{[^}]*\}\{[^}]*\}\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\framebox\[[^\]]*\]\{([^}]*)\}', r'\1', content)
    
    # Remove soul package highlighting
    content = re.sub(r'\\sethlcolor\{[^}]*\}', '', content)
    content = re.sub(r'\\ul\{([^}]*)\}', r'\1', content)  # Remove underline if causing issues
    
    return content

def _inject_latex_packages(content: str) -> str:
    """
    Inject additional LaTeX packages needed for enhanced formatting.
    """
    # Essential packages for enhanced conversion
    essential_packages = [
        r'\usepackage{graphicx}',      # For images - ensure it's included
        r'\usepackage{longtable}',     # For tables
        r'\usepackage{booktabs}',      # Better table formatting  
        r'\usepackage{array}',         # Enhanced table formatting
        r'\usepackage{calc}',          # For calculations
        r'\usepackage{url}',           # For URLs
    ]
    
    # Style enhancement packages
    style_packages = [
        r'\usepackage{float}',         # Better float positioning
        r'\usepackage{adjustbox}',     # For centering and scaling
        r'\usepackage{caption}',       # Better caption formatting
        r'\usepackage{subcaption}',    # For subfigures
        r'\usepackage{tabularx}',      # Flexible table widths
        r'\usepackage{enumitem}',      # Better list formatting
        r'\usepackage{setspace}',      # Line spacing control
        r'\usepackage{ragged2e}',      # Better text alignment
        r'\usepackage{amsmath}',       # Mathematical formatting
        r'\usepackage{amssymb}',       # Mathematical symbols
        r'\usepackage{needspace}',     # Prevent orphaned lines and improve page breaks
    ]
    
    all_packages = essential_packages + style_packages
    
    # Find the position after \documentclass but before any existing \usepackage or \begin{document}
    documentclass_pattern = r'\\documentclass(?:\[[^\]]*\])?\{[^}]+\}'
    documentclass_match = re.search(documentclass_pattern, content)
    
    if documentclass_match:
        insert_pos = documentclass_match.end()
        
        # Find the next significant LaTeX command to insert before it
        # Look for existing \usepackage, \begin{document}, or other commands
        remaining_content = content[insert_pos:]
        next_command_match = re.search(r'\\(?:usepackage|begin\{document\}|title|author|date)', remaining_content)
        
        if next_command_match:
            insert_pos += next_command_match.start()
        
        # Check which packages are not already included
        packages_to_insert = []
        for package in all_packages:
            package_name = package.replace(r'\usepackage{', '').replace('}', '')
            if f'usepackage{{{package_name}}}' not in content:
                packages_to_insert.append(package)
        
        if packages_to_insert:
            # Add packages with proper spacing
            package_block = '\n% Enhanced conversion packages\n' + '\n'.join(packages_to_insert) + '\n\n'
            content = content[:insert_pos] + package_block + content[insert_pos:]
    
    return content

def _add_centering_commands(content: str) -> str:
    """
    Add centering commands to figures and tables.
    """
    # Add \centering to figure environments
    content = re.sub(
        r'(\\begin\{figure\}(?:\[[^\]]*\])?)\s*\n',
        r'\1\n\\centering\n',
        content
    )
    
    # Add \centering to table environments
    content = re.sub(
        r'(\\begin\{table\}(?:\[[^\]]*\])?)\s*\n',
        r'\1\n\\centering\n',
        content
    )
    
    return content

def _fix_line_breaks_and_spacing(content: str) -> str:
    """
    Minimal fixes to preserve Word's original formatting and pagination.
    """
    # Remove unwanted highlighting and color commands
    content = re.sub(r'\\colorbox\{[^}]*\}\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\hl\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\texthl\{([^}]*)\}', r'\1', content)
    content = re.sub(r'\\cellcolor\{[^}]*\}', '', content)
    content = re.sub(r'\\rowcolor\{[^}]*\}', '', content)
    
    # Only fix critical spacing issues that break compilation
    # Preserve Word's original line breaks and spacing as much as possible
    
    # Ensure proper spacing around lists but don't change internal spacing
    content = re.sub(r'\n\\begin\{enumerate\}\n\n', r'\n\n\\begin{enumerate}\n', content)
    content = re.sub(r'\n\n\\end\{enumerate\}\n', r'\n\\end{enumerate}\n\n', content)
    content = re.sub(r'\n\\begin\{itemize\}\n\n', r'\n\n\\begin{itemize}\n', content)
    content = re.sub(r'\n\n\\end\{itemize\}\n', r'\n\\end{itemize}\n\n', content)
    
    # Minimal section spacing - preserve Word's pagination
    content = re.sub(r'\n(\\(?:sub)*section\{[^}]+\})\n\n', r'\n\n\1\n\n', content)
    
    # Only remove excessive spacing (3+ line breaks) but preserve double breaks
    content = re.sub(r'\n\n\n+', r'\n\n', content)
    
    # Ensure proper spacing around figures and tables
    content = re.sub(r'\n\\begin\{figure\}', r'\n\n\\begin{figure}', content)
    content = re.sub(r'\\end\{figure\}\n([A-Z])', r'\\end{figure}\n\n\1', content)
    content = re.sub(r'\n\\begin\{table\}', r'\n\n\\begin{table}', content)
    content = re.sub(r'\\end\{table\}\n([A-Z])', r'\\end{table}\n\n\1', content)
    
    return content

if __name__ == '__main__':
    # Manual self-test: builds two small DOCX fixtures, runs
    # convert_docx_to_latex on them with the enhanced options, prints the
    # results of a few substring checks on the generated LaTeX, and removes
    # every generated artifact afterwards.
    from docx import Document
    from docx.shared import Inches
    from PIL import Image
    import shutil

    # --- Helper Functions for DOCX and Template Creation ---
    def create_dummy_image(filename, size=(60, 60), color="red", img_format="PNG"):
        """Write a solid-colour RGB image of the given size to *filename*."""
        img = Image.new('RGB', size, color=color)
        img.save(filename, img_format)
        print(f"Created dummy image: {filename}")

    def create_test_docx_with_styles(filename):
        """Write a DOCX with headings, a numbered list and a bullet list."""
        doc = Document()
        doc.add_heading("Document with Enhanced Features", level=1)

        # Add paragraph with text
        doc.add_paragraph("This document tests enhanced features including:")

        # Add numbered list
        doc.add_paragraph("First numbered item", style='List Number')
        doc.add_paragraph("Second numbered item", style='List Number')
        doc.add_paragraph("Third numbered item", style='List Number')

        # Add some text
        doc.add_paragraph("Here is some regular text between lists.")

        # Add bullet list
        doc.add_paragraph("First bullet point", style='List Bullet')
        doc.add_paragraph("Second bullet point", style='List Bullet')

        doc.add_heading("Image Section", level=2)
        doc.add_paragraph("Below is a test image:")

        doc.save(filename)
        print(f"Created test DOCX with styles: {filename}")

    def create_complex_docx(filename, img1_path, img2_path):
        """Write a DOCX with headings, text and the two given pictures."""
        doc = Document()
        doc.add_heading("Complex Document Title", level=1)
        doc.add_paragraph("Introduction to the complex document.")
        doc.add_heading("Image Section", level=2)
        doc.add_picture(img1_path, width=Inches(1.0))
        doc.add_paragraph("Some text after the first image.")
        doc.add_picture(img2_path, width=Inches(1.0))
        doc.add_heading("Conclusion Section", level=2)
        doc.add_paragraph("Final remarks.")
        doc.save(filename)
        print(f"Created complex DOCX: {filename}")

    # --- Test Files ---
    docx_styles = "test_enhanced_styles.docx"
    docx_complex = "test_complex_enhanced.docx"
    img1 = "dummy_img1.png"
    img2 = "dummy_img2.jpg"

    output_enhanced_test = "output_enhanced_test.tex"
    output_overleaf_test = "output_overleaf_test.tex"
    media_dir = "./media_enhanced"

    all_test_files = [docx_styles, docx_complex, img1, img2, output_enhanced_test, output_overleaf_test]
    all_test_dirs = [media_dir]

    # Everything that creates files runs inside try/finally so the generated
    # artifacts are removed even when setup or a conversion raises.
    try:
        # --- Create Test Files ---
        print("--- Setting up enhanced test files ---")
        create_dummy_image(img1, color="blue", img_format="PNG")
        create_dummy_image(img2, color="green", img_format="JPEG")
        create_test_docx_with_styles(docx_styles)
        create_complex_docx(docx_complex, img1, img2)
        print("--- Enhanced test file setup complete ---")

        # --- Test Enhanced Features ---
        print("\n--- Testing Enhanced Features ---")

        # Test 1: Style preservation and line breaks
        print("\n--- Test 1: Enhanced Style Preservation ---")
        success, msg = convert_docx_to_latex(
            docx_styles,
            output_enhanced_test,
            generate_toc=True,
            preserve_styles=True,
            preserve_linebreaks=True
        )
        print(f"Enhanced Test: {success}, Msg: {msg}")

        if success and os.path.exists(output_enhanced_test):
            with open(output_enhanced_test, 'r') as f:
                content = f.read()
            checks = {
                'packages': any(pkg in content for pkg in ['\\usepackage{float}', '\\usepackage{enumitem}']),
                'toc': '\\tableofcontents' in content,
                'sections': '\\section' in content,
                'lists': '\\begin{enumerate}' in content or '\\begin{itemize}' in content
            }
            print(f"Enhanced verification: {checks}")

        # Test 2: Overleaf compatibility with images
        print("\n--- Test 2: Overleaf Compatibility ---")
        success, msg = convert_docx_to_latex(
            docx_complex,
            output_overleaf_test,
            extract_media_to_path=media_dir,
            overleaf_compatible=True,
            preserve_styles=True,
            preserve_linebreaks=True
        )
        print(f"Overleaf Test: {success}, Msg: {msg}")

        if success and os.path.exists(output_overleaf_test):
            with open(output_overleaf_test, 'r') as f:
                content = f.read()
            media_check = 'media/' in content and '\\includegraphics' in content
            print(f"Overleaf compatibility check - relative paths: {media_check}")

            media_files_exist = os.path.exists(os.path.join(media_dir, 'media'))
            print(f"Media files extracted: {media_files_exist}")
    finally:
        # --- Cleanup ---
        # Best effort: report a failed removal and keep going.
        print("\n--- Cleaning up enhanced test files ---")
        for f_path in all_test_files:
            if os.path.exists(f_path):
                try:
                    os.remove(f_path)
                    print(f"Removed: {f_path}")
                except OSError as e:
                    print(f"Error removing {f_path}: {e}")

        for d_path in all_test_dirs:
            if os.path.isdir(d_path):
                try:
                    shutil.rmtree(d_path)
                    print(f"Removed directory: {d_path}")
                except OSError as e:
                    print(f"Error removing {d_path}: {e}")

    print("--- Enhanced testing completed ---")