mammoth
Browse files
- app.py +4 -75
- requirements.txt +1 -0
app.py
CHANGED
@@ -5,8 +5,8 @@ import subprocess
|
|
5 |
from search_errors_logic import check_text
|
6 |
import html
|
7 |
import docx
|
|
|
8 |
from io import BytesIO
|
9 |
-
import re
|
10 |
|
11 |
is_java_installed = False
|
12 |
prompt = """
|
@@ -80,80 +80,9 @@ def load_assets():
|
|
80 |
# for para in doc.paragraphs:
|
81 |
# full_text.append(para.text)
|
82 |
# return "\n".join(full_text)
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
"""
|
87 |
-
Извлекает текст из DOCX файла с сохранением базового форматирования в Markdown
|
88 |
-
|
89 |
-
Параметры:
|
90 |
-
uploaded_file: UploadedFile - файл, загруженный через st.file_uploader()
|
91 |
-
|
92 |
-
Возвращает:
|
93 |
-
str: Текст в формате Markdown
|
94 |
-
"""
|
95 |
-
doc = docx.Document(uploaded_file)
|
96 |
-
markdown_lines = []
|
97 |
-
|
98 |
-
for paragraph in doc.paragraphs:
|
99 |
-
text = paragraph.text.strip()
|
100 |
-
if not text:
|
101 |
-
continue
|
102 |
-
|
103 |
-
# Обработка стилей
|
104 |
-
style = paragraph.style.name.lower()
|
105 |
-
|
106 |
-
# Заголовки
|
107 |
-
if 'heading' in style:
|
108 |
-
level = 1
|
109 |
-
if 'heading 1' in style:
|
110 |
-
level = 1
|
111 |
-
elif 'heading 2' in style:
|
112 |
-
level = 2
|
113 |
-
elif 'heading 3' in style:
|
114 |
-
level = 3
|
115 |
-
markdown_lines.append(f"{'#' * level} {text}")
|
116 |
-
continue
|
117 |
-
|
118 |
-
# Обработка форматирования внутри абзаца
|
119 |
-
formatted_text = ""
|
120 |
-
for run in paragraph.runs:
|
121 |
-
run_text = run.text
|
122 |
-
if run.bold:
|
123 |
-
run_text = f"**{run_text}**"
|
124 |
-
if run.italic:
|
125 |
-
run_text = f"*{run_text}*"
|
126 |
-
if run.underline:
|
127 |
-
run_text = f"<u>{run_text}</u>"
|
128 |
-
formatted_text += run_text
|
129 |
-
|
130 |
-
# Маркированные списки
|
131 |
-
if paragraph.style.name.startswith('List Bullet'):
|
132 |
-
formatted_text = f"- {formatted_text}"
|
133 |
-
|
134 |
-
# Нумерованные списки
|
135 |
-
elif paragraph.style.name.startswith('List Number'):
|
136 |
-
formatted_text = f"1. {formatted_text}"
|
137 |
-
|
138 |
-
# Блоки кода (если есть специальный стиль)
|
139 |
-
elif paragraph.style.name.startswith('Code'):
|
140 |
-
formatted_text = f"`{formatted_text}`"
|
141 |
-
|
142 |
-
markdown_lines.append(formatted_text)
|
143 |
-
|
144 |
-
# Обработка таблиц
|
145 |
-
for table in doc.tables:
|
146 |
-
markdown_lines.append("\n") # Добавляем пустую строку перед таблицей
|
147 |
-
headers = [cell.text.strip() for cell in table.rows[0].cells]
|
148 |
-
markdown_lines.append("| " + " | ".join(headers) + " |")
|
149 |
-
markdown_lines.append("| " + " | ".join(["---"] * len(headers)) + " |")
|
150 |
-
|
151 |
-
for row in table.rows[1:]:
|
152 |
-
cells = [cell.text.strip() for cell in row.cells]
|
153 |
-
markdown_lines.append("| " + " | ".join(cells) + " |")
|
154 |
-
markdown_lines.append("\n") # Добавляем пустую строку после таблицы
|
155 |
-
|
156 |
-
return "\n".join(markdown_lines)
|
157 |
|
158 |
|
159 |
def main():
|
|
|
5 |
from search_errors_logic import check_text
|
6 |
import html
|
7 |
import docx
|
8 |
+
import mammoth
|
9 |
from io import BytesIO
|
|
|
10 |
|
11 |
is_java_installed = False
|
12 |
prompt = """
|
|
|
80 |
# for para in doc.paragraphs:
|
81 |
# full_text.append(para.text)
|
82 |
# return "\n".join(full_text)
|
83 |
+
def extract_text_from_docx(file):
|
84 |
+
result = mammoth.convert_to_markdown(file)
|
85 |
+
return result.value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
|
88 |
def main():
|
requirements.txt
CHANGED
@@ -14,6 +14,7 @@ httpx==0.28.1
|
|
14 |
idna==3.10
|
15 |
jiter==0.9.0
|
16 |
language_tool_python==2.9.3
|
|
|
17 |
multidict==6.4.3
|
18 |
openai==1.68.2
|
19 |
pip==25.0.1
|
|
|
14 |
idna==3.10
|
15 |
jiter==0.9.0
|
16 |
language_tool_python==2.9.3
|
17 |
+
mammoth==1.9.0
|
18 |
multidict==6.4.3
|
19 |
openai==1.68.2
|
20 |
pip==25.0.1
|