new extract text from docx
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ from search_errors_logic import check_text
|
|
| 6 |
import html
|
| 7 |
import docx
|
| 8 |
from io import BytesIO
|
|
|
|
| 9 |
|
| 10 |
is_java_installed = False
|
| 11 |
prompt = """
|
|
@@ -73,12 +74,86 @@ def load_assets():
|
|
| 73 |
return tool
|
| 74 |
|
| 75 |
|
| 76 |
-
def extract_text_from_docx(file):
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
|
| 84 |
def main():
|
|
|
|
| 6 |
import html
|
| 7 |
import docx
|
| 8 |
from io import BytesIO
|
| 9 |
+
import re
|
| 10 |
|
| 11 |
is_java_installed = False
|
| 12 |
prompt = """
|
|
|
|
| 74 |
return tool
|
| 75 |
|
| 76 |
|
| 77 |
+
# def extract_text_from_docx(file):
|
| 78 |
+
# doc = docx.Document(BytesIO(file.getvalue()))
|
| 79 |
+
# full_text = []
|
| 80 |
+
# for para in doc.paragraphs:
|
| 81 |
+
# full_text.append(para.text)
|
| 82 |
+
# return "\n".join(full_text)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def extract_text_from_docx(uploaded_file):
    """Extract the text of a DOCX file as Markdown with basic formatting.

    Headings, bold/italic/underlined runs, bullet and numbered lists,
    "Code"-styled paragraphs and tables are converted to their Markdown
    counterparts. Empty paragraphs are skipped.

    Args:
        uploaded_file: Anything ``docx.Document`` accepts; the original
            docstring names the object returned by ``st.file_uploader()``.

    Returns:
        str: The document text in Markdown format, lines joined with "\\n".
    """
    doc = docx.Document(uploaded_file)
    markdown_lines = []

    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue

        # Guard against a missing style or style name instead of raising
        # AttributeError on ``None``.
        style_obj = paragraph.style
        style_name = (style_obj.name if style_obj is not None else None) or ""
        style = style_name.lower()

        # Headings
        if 'heading' in style:
            markdown_lines.append(f"{'#' * _heading_level(style)} {text}")
            continue

        # Inline formatting inside the paragraph
        formatted_text = _format_runs(paragraph.runs)
        if not formatted_text.strip():
            # The paragraph has text that is not exposed through ``runs``;
            # fall back to the plain text rather than emitting an empty line.
            formatted_text = text

        # Bullet lists
        if style_name.startswith('List Bullet'):
            formatted_text = f"- {formatted_text}"
        # Numbered lists
        elif style_name.startswith('List Number'):
            formatted_text = f"1. {formatted_text}"
        # Code paragraphs (when a dedicated style is used). Plain text is
        # used because emphasis markers are rendered literally in backticks.
        elif style_name.startswith('Code'):
            formatted_text = f"`{text}`"

        markdown_lines.append(formatted_text)

    # NOTE: tables are appended after all paragraphs, so their position
    # relative to the surrounding text is not preserved.
    for table in doc.tables:
        markdown_lines.extend(_table_to_markdown(table))

    return "\n".join(markdown_lines)


def _heading_level(style):
    """Return the Markdown heading level (1-6) for a lower-cased style name."""
    rest = style.partition('heading')[2].split()
    token = rest[0] if rest else ""
    if token.isascii() and token.isdigit():
        # Markdown only defines six heading levels.
        return min(max(int(token), 1), 6)
    return 1


def _format_runs(runs):
    """Join runs into Markdown, merging neighbours with equal formatting."""
    pieces = []  # [flags, text] pairs in document order
    for run in runs:
        flags = (bool(run.bold), bool(run.italic), bool(run.underline))
        if pieces and pieces[-1][0] == flags:
            # Word often splits one visual span into several runs; merging
            # them avoids output such as "**a****b**".
            pieces[-1][1] += run.text
        else:
            pieces.append([flags, run.text])
    return "".join(_emphasize(text, *flags) for flags, text in pieces)


def _emphasize(run_text, bold, italic, underline):
    """Wrap the non-blank core of *run_text* in the requested markers."""
    core = run_text.strip()
    if not core:
        # Nothing visible to emphasize; markers around blanks are noise.
        return run_text
    # Keep surrounding whitespace outside the markers: "**word **" is not
    # recognized as emphasis by Markdown renderers.
    lead = run_text[:len(run_text) - len(run_text.lstrip())]
    trail = run_text[len(run_text.rstrip()):]
    if bold:
        core = f"**{core}**"
    if italic:
        core = f"*{core}*"
    if underline:
        core = f"<u>{core}</u>"
    return f"{lead}{core}{trail}"


def _table_to_markdown(table):
    """Render a table as Markdown lines; the first row is the header."""
    rows = [[_cell_text(cell) for cell in row.cells] for row in table.rows]
    if not rows:
        # A table without rows has no header to render.
        return []
    header, *body = rows
    lines = ["\n"]  # blank line before the table
    lines.append("| " + " | ".join(header) + " |")
    lines.append("| " + " | ".join(["---"] * len(header)) + " |")
    lines.extend("| " + " | ".join(cells) + " |" for cells in body)
    lines.append("\n")  # blank line after the table
    return lines


def _cell_text(cell):
    """Return cell text that is safe to place inside a Markdown table row."""
    # Pipes would split the cell and newlines would end the row.
    return " ".join(cell.text.split()).replace("|", "\\|")
|
| 157 |
|
| 158 |
|
| 159 |
def main():
|