Spaces:

bvd757
/

checked

Sleeping

App Files Files Community

bvd757 committed on May 10

Commit

06f15f9

1 Parent(s): 5c656e8

new extract text from docx

Browse files

Files changed (1) hide show

app.py +81 -6

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ from search_errors_logic import check_text
 import html
 import docx
 from io import BytesIO
 is_java_installed = False
 prompt = """
@@ -73,12 +74,86 @@ def load_assets():
     return tool
-def extract_text_from_docx(file):
-    doc = docx.Document(BytesIO(file.getvalue()))
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
-    return "\n".join(full_text)
 def main():

 import html
 import docx
 from io import BytesIO
+import re
 is_java_installed = False
 prompt = """
     return tool
+# def extract_text_from_docx(file):
+#     doc = docx.Document(BytesIO(file.getvalue()))
+#     full_text = []
+#     for para in doc.paragraphs:
+#         full_text.append(para.text)
+#     return "\n".join(full_text)
+def extract_text_from_docx(uploaded_file):
+    """
+    Извлекает текст из DOCX файла с сохранением базового форматирования в Markdown
+    Параметры:
+    uploaded_file: UploadedFile - файл, загруженный через st.file_uploader()
+    Возвращает:
+    str: Текст в формате Markdown
+    """
+    doc = docx.Document(uploaded_file)
+    markdown_lines = []
+    for paragraph in doc.paragraphs:
+        text = paragraph.text.strip()
+        if not text:
+            continue
+        # Обработка стилей
+        style = paragraph.style.name.lower()
+        # Заголовки
+        if 'heading' in style:
+            level = 1
+            if 'heading 1' in style:
+                level = 1
+            elif 'heading 2' in style:
+                level = 2
+            elif 'heading 3' in style:
+                level = 3
+            markdown_lines.append(f"{'#' * level} {text}")
+            continue
+        # Обработка форматирования внутри абзаца
+        formatted_text = ""
+        for run in paragraph.runs:
+            run_text = run.text
+            if run.bold:
+                run_text = f"**{run_text}**"
+            if run.italic:
+                run_text = f"*{run_text}*"
+            if run.underline:
+                run_text = f"<u>{run_text}</u>"
+            formatted_text += run_text
+        # Маркированные списки
+        if paragraph.style.name.startswith('List Bullet'):
+            formatted_text = f"- {formatted_text}"
+        # Нумерованные списки
+        elif paragraph.style.name.startswith('List Number'):
+            formatted_text = f"1. {formatted_text}"
+        # Блоки кода (если есть специальный стиль)
+        elif paragraph.style.name.startswith('Code'):
+            formatted_text = f"`{formatted_text}`"
+        markdown_lines.append(formatted_text)
+    # Обработка таблиц
+    for table in doc.tables:
+        markdown_lines.append("\n")  # Добавляем пустую строку перед таблицей
+        headers = [cell.text.strip() for cell in table.rows[0].cells]
+        markdown_lines.append("| " + " | ".join(headers) + " |")
+        markdown_lines.append("| " + " | ".join(["---"] * len(headers)) + " |")
+        for row in table.rows[1:]:
+            cells = [cell.text.strip() for cell in row.cells]
+            markdown_lines.append("| " + " | ".join(cells) + " |")
+        markdown_lines.append("\n")  # Добавляем пустую строку после таблицы
+    return "\n".join(markdown_lines)
 def main():