Loversofdeath committed on
Commit
a73e1ef
·
verified ·
1 Parent(s): d10e6c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -54
app.py CHANGED
@@ -1,57 +1,58 @@
1
- import gradio as gr
2
  import os
3
- import re
4
-
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain.embeddings import HuggingFaceEmbeddings
7
- from langchain.vectorstores import FAISS
8
  from langchain.chains import RetrievalQA
9
- from langchain.llms import HuggingFaceHub
10
-
11
def clean_text(text):
    """Drop every character that is not a word char, whitespace, or basic punctuation.

    Keeps Latin/Cyrillic word characters plus common punctuation marks
    (including Russian-style dashes and « » quotes).
    """
    disallowed = r"[^\w\s.,!?–—:;()«»\"'-]"
    return re.sub(disallowed, "", text, flags=re.UNICODE)
14
-
15
def load_all_lore_texts(folder="."):
    """Gather the cleaned text of every ``lore*.txt`` file in *folder* into one string.

    Files are joined with a single newline between them.
    """
    collected = []
    for name in os.listdir(folder):
        if not (name.startswith("lore") and name.endswith(".txt")):
            continue
        path = os.path.join(folder, name)
        with open(path, "r", encoding="utf-8") as handle:
            collected.append(clean_text(handle.read()))
    return "\n".join(collected)
24
-
25
# --- Load and chunk the lore corpus ---
full_lore = load_all_lore_texts()
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_text(full_lore)

# --- Vector index (multilingual embedding model; handles Russian) ---
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
db = FAISS.from_texts(chunks, embeddings)
retriever = db.as_retriever()

# --- Russian-capable LLM hosted on the Hugging Face Hub ---
llm = HuggingFaceHub(
    repo_id="cointegrated/rugpt3large_based_on_gpt2",
    model_kwargs={"temperature": 0.6, "max_new_tokens": 300},
)

qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)


def ask_bot(question):
    """Clean the user's question and answer it from the indexed lore."""
    return qa_chain.run(clean_text(question))


# --- Gradio UI ---
iface = gr.Interface(
    fn=ask_bot,
    inputs=gr.Textbox(lines=2, placeholder="Спроси что-нибудь по лору..."),
    outputs="text",
    title="ЛорБот",
    description="Задавайте вопросы о вселенной. Поддерживается русский язык.",
)

iface.launch()
 
 
 
 
 
1
  import os
2
+ from langchain_community.document_loaders import TextLoader
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain.chains import RetrievalQA
7
+ from langchain_community.llms import HuggingFaceHub
8
+ import gradio as gr
9
+ import re
10
+
11
def load_documents(folder_path):
    """Load every ``.txt`` file under *folder_path* as LangChain documents.

    Strips ``[=/...]`` markup artifacts from each document's text before
    returning the list.
    """
    cleaned = []
    for entry in os.listdir(folder_path):
        if not entry.endswith(".txt"):
            continue
        loaded = TextLoader(os.path.join(folder_path, entry), encoding="utf-8").load()
        for doc in loaded:
            # Remove junk tags such as "[=/...]" left over in the source text.
            doc.page_content = re.sub(r'\[=/.*?\]', '', doc.page_content)
            cleaned.append(doc)
    return cleaned
23
+
24
+ # 2. Разбивка на чанки
25
+ def split_documents(documents):
26
+ splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100)
27
+ return splitter.split_documents(documents)
28
+
29
+ # 3. Создание эмбеддингов
30
+ def create_embeddings():
31
+ return HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
32
+
33
+ # 4. Загрузка модели
34
+ def load_llm():
35
+ return HuggingFaceHub(
36
+ repo_id="IlyaGusev/saiga_mistral_7b_gguf", # можно заменить на что-то другое, если будет падать
37
+ model_kwargs={"temperature": 0.6, "max_new_tokens": 300}
38
+ )
39
+
40
+ # 5. Построение цепочки
41
+ def build_qa_chain():
42
+ raw_docs = load_documents("lore") # Папка lore/ рядом с app.py
43
+ docs = split_documents(raw_docs)
44
+ embeddings = create_embeddings()
45
+ db = FAISS.from_documents(docs, embeddings)
46
+ retriever = db.as_retriever()
47
+ llm = load_llm()
48
+ return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
49
+
50
+ # 6. Интерфейс
51
+ qa_chain = build_qa_chain()
52
+
53
+ def answer_question(question):
54
+ result = qa_chain.run(question)
55
+ return result
56
+
57
+ iface = gr.Interface(fn=answer_question, inputs="text", outputs="text", title="Чат по Лору (RU)")
58
+ iface.launch()