# NOTE(review): removed non-Python residue (file-size header, git commit
# hashes, and a line-number column) left over from a file-viewer paste.
import os
import gradio as gr
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import CTransformers

# ุงุญุตู„ ุนู„ู‰ ุงู„ุชูˆูƒู† ู…ู† Secrets
HF_TOKEN = os.getenv("HF_TOKEN")

# ุชุญู…ูŠู„ ุงู„ู†ู…ูˆุฐุฌ ู…ุญู„ูŠู‹ุง
llm = CTransformers(
    model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    model_type="mistral",
    hf_token=HF_TOKEN,
    config={"max_new_tokens": 512, "temperature": 0.7}
)

# ุงู„ุชุญู…ูŠู„ ุงู„ุฏู„ุงู„ูŠ ู„ู„ู†ุตูˆุต
def load_documents(file_path):
    if file_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif file_path.endswith(".txt"):
        loader = TextLoader(file_path, encoding='utf-8')
    elif file_path.endswith(".docx"):
        loader = Docx2txtLoader(file_path)
    else:
        raise ValueError("Unsupported file type.")
    return loader.load()

# ุชุฌู‡ูŠุฒ ุงู„ู…ุณุชู†ุฏ
def process_file(file):
    docs = load_documents(file.name)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    vectordb = Chroma.from_documents(chunks, embedding)
    retriever = vectordb.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    return qa_chain

qa_chain = None       # lazily-built RetrievalQA chain for the current file
_indexed_file = None  # path of the file the current chain was built from

# Gradio callback: answer a question about the uploaded file.
def ask_question(file, question):
    """Answer *question* from *file*, returning RTL-formatted HTML.

    Rebuilds the QA chain whenever a different file is uploaded. (The
    original cached the first file's chain forever, so questions about
    any later upload were silently answered from the wrong document.)

    Args:
        file: Gradio file object (``.name`` holds the temp-file path).
        question: The user's question text.

    Returns:
        An HTML ``<div>`` string with right-to-left layout for Arabic.
    """
    global qa_chain, _indexed_file
    current = getattr(file, "name", None)
    # (Re)build the index on first use or when the uploaded file changes.
    if qa_chain is None or current != _indexed_file:
        qa_chain = process_file(file)
        _indexed_file = current
    answer = qa_chain.run(question)
    return f"<div dir='rtl' style='text-align: right;'>{answer}</div>"

# --- Gradio UI (right-to-left layout for Arabic) ---
with gr.Blocks(css="body {direction: rtl; text-align: right;}") as demo:
    gr.Markdown("## مساعد الوثائق الذكي - استعلام باللغة العربية من ملفاتك")

    uploaded_file = gr.File(
        label="📄 حمّل ملفًا (PDF / DOCX / TXT)",
        file_types=[".pdf", ".txt", ".docx"],
    )
    user_question = gr.Textbox(
        label="❓ أدخل سؤالك بالعربية",
        placeholder="ما هو موضوع هذا الملف؟",
    )
    answer_html = gr.HTML()

    ask_button = gr.Button("🔍 استعلم")
    # Route the question (and current file) through the QA callback.
    ask_button.click(
        fn=ask_question,
        inputs=[uploaded_file, user_question],
        outputs=answer_html,
    )

demo.launch()