ramysaidagieb committed on
Commit
4ca054b
·
verified ·
1 Parent(s): fb77c86

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -122
app.py DELETED
@@ -1,122 +0,0 @@
1
- import gradio as gr
2
- import os
3
- import tempfile
4
- import faiss
5
- import torch
6
- from langchain.embeddings import HuggingFaceEmbeddings
7
- from langchain.vectorstores import FAISS
8
- from langchain.text_splitter import RecursiveCharacterTextSplitter
9
- from langchain.prompts import PromptTemplate
10
- from langchain.chains import RetrievalQA
11
- from langchain.llms import HuggingFacePipeline
12
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
13
- from pdfminer.high_level import extract_text as extract_pdf_text
14
- import docx
15
- import nltk
16
-
17
# Fetch the Punkt tokenizer data at import time, then bring in the sentence
# tokenizer used by arabic_split_text.
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

# Mutable application state shared by the upload and question handlers.
uploaded_texts = []   # raw text of every uploaded book, in upload order
vector_store = None   # set by train_from_texts
qa_chain = None       # set by train_from_texts; None until a book is uploaded

# Embedding model used to vectorise text chunks.
embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Seq2seq model wrapped as a LangChain LLM for answer generation.
model_name = "csebuetnlp/mT5_small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)
llm = HuggingFacePipeline(pipeline=pipe)

# Arabic instruction wrapped around every user question; the only placeholder
# is {question}.
ARABIC_PROMPT_TEMPLATE = """
أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
لا تستخدم أي معلومات خارجية.
السؤال: {question}
الإجابة:
"""
40
-
41
def format_arabic_prompt(question):
    """Return ARABIC_PROMPT_TEMPLATE with *question* filled into its placeholder."""
    substitutions = {"question": question}
    return ARABIC_PROMPT_TEMPLATE.format(**substitutions)
43
-
44
def extract_text_from_file(file_path):
    """Extract plain text from a PDF or Word document.

    The format is chosen from the file extension, compared
    case-insensitively so that names such as ``BOOK.PDF`` are accepted.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text. For Word documents the paragraphs are joined
        with newlines.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
    """
    lowered = str(file_path).lower()

    if lowered.endswith(".pdf"):
        return extract_pdf_text(file_path)

    if lowered.endswith((".docx", ".doc")):
        # NOTE(review): ".doc" is routed through python-docx like ".docx";
        # legacy binary .doc files are presumably not readable this way --
        # confirm and convert or reject them if so.
        document = docx.Document(file_path)
        return "\n".join(para.text for para in document.paragraphs)

    raise ValueError("Unsupported file format")
52
-
53
def arabic_split_text(text, max_chars=500):
    """Split *text* into chunks made of whole sentences.

    Sentences are packed greedily: a sentence joins the current chunk while
    the chunk, a separating space and the sentence together stay within
    ``max_chars``. A single sentence longer than ``max_chars`` is kept
    intact as its own chunk. Empty chunks are never produced.

    Args:
        text: Text to split.
        max_chars: Soft upper bound on chunk length, in characters.

    Returns:
        A list of non-empty chunk strings in document order.
    """
    import re  # local import: the file's import block does not provide ``re``

    try:
        sentences = sent_tokenize(text, language='arabic')
    except LookupError:
        # NLTK raises LookupError when no Punkt model is installed for the
        # requested language; fall back to splitting after Latin/Arabic
        # sentence-ending punctuation so uploads still work.
        sentences = re.split(r"(?<=[.!?؟])\s+", text)

    chunks = []
    current = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if not current:
            # Start a new chunk directly; never flush an empty one.
            current = sentence
        elif len(current) + 1 + len(sentence) <= max_chars:
            current = f"{current} {sentence}"
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks
66
-
67
def train_from_texts(texts):
    """Rebuild the vector store and the retrieval QA chain from raw texts.

    Every text is split with ``arabic_split_text``; the chunks are embedded
    and indexed, and the module-level ``vector_store`` and ``qa_chain`` are
    replaced with freshly built objects.

    Args:
        texts: Iterable of document strings.

    Raises:
        ValueError: If the texts produce no non-empty chunk to index.
    """
    global vector_store, qa_chain

    all_chunks = []
    for text in texts:
        all_chunks.extend(chunk for chunk in arabic_split_text(text) if chunk)

    if not all_chunks:
        raise ValueError("No text chunks available to index")

    # FAISS.from_texts embeds the chunks, adds the vectors to a new index and
    # builds the matching docstore, so the retriever can return documents.
    vector_store = FAISS.from_texts(all_chunks, embeddings)

    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 10},
    )
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
88
-
89
def upload_book(file, progress=gr.Progress()):
    """Handle a book upload: extract its text and rebuild the QA system.

    The extracted text is appended to ``uploaded_texts`` and the index is
    rebuilt over every book uploaded so far.

    Args:
        file: The uploaded file as delivered by the Gradio File component.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A status message for the UI.
    """
    if file is None:
        return "الرجاء رفع كتاب أولاً."

    # Read the upload in place instead of copying it to an extension-less
    # temp file: extract_text_from_file picks the parser from the extension,
    # so the copy could never be parsed (and was never deleted).
    # NOTE(review): assumes Gradio passes either a path string or a
    # temp-file wrapper exposing ``.name`` -- confirm for the pinned version.
    source_path = file if isinstance(file, str) else file.name

    progress(0.2, desc="تحميل الملف...")
    extracted_text = extract_text_from_file(source_path)
    uploaded_texts.append(extracted_text)
    progress(0.5, desc="معالجة النص...")

    train_from_texts(uploaded_texts)
    progress(1.0, desc="اكتمل التدريب!")
    return "النظام جاهز للإجابة على أسئلتك"
102
-
103
def answer_question(user_question):
    """Answer *user_question* with the QA chain, or ask for a book if none is loaded.

    Returns the chain's output for the question wrapped in the Arabic prompt;
    when ``qa_chain`` is still ``None`` a fixed reminder message is returned.
    """
    if qa_chain is not None:
        return qa_chain.run(format_arabic_prompt(user_question))
    return "الرجاء رفع كتاب أولاً."
109
-
110
# Two-tab Gradio interface: one tab uploads books, the other asks questions.
with gr.Blocks() as demo:
    # Upload tab: uploading a file triggers upload_book and shows its status.
    with gr.Tab("تحميل الكتب"):
        upload_button = gr.File(
            label="ارفع كتابك (.pdf .docx .doc)",
            file_types=[".pdf", ".docx", ".doc"],
        )
        upload_output = gr.Textbox(label="حالة النظام")
        upload_button.upload(
            upload_book,
            inputs=upload_button,
            outputs=upload_output,
        )

    # Question tab: the button sends the question text to answer_question.
    with gr.Tab("اسأل الكتاب"):
        question = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer = gr.Textbox(label="الإجابة")
        ask_button = gr.Button("إرسال السؤال")
        ask_button.click(
            answer_question,
            inputs=question,
            outputs=answer,
        )

demo.launch(share=True)