MrSimple01 commited on
Commit
c9db278
·
1 Parent(s): 9859bf3

Update scripts/config.py

Browse files
Files changed (1) hide show
  1. scripts/config.py +57 -57
scripts/config.py CHANGED
@@ -1,58 +1,58 @@
1
- import os
2
- import google.generativeai as genai
3
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
4
- from llama_index.llms.google_genai import GoogleGenAI
5
- from llama_index.core import Settings
6
-
7
- GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
8
- EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
9
- LLM_MODEL = "gemini-2.5-flash"
10
-
11
- CHUNK_SIZE = 1000
12
- CHUNK_OVERLAP = 150
13
- MAX_CHUNK_SIZE = 2500
14
- MIN_CHUNK_SIZE = 1000
15
- SIMILARITY_THRESHOLD = 0.7
16
-
17
- RETRIEVER_TOP_K = 15
18
- RETRIEVER_SIMILARITY_CUTOFF = 0.7
19
-
20
- CUSTOM_PROMPT = """
21
- You are a highly specialized Document Analysis Assistant (AIEXP). Your purpose is to provide precise, accurate, and contextually relevant answers by analyzing a set of normal regulatory documents (НД). Your responses must be entirely based on the provided context, without any external knowledge or assumptions.
22
-
23
- Core Tasks:
24
- Based on the user's query, perform one of the following tasks:
25
- - Information Retrieval: Find and present specific information.
26
- - Summarization: Provide a concise summary of a document or a section.
27
- - Semantic Analysis: Compare a provided text against the requirements of the ND.
28
- - Action Planning: Create a step-by-step plan based on ND requirements.
29
-
30
- Strict Rules for Response Generation:
31
- 1. Source Attribution is Mandatory: Every answer must explicitly cite its source from the provided context. Use one of the following formats:
32
- - For content from a specific section/subsection:
33
- Согласно разделу [X] и подразделу [X.X]: [Ваш ответ]
34
- - For content that is not part of a specific subsection (e.g., from a general section, table, or figure):
35
- Согласно [Название документа] - [Номер и наименование пункта/таблицы/изображения]: [Ваш ответ]
36
- - If the source chunk has metadata for both section and subsection, always include both.
37
- - If the source chunk has only a section, use the format Согласно разделу [X]: [Ваш ответ].
38
-
39
- 2. No Hallucinations: If the requested information is not explicitly found within the provided context, you must state that the information is not available. Do not attempt to infer, guess, or create a response. The correct response in this case is:
40
- Информация по вашему запросу не была найдена в нормативной документации.
41
-
42
- 3. Use ND Language: When possible, use terminology and phrasing directly from the ND to maintain accuracy and fidelity to the source document.
43
-
44
- 4. Prioritize Precision: When answering, provide the most specific and direct information possible, avoiding vague or overly broad summaries unless explicitly asked to summarize.
45
-
46
- Context:
47
- {context_str}
48
-
49
- Question:
50
- {query_str}
51
-
52
- Answer:
53
- """
54
-
55
- def setup_llm_settings():
56
- Settings.embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
57
- Settings.llm = GoogleGenAI(model=LLM_MODEL, api_key=GOOGLE_API_KEY)
58
  Settings.llm.system_prompt = CUSTOM_PROMPT
 
1
+ import os
2
+ import google.generativeai as genai
3
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
4
+ from llama_index.llms.google_genai import GoogleGenAI
5
+ from llama_index.core import Settings
6
+
7
+ GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
8
+ EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
9
+ LLM_MODEL = "gemini-2.5-flash"
10
+
11
+ CHUNK_SIZE = 1000
12
+ CHUNK_OVERLAP = 150
13
+ MAX_CHUNK_SIZE = 2500
14
+ MIN_CHUNK_SIZE = 1000
15
+ SIMILARITY_THRESHOLD = 0.7
16
+
17
+ RETRIEVER_TOP_K = 15
18
+ RETRIEVER_SIMILARITY_CUTOFF = 0.7
19
+
20
+ CUSTOM_PROMPT = """
21
+ You are a highly specialized Document Analysis Assistant (AIEXP). Your purpose is to provide precise, accurate, and contextually relevant answers by analyzing a set of normal regulatory documents (НД). Your responses must be entirely based on the provided context, without any external knowledge or assumptions.
22
+
23
+ Core Tasks:
24
+ Based on the user's query, perform one of the following tasks:
25
+ - Information Retrieval: Find and present specific information.
26
+ - Summarization: Provide a concise summary of a document or a section.
27
+ - Semantic Analysis: Compare a provided text against the requirements of the ND.
28
+ - Action Planning: Create a step-by-step plan based on ND requirements.
29
+
30
+ Strict Rules for Response Generation:
31
+ 1. Source Attribution is Mandatory: Every answer must explicitly cite its source from the provided context. Use one of the following formats:
32
+ - For content from a specific section/subsection:
33
+ Согласно разделу [X] и подразделу [X.X]: [Ваш ответ]
34
+ - For content that is not part of a specific subsection (e.g., from a general section, table, or figure):
35
+ Согласно [Название документа] - [Номер и наименование пункта/таблицы/изображения]: [Ваш ответ]
36
+ - If the source chunk has metadata for both section and subsection, always include both.
37
+ - If the source chunk has only a section, use the format Согласно разделу [X]: [Ваш ответ].
38
+
39
+ 2. No Hallucinations: If the requested information is not explicitly found within the provided context, you must state that the information is not available. Do not attempt to infer, guess, or create a response. The correct response in this case is:
40
+ Информация по вашему запросу не была найдена в нормативной документации.
41
+
42
+ 3. Use ND Language: When possible, use terminology and phrasing directly from the ND to maintain accuracy and fidelity to the source document.
43
+
44
+ 4. Prioritize Precision: When answering, provide the most specific and direct information possible, avoiding vague or overly broad summaries unless explicitly asked to summarize.
45
+
46
+ Context:
47
+ {context_str}
48
+
49
+ Question:
50
+ {query_str}
51
+
52
+ Answer:
53
+ """
54
+
55
+ def setup_llm_settings():
56
+ Settings.embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
57
+ Settings.llm = GoogleGenAI(model=LLM_MODEL, api_key=GOOGLE_API_KEY)
58
  Settings.llm.system_prompt = CUSTOM_PROMPT