MrSimple01 committed on
Commit 34abd7e · verified · 1 Parent(s): 3f77b16

Update src/quiz_processing.py

Files changed (1)
  1. src/quiz_processing.py +336 -336
src/quiz_processing.py CHANGED
@@ -1,337 +1,337 @@
The only substantive change is the prompts import on line 13:

-from prompts import SYSTEM_PROMPT, ANALYSIS_PROMPT_TEMPLATE_CLAUDE, ANALYSIS_PROMPT_TEMPLATE_GEMINI
+from src.prompts import SYSTEM_PROMPT, ANALYSIS_PROMPT_TEMPLATE_CLAUDE, ANALYSIS_PROMPT_TEMPLATE_GEMINI

src/quiz_processing.py (after this commit):
import os
import re
import json
import time
import gradio as gr
import tempfile
from typing import Dict, Any, List, Optional
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from pydantic import BaseModel, Field
from anthropic import Anthropic
from huggingface_hub import login
from src.prompts import SYSTEM_PROMPT, ANALYSIS_PROMPT_TEMPLATE_CLAUDE, ANALYSIS_PROMPT_TEMPLATE_GEMINI


CLAUDE_MODEL = "claude-3-5-sonnet-20241022"
OPENAI_MODEL = "gpt-4o"
GEMINI_MODEL = "gemini-2.0-flash"

DEFAULT_TEMPERATURE = 0.7

TOKENIZER_MODEL = "answerdotai/ModernBERT-base"
SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2"

# Log in to the Hugging Face Hub only when a token is actually set,
# so the module still imports in environments without HF_TOKEN.
hf_token = os.environ.get('HF_TOKEN', None)
if hf_token:
    login(token=hf_token)

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)
sentence_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)

class CourseInfo(BaseModel):
    course_name: str = Field(description="Name of the course")
    section_name: str = Field(description="Name of the course section")
    lesson_name: str = Field(description="Name of the lesson")

class QuizOption(BaseModel):
    text: str = Field(description="The text of the answer option")
    correct: bool = Field(description="Whether this option is correct")

class QuizQuestion(BaseModel):
    question: str = Field(description="The text of the quiz question")
    options: List[QuizOption] = Field(description="List of answer options")

class Segment(BaseModel):
    segment_number: int = Field(description="The segment number")
    topic_name: str = Field(description="Unique and specific topic name that clearly differentiates it from other segments")
    key_concepts: List[str] = Field(description="3-5 key concepts discussed in the segment")
    summary: str = Field(description="Brief summary of the segment (3-5 sentences)")
    quiz_questions: List[QuizQuestion] = Field(description="5 quiz questions based on the segment content")

class TextSegmentAnalysis(BaseModel):
    course_info: CourseInfo = Field(description="Information about the course")
    segments: List[Segment] = Field(description="List of text segments with analysis")

def clean_text(text):
    # Remove transcript speaker tags like "[speaker_1]" and collapse
    # runs of whitespace into single spaces.
    text = re.sub(r'\[speaker_\d+\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
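
# For example (illustrative input, not part of the original module):
#   clean_text("[speaker_1] Hello   world [speaker_2] again")
#   -> "Hello world again"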

def split_text_by_tokens(text, max_tokens=12000):
    text = clean_text(text)
    tokens = tokenizer.encode(text)

    if len(tokens) <= max_tokens:
        return [text]

    # Otherwise split roughly in half on sentence boundaries.
    split_point = len(tokens) // 2

    sentences = re.split(r'(?<=[.!?])\s+', text)

    first_half = []
    second_half = []

    current_tokens = 0
    filling_first = True
    for sentence in sentences:
        sentence_tokens = len(tokenizer.encode(sentence))

        # Once a sentence no longer fits the first half, send the rest to the
        # second half so the original sentence order is preserved.
        if filling_first and current_tokens + sentence_tokens <= split_point:
            first_half.append(sentence)
            current_tokens += sentence_tokens
        else:
            filling_first = False
            second_half.append(sentence)

    return [" ".join(first_half), " ".join(second_half)]


def generate_with_claude(text, api_key, course_name="", section_name="", lesson_name=""):
    from src.prompts import SYSTEM_PROMPT, ANALYSIS_PROMPT_TEMPLATE_CLAUDE

    client = Anthropic(api_key=api_key)

    segment_analysis_schema = TextSegmentAnalysis.model_json_schema()

    # Expose the Pydantic schema as a single forced tool so Claude returns
    # structured output instead of free-form text.
    tools = [
        {
            "name": "build_segment_analysis",
            "description": "Build the text segment analysis with quiz questions",
            "input_schema": segment_analysis_schema
        }
    ]

    prompt = ANALYSIS_PROMPT_TEMPLATE_CLAUDE.format(
        course_name=course_name,
        section_name=section_name,
        lesson_name=lesson_name,
        text=text
    )

    try:
        response = client.messages.create(
            model=CLAUDE_MODEL,
            max_tokens=8192,
            temperature=DEFAULT_TEMPERATURE,
            system=SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            tools=tools,
            tool_choice={"type": "tool", "name": "build_segment_analysis"}
        )

        # Extract the tool call content
        if response.content and len(response.content) > 0 and hasattr(response.content[0], 'input'):
            function_call = response.content[0].input
            return function_call
        else:
            raise Exception("No valid tool call found in the response")
    except Exception as e:
        raise Exception(f"Error calling Anthropic API: {str(e)}")


def get_active_api_key(gemini_key, claude_key, language):
    # Uzbek content is routed to Claude when a Claude key is available;
    # everything else falls back to Gemini.
    if language == "Uzbek" and claude_key:
        return claude_key, "claude"
    else:
        return gemini_key, "gemini"

def segment_and_analyze_text(text: str, gemini_api_key: str, claude_api_key: str, language: str,
                             course_name="", section_name="", lesson_name="") -> Dict[str, Any]:
    active_key, api_type = get_active_api_key(gemini_api_key, claude_api_key, language)

    if api_type == "claude":
        return generate_with_claude(text, active_key, course_name, section_name, lesson_name)

    from langchain_google_genai import ChatGoogleGenerativeAI
    from src.prompts import ANALYSIS_PROMPT_TEMPLATE_GEMINI, SYSTEM_PROMPT
    os.environ["GOOGLE_API_KEY"] = active_key
    llm = ChatGoogleGenerativeAI(
        model=GEMINI_MODEL,
        temperature=DEFAULT_TEMPERATURE,
        max_retries=3
    )

    base_prompt = ANALYSIS_PROMPT_TEMPLATE_GEMINI.format(
        course_name=course_name,
        section_name=section_name,
        lesson_name=lesson_name,
        text=text
    )

    language_instruction = f"\nIMPORTANT: Generate ALL content (including topic names, key concepts, summaries, and quiz questions) in {language} language."
    prompt = base_prompt + language_instruction

    try:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ]

        response = llm.invoke(messages)

        try:
            content = response.content
            # Prefer a fenced ```json block; fall back to the first
            # brace-to-brace span, then to the raw content.
            json_match = re.search(r'```json\s*([\s\S]*?)\s*```', content)

            if json_match:
                json_str = json_match.group(1)
            else:
                json_match = re.search(r'(\{[\s\S]*\})', content)
                if json_match:
                    json_str = json_match.group(1)
                else:
                    json_str = content

            # Parse the JSON
            function_call = json.loads(json_str)
            return function_call
        except json.JSONDecodeError:
            raise Exception("Could not parse JSON from LLM response")
    except Exception as e:
        raise Exception(f"Error calling API: {str(e)}")

def format_quiz_for_display(results, language="English"):
    output = []

    if language == "Uzbek":
        course_header = "KURS"
        section_header = "BO'LIM"
        lesson_header = "DARS"
        segment_header = "QISM"
        key_concepts_header = "ASOSIY TUSHUNCHALAR"
        summary_header = "QISQACHA MAZMUN"
        quiz_questions_header = "TEST SAVOLLARI"
    elif language == "Russian":
        course_header = "КУРС"
        section_header = "РАЗДЕЛ"
        lesson_header = "УРОК"
        segment_header = "СЕГМЕНТ"
        key_concepts_header = "КЛЮЧЕВЫЕ ПОНЯТИЯ"
        summary_header = "КРАТКОЕ СОДЕРЖАНИЕ"
        quiz_questions_header = "ТЕСТОВЫЕ ВОПРОСЫ"
    else:
        course_header = "COURSE"
        section_header = "SECTION"
        lesson_header = "LESSON"
        segment_header = "SEGMENT"
        key_concepts_header = "KEY CONCEPTS"
        summary_header = "SUMMARY"
        quiz_questions_header = "QUIZ QUESTIONS"

    if "course_info" in results:
        course_info = results["course_info"]
        output.append(f"{'='*40}")
        output.append(f"{course_header}: {course_info.get('course_name', 'N/A')}")
        output.append(f"{section_header}: {course_info.get('section_name', 'N/A')}")
        output.append(f"{lesson_header}: {course_info.get('lesson_name', 'N/A')}")
        output.append(f"{'='*40}\n")

    segments = results.get("segments", [])
    for i, segment in enumerate(segments):
        topic = segment["topic_name"]
        segment_num = i + 1
        output.append(f"\n\n{'='*40}")
        output.append(f"{segment_header} {segment_num}: {topic}")
        output.append(f"{'='*40}\n")
        output.append(f"{key_concepts_header}:")
        for concept in segment["key_concepts"]:
            output.append(f"• {concept}")
        output.append(f"\n{summary_header}:")
        output.append(segment["summary"])
        output.append(f"\n{quiz_questions_header}:")
        # Use a separate index so the question counter does not shadow the
        # segment loop variable.
        for q_num, q in enumerate(segment["quiz_questions"]):
            output.append(f"\n{q_num+1}. {q['question']}")
            for j, option in enumerate(q['options']):
                letter = chr(97 + j).upper()
                correct_marker = " ✓" if option["correct"] else ""
                output.append(f"   {letter}. {option['text']}{correct_marker}")
    return "\n".join(output)

def analyze_document(text, gemini_api_key, claude_api_key, course_name, section_name, lesson_name, language):
    try:
        start_time = time.time()
        text_parts = split_text_by_tokens(text)

        input_tokens = 0
        output_tokens = 0

        all_results = {
            "course_info": {
                "course_name": course_name,
                "section_name": section_name,
                "lesson_name": lesson_name
            },
            "segments": []
        }
        segment_counter = 1

        # Process each part of the text
        for part in text_parts:
            if language == "Uzbek" and claude_api_key:
                prompt_template = ANALYSIS_PROMPT_TEMPLATE_CLAUDE
            else:
                prompt_template = ANALYSIS_PROMPT_TEMPLATE_GEMINI

            # Format the prompt with actual values
            actual_prompt = prompt_template.format(
                course_name=course_name,
                section_name=section_name,
                lesson_name=lesson_name,
                text=part
            )

            prompt_tokens = len(tokenizer.encode(actual_prompt))
            input_tokens += prompt_tokens

            # Analyze only this part (not the whole text), so long documents
            # are processed chunk by chunk.
            analysis = segment_and_analyze_text(
                part,
                gemini_api_key,
                claude_api_key,
                language,
                course_name=course_name,
                section_name=section_name,
                lesson_name=lesson_name
            )

            if "segments" in analysis:
                for segment in analysis["segments"]:
                    segment["segment_number"] = segment_counter
                    all_results["segments"].append(segment)
                    segment_counter += 1

        end_time = time.time()
        total_time = end_time - start_time
        print(f"Total quiz processing time: {total_time}s")

        # Format once, then derive the output token count from that text.
        formatted_text = format_quiz_for_display(all_results, language)
        output_tokens = len(tokenizer.encode(formatted_text))

        token_info = f"Input tokens: {input_tokens}\nOutput tokens: {output_tokens}\nTotal tokens: {input_tokens + output_tokens}\n"
        formatted_text = f"Total quiz processing time: {total_time:.2f}s\n{token_info}\n" + formatted_text
        # NamedTemporaryFile(delete=False) avoids the race condition in the
        # deprecated tempfile.mktemp while keeping the files on disk for
        # download; ensure_ascii=False keeps Uzbek/Russian text readable.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False,
                                         encoding='utf-8') as json_file:
            json.dump(all_results, json_file, indent=2, ensure_ascii=False)
            json_path = json_file.name

        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False,
                                         encoding='utf-8') as txt_file:
            txt_file.write(formatted_text)
            txt_path = txt_file.name

        return formatted_text, json_path, txt_path
    except Exception as e:
        error_message = f"Error processing document: {str(e)}"
        return error_message, None, None
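

# A minimal usage sketch (not in the original module): the API key, course
# names, and transcript are placeholders, and running this would call external
# APIs, so it is illustrative only.
if __name__ == "__main__":
    sample_transcript = "[speaker_1] Photosynthesis converts light energy into chemical energy. ..."
    text_out, json_file_path, txt_file_path = analyze_document(
        sample_transcript,
        gemini_api_key="YOUR_GEMINI_KEY",   # placeholder
        claude_api_key="",                  # empty -> Gemini path is used
        course_name="Biology 101",
        section_name="Plant Biology",
        lesson_name="Photosynthesis",
        language="English",
    )
    print(text_out)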