MrSimple01 committed on
Commit
06f06f1
·
verified ·
1 Parent(s): 3add489

Update quiz_processing.py

Browse files
Files changed (1) hide show
  1. quiz_processing.py +229 -201
quiz_processing.py CHANGED
@@ -1,201 +1,229 @@
1
- import re
2
- import json
3
- import os
4
- from langchain_google_genai import ChatGoogleGenerativeAI
5
- from transformers import AutoTokenizer
6
- from huggingface_hub import login
7
-
8
- hf_token = os.environ.get('HF_TOKEN', None)
9
- login(token=hf_token)
10
-
11
- tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base", use_auth_token=hf_token)
12
-
13
- def clean_text(text):
14
- text = re.sub(r'\[speaker_\d+\]', '', text)
15
- text = re.sub(r'\s+', ' ', text).strip()
16
- return text
17
-
18
- def split_text_by_tokens(text, max_tokens=8000):
19
- text = clean_text(text)
20
- tokens = tokenizer.encode(text)
21
-
22
- if len(tokens) <= max_tokens:
23
- return [text]
24
-
25
- split_point = len(tokens) // 2
26
-
27
- sentences = re.split(r'(?<=[.!?])\s+', text)
28
-
29
- first_half = []
30
- second_half = []
31
-
32
- current_tokens = 0
33
- for sentence in sentences:
34
- sentence_tokens = len(tokenizer.encode(sentence))
35
-
36
- if current_tokens + sentence_tokens <= split_point:
37
- first_half.append(sentence)
38
- current_tokens += sentence_tokens
39
- else:
40
- second_half.append(sentence)
41
-
42
- return [" ".join(first_half), " ".join(second_half)]
43
-
44
- def analyze_segment_with_gemini(segment_text, google_api_key):
45
- os.environ["GOOGLE_API_KEY"] = google_api_key
46
-
47
- llm = ChatGoogleGenerativeAI(
48
- model="gemini-1.5-flash",
49
- temperature=0.7,
50
- max_tokens=None,
51
- timeout=None,
52
- max_retries=3
53
- )
54
-
55
- prompt = f"""
56
- Analyze the following text and identify distinct segments within it and do text segmentation:
57
- 1. Segments should be STRICTLY max=15
58
- 2. For each segment/topic you identify:
59
- - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
60
- - List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
61
- - Write a brief summary of that segment (3-5 sentences)
62
- - Create 5 high-quality, meaningful quiz questions based DIRECTLY on the content in that segment only
63
- - Questions and answers should be only from the content of the segment
64
-
65
- For each quiz question:
66
- - Create one correct answer that comes DIRECTLY from the text
67
- - Create two plausible but incorrect answers
68
- - IMPORTANT: Ensure all answer options have similar length 3 words)
69
- - Ensure the correct answer is clearly indicated with a ✓ symbol
70
- - Questions should **require actual understanding**, NOT just basic fact recall.
71
- - Questions Are **non-trivial**, encourage deeper thinking, and **avoid surface-level facts**.
72
- - Are **directly based on the segment's content** (not inferred from the summary).
73
- - Do **not include questions about document structure** (e.g., title, number of paragraphs).
74
- - Do **not generate overly generic or obvious questions** (e.g., "What is mentioned in the text?").
75
- - Focus on **core ideas, logical reasoning, and conceptual understanding**.
76
-
77
-
78
- Text:
79
- {segment_text}
80
-
81
- Format your response as JSON with the following structure:
82
- {{
83
- "segments": [
84
- {{
85
- "topic_name": "Unique and Specific Topic Name",
86
- "key_concepts": ["concept1", "concept2", "concept3"],
87
- "summary": "Brief summary of this segment.",
88
- "quiz_questions": [
89
- {{
90
- "question": "Question text?",
91
- "options": [
92
- {{
93
- "text": "Option A",
94
- "correct": false
95
- }},
96
- {{
97
- "text": "Option B",
98
- "correct": true
99
- }},
100
- {{
101
- "text": "Option C",
102
- "correct": false
103
- }}
104
- ]
105
- }}
106
- ]
107
- }}
108
- ]
109
- }}
110
-
111
- IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
112
- - **Do NOT repeat** key concepts across multiple segments unless absolutely necessary.
113
- - **Ensure the quiz questions challenge the reader** and **are not easily guessable**.
114
-
115
- """
116
-
117
- try:
118
- response = llm.invoke(prompt)
119
- response_text = response.content
120
-
121
- json_match = re.search(r'\{[\s\S]*\}', response_text)
122
- if json_match:
123
- return json.loads(json_match.group(0))
124
- else:
125
- return json.loads(response_text)
126
- except Exception as e:
127
- print(f"Error in Gemini analysis: {e}")
128
- return {
129
- "segments": [
130
- {
131
- "topic_name": "Analysis Error",
132
- "key_concepts": ["Could not process text"],
133
- "summary": "An error occurred during text analysis.",
134
- "quiz_questions": []
135
- }
136
- ]
137
- }
138
-
139
- def format_quiz_for_display(results):
140
- output = []
141
-
142
- for segment in results:
143
- topic = segment["topic_name"]
144
- segment_num = segment.get("segment_number", 1)
145
-
146
- output.append(f"\n\n{'='*40}")
147
- output.append(f"SEGMENT {segment_num}: {topic}")
148
- output.append(f"{'='*40}\n")
149
-
150
- output.append("KEY CONCEPTS:")
151
- for concept in segment["key_concepts"]:
152
- output.append(f" {concept}")
153
-
154
- output.append("\nSUMMARY:")
155
- output.append(segment["summary"])
156
-
157
- output.append("\nQUIZ QUESTIONS:")
158
- for i, q in enumerate(segment["quiz_questions"]):
159
- output.append(f"\n{i+1}. {q['question']}")
160
-
161
- for j, option in enumerate(q['options']):
162
- letter = chr(97 + j).upper()
163
- correct_marker = " ✓" if option["correct"] else ""
164
- output.append(f" {letter}. {option['text']}{correct_marker}")
165
-
166
- return "\n".join(output)
167
-
168
- def save_quiz_json(all_segments):
169
- json_data = {"segments": all_segments}
170
- json_filename = "generated_quiz.json"
171
- with open(json_filename, "w", encoding="utf-8") as f:
172
- json.dump(json_data, f, indent=2)
173
- return json_filename
174
-
175
-
176
- def process_text(transcript_text, google_api_key):
177
- if not transcript_text:
178
- return "No text to analyze", None, None
179
- text_parts = split_text_by_tokens(transcript_text)
180
-
181
- all_segments = []
182
- segment_counter = 1
183
-
184
- for part in text_parts:
185
- analysis = analyze_segment_with_gemini(part, google_api_key)
186
-
187
- if "segments" in analysis:
188
- for segment in analysis["segments"]:
189
- segment["segment_number"] = segment_counter
190
- all_segments.append(segment)
191
- segment_counter += 1
192
-
193
- formatted_quiz = format_quiz_for_display(all_segments)
194
-
195
- quiz_filename = "generated_quiz.txt"
196
- with open(quiz_filename, "w", encoding="utf-8") as f:
197
- f.write(formatted_quiz)
198
-
199
- json_filename = save_quiz_json(all_segments)
200
-
201
- return formatted_quiz, quiz_filename, json_filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import os
4
+ from langchain_google_genai import ChatGoogleGenerativeAI
5
+ from transformers import AutoTokenizer
6
+ from huggingface_hub import login
7
+
8
# Hugging Face Hub authentication: the token is optional locally but expected
# to be present (HF_TOKEN) when running on a Space.
hf_token = os.environ.get('HF_TOKEN', None)
if hf_token:
    # Only log in when a token is actually configured; login(token=None)
    # would try to prompt interactively and fail in headless environments.
    login(token=hf_token)

# Tokenizer is used purely for token *counting* when splitting long transcripts.
# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base", token=hf_token)
12
+
13
def clean_text(text):
    """Strip ``[speaker_N]`` tags and normalize all whitespace to single spaces."""
    without_speakers = re.sub(r'\[speaker_\d+\]', '', text)
    collapsed = re.sub(r'\s+', ' ', without_speakers)
    return collapsed.strip()
17
+
18
def split_text_by_tokens(text, max_tokens=12000):
    """Split *text* into one or two chunks that respect a token budget.

    The text is cleaned first; if its token count is within *max_tokens* it is
    returned as a single-element list. Otherwise it is split at a sentence
    boundary near the halfway token mark and returned as two chunks.

    Bug fix vs. previous version: once a sentence no longer fits in the first
    half we now *stay* in the second half. Previously a later, shorter sentence
    could still pass the size check and be appended to the first half,
    reordering the transcript across the two chunks.
    """
    text = clean_text(text)
    tokens = tokenizer.encode(text)

    if len(tokens) <= max_tokens:
        return [text]

    # Aim for two roughly equal halves by token count.
    split_point = len(tokens) // 2

    # Sentence boundaries: split after ., ! or ? followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    first_half = []
    second_half = []

    current_tokens = 0
    in_second_half = False  # latch: preserve sentence order once we overflow
    for sentence in sentences:
        sentence_tokens = len(tokenizer.encode(sentence))

        if not in_second_half and current_tokens + sentence_tokens <= split_point:
            first_half.append(sentence)
            current_tokens += sentence_tokens
        else:
            in_second_half = True
            second_half.append(sentence)

    return [" ".join(first_half), " ".join(second_half)]
43
+
44
def analyze_segment_with_gemini(segment_text, google_api_key, course_name="", section_name="", lesson_name=""):
    """Segment *segment_text* with Gemini and generate quiz questions for each segment.

    Sends a single prompt that asks the model to (1) split the text into at
    most 15 topical segments and (2) produce key concepts, a summary, and
    five 3-option quiz questions per segment, all returned as JSON. The
    course/section/lesson names are interpolated into the prompt and echoed
    back in the response's ``course_info`` object.

    Returns the parsed JSON dict on success, or a placeholder dict with a
    single "Analysis Error" segment if the API call or JSON parsing fails.
    """
    # NOTE(review): mutates process-wide environment state so langchain picks
    # up the key; this affects any other GOOGLE_API_KEY consumer in-process.
    os.environ["GOOGLE_API_KEY"] = google_api_key

    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0.7,
        max_tokens=None,  # no cap on the response length
        timeout=None,
        max_retries=3
    )

    # Double braces ({{ }}) below are f-string escapes for literal JSON braces.
    prompt = f"""
    Analyze the following text and identify distinct segments within it and do text segmentation:
    1. Segments should be STRICTLY max=15
    2. For each segment/topic you identify:
    - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
    - List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
    - Write a brief summary of that segment (3-5 sentences)
    - Create 5 high-quality, meaningful quiz questions based DIRECTLY on the content in that segment only
    - Questions and answers should be only from the content of the segment
    For each quiz question:
    - Create one correct answer that comes DIRECTLY from the text
    - Create two plausible but incorrect answers
    - IMPORTANT: Ensure all answer options have similar length (± 3 words)
    - Ensure the correct answer is clearly indicated with a symbol
    - Questions should **require actual understanding**, NOT just basic fact recall.
    - Questions Are **non-trivial**, encourage deeper thinking, and **avoid surface-level facts**.
    - Are **directly based on the segment's content** (not inferred from the summary).
    - Do **not include questions about document structure** (e.g., title, number of paragraphs).
    - Do **not generate overly generic or obvious questions** (e.g., "What is mentioned in the text?").
    - Focus on **core ideas, logical reasoning, and conceptual understanding**
    ADDITIONAL REQUIREMENT:
    - First, detect the language of the original text.
    - Generate all questions and answers in the same language as the original text.
    - If the text is in Russian, generate questions and answers in Russian.
    - If the text is in another language, generate questions in that original language.

    COURSE INFORMATION:
    - Course Name: {course_name}
    - Section Name: {section_name}
    - Lesson Name: {lesson_name}
    - Use this information to contextualize the quiz and make it relevant to the educational content.
    - Include this information in the JSON response structure.

    Text:
    {segment_text}

    Format your response as JSON with the following structure:
    {{
      "course_info": {{
        "course_name": "{course_name}",
        "section_name": "{section_name}",
        "lesson_name": "{lesson_name}"
      }},
      "segments": [
        {{
          "topic_name": "Unique and Specific Topic Name",
          "key_concepts": ["concept1", "concept2", "concept3"],
          "summary": "Brief summary of this segment.",
          "quiz_questions": [
            {{
              "question": "Question text?",
              "options": [
                {{
                  "text": "Option A",
                  "correct": false
                }},
                {{
                  "text": "Option B",
                  "correct": true
                }},
                {{
                  "text": "Option C",
                  "correct": false
                }}
              ]
            }}
          ]
        }}
      ]
    }}
    IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
    - **Do NOT repeat** key concepts across multiple segments unless absolutely necessary.
    - **Ensure the quiz questions challenge the reader** and **are not easily guessable**.
    - **Tailor the content to fit within the context of the specified course, section, and lesson.**
    """

    try:
        response = llm.invoke(prompt)
        response_text = response.content

        # Models often wrap JSON in prose or code fences; extract the
        # outermost {...} span before parsing, falling back to the raw text.
        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if json_match:
            return json.loads(json_match.group(0))
        else:
            return json.loads(response_text)
    except Exception as e:
        # Broad catch by design: any API or parse failure degrades to a
        # placeholder payload instead of crashing the pipeline.
        print(f"Error in Gemini analysis: {e}")
        return {
            "course_info": {
                "course_name": course_name,
                "section_name": section_name,
                "lesson_name": lesson_name
            },
            "segments": [
                {
                    "topic_name": "Analysis Error",
                    "key_concepts": ["Could not process text"],
                    "summary": "An error occurred during text analysis.",
                    "quiz_questions": []
                }
            ]
        }
157
+
158
def format_quiz_for_display(results):
    """Render an analysis results dict as human-readable quiz text.

    Expects ``results`` shaped like the Gemini response: an optional
    ``course_info`` dict plus a ``segments`` list, where each segment has
    ``topic_name``, ``key_concepts``, ``summary`` and ``quiz_questions``.
    Correct options are marked with a trailing " ✓".

    Fixes vs. previous version: the inner question loop no longer reuses the
    outer loop variable ``i`` (shadowing), and the segment heading honors the
    explicit ``segment_number`` field set by ``process_text`` instead of
    always recomputing it from list position.
    """
    output = []

    if "course_info" in results:
        course_info = results["course_info"]
        output.append(f"{'='*40}")
        output.append(f"COURSE: {course_info.get('course_name', 'N/A')}")
        output.append(f"SECTION: {course_info.get('section_name', 'N/A')}")
        output.append(f"LESSON: {course_info.get('lesson_name', 'N/A')}")
        output.append(f"{'='*40}\n")

    segments = results.get("segments", [])
    for seg_idx, segment in enumerate(segments):
        topic = segment["topic_name"]
        # Prefer the number assigned by process_text; fall back to position.
        segment_num = segment.get("segment_number", seg_idx + 1)
        output.append(f"\n\n{'='*40}")
        output.append(f"SEGMENT {segment_num}: {topic}")
        output.append(f"{'='*40}\n")
        output.append("KEY CONCEPTS:")
        for concept in segment["key_concepts"]:
            output.append(f"  {concept}")
        output.append("\nSUMMARY:")
        output.append(segment["summary"])
        output.append("\nQUIZ QUESTIONS:")
        for q_idx, q in enumerate(segment["quiz_questions"]):
            output.append(f"\n{q_idx+1}. {q['question']}")
            for opt_idx, option in enumerate(q['options']):
                letter = chr(97 + opt_idx).upper()  # a, b, c ... -> A, B, C ...
                correct_marker = " ✓" if option["correct"] else ""
                output.append(f"  {letter}. {option['text']}{correct_marker}")
    return "\n".join(output)
189
+
190
+
191
def save_quiz_json(results):
    """Serialize *results* as pretty-printed JSON to generated_quiz.json.

    Returns the filename that was written.
    """
    json_filename = "generated_quiz.json"
    payload = json.dumps(results, indent=2)
    with open(json_filename, "w", encoding="utf-8") as out_file:
        out_file.write(payload)
    return json_filename
196
+
197
def process_text(transcript_text, google_api_key, course_name="", section_name="", lesson_name=""):
    """End-to-end pipeline: split, analyze with Gemini, and persist the quiz.

    Returns a tuple ``(formatted_quiz, txt_filename, json_filename)``, or
    ``("No text to analyze", None, None)`` when the transcript is empty.
    """
    if not transcript_text:
        return "No text to analyze", None, None

    text_parts = split_text_by_tokens(transcript_text)

    all_results = {
        "course_info": {
            "course_name": course_name,
            "section_name": section_name,
            "lesson_name": lesson_name,
        },
        "segments": [],
    }

    # Analyze each chunk and number segments sequentially across chunks.
    for part in text_parts:
        analysis = analyze_segment_with_gemini(
            part, google_api_key, course_name, section_name, lesson_name
        )
        for segment in analysis.get("segments", []):
            segment["segment_number"] = len(all_results["segments"]) + 1
            all_results["segments"].append(segment)

    formatted_quiz = format_quiz_for_display(all_results)

    quiz_filename = "generated_quiz.txt"
    with open(quiz_filename, "w", encoding="utf-8") as txt_file:
        txt_file.write(formatted_quiz)

    json_filename = save_quiz_json(all_results)

    return formatted_quiz, quiz_filename, json_filename