MrSimple01 committed on
Commit
136731b
·
verified ·
1 Parent(s): 7cfdb4e

Delete quiz_processing.py

Browse files
Files changed (1) hide show
  1. quiz_processing.py +0 -231
quiz_processing.py DELETED
@@ -1,231 +0,0 @@
1
- import re
2
- import json
3
- import os
4
- from langchain_google_genai import ChatGoogleGenerativeAI
5
- from transformers import AutoTokenizer
6
- from huggingface_hub import login
7
-
8
- hf_token = os.environ.get('HF_TOKEN', None)
9
- login(token=hf_token)
10
-
11
- tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base", use_auth_token=hf_token)
12
-
13
- def clean_text(text):
14
- text = re.sub(r'\[speaker_\d+\]', '', text)
15
- text = re.sub(r'\s+', ' ', text).strip()
16
- return text
17
-
18
- def split_text_by_tokens(text, max_tokens=12000):
19
- text = clean_text(text)
20
- tokens = tokenizer.encode(text)
21
-
22
- if len(tokens) <= max_tokens:
23
- return [text]
24
-
25
- split_point = len(tokens) // 2
26
-
27
- sentences = re.split(r'(?<=[.!?])\s+', text)
28
-
29
- first_half = []
30
- second_half = []
31
-
32
- current_tokens = 0
33
- for sentence in sentences:
34
- sentence_tokens = len(tokenizer.encode(sentence))
35
-
36
- if current_tokens + sentence_tokens <= split_point:
37
- first_half.append(sentence)
38
- current_tokens += sentence_tokens
39
- else:
40
- second_half.append(sentence)
41
-
42
- return [" ".join(first_half), " ".join(second_half)]
43
-
44
- def analyze_segment_with_gemini(segment_text, google_api_key, course_name="", section_name="", lesson_name=""):
45
- os.environ["GOOGLE_API_KEY"] = google_api_key
46
-
47
- llm = ChatGoogleGenerativeAI(
48
- model="gemini-2.0-flash",
49
- temperature=0.7,
50
- max_tokens=None,
51
- timeout=None,
52
- max_retries=3
53
- )
54
-
55
- prompt = f"""
56
- Analyze the following text and identify distinct segments within it and do text segmentation:
57
- 1. Segments should be STRICTLY max=15
58
- 2. For each segment/topic you identify:
59
- - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
60
- - List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
61
- - Write a brief summary of that segment (3-5 sentences)
62
- - Create 5 high-quality, meaningful quiz questions based DIRECTLY on the content in that segment only
63
- - Questions and answers should be only from the content of the segment
64
- For each quiz question:
65
- - Create one correct answer that comes DIRECTLY from the text
66
- - Create two plausible but incorrect answers
67
- - IMPORTANT: Ensure all answer options have similar length (± 3 words)
68
- - Ensure the correct answer is clearly indicated with a ✓ symbol
69
- - Questions should **require actual understanding**, NOT just basic fact recall.
70
- - Questions Are **non-trivial**, encourage deeper thinking, and **avoid surface-level facts**.
71
- - Are **directly based on the segment's content** (not inferred from the summary).
72
- - Do **not include questions about document structure** (e.g., title, number of paragraphs).
73
- - Do **not generate overly generic or obvious questions** (e.g., "What is mentioned in the text?").
74
- - Focus on **core ideas, logical reasoning, and conceptual understanding**
75
-
76
- ADDITIONAL REQUIREMENT:
77
- - **First, detect the language of the original text.**
78
- - **Generate ALL output (topic names, key concepts, summaries, and quizzes) in the same language as the original text.**
79
- - If the text is in Russian, generate all responses in Russian.
80
- - If the text is in another language, generate responses in that original language.
81
-
82
-
83
- COURSE INFORMATION:
84
- - Course Name: {course_name}
85
- - Section Name: {section_name}
86
- - Lesson Name: {lesson_name}
87
- - Use this information to contextualize the quiz and make it relevant to the educational content.
88
- - Include this information in the JSON response structure.
89
-
90
- Text:
91
- {segment_text}
92
-
93
- Format your response as JSON with the following structure:
94
- {{
95
- "course_info": {{
96
- "course_name": "{course_name}",
97
- "section_name": "{section_name}",
98
- "lesson_name": "{lesson_name}"
99
- }},
100
- "segments": [
101
- {{
102
- "topic_name": "Unique and Specific Topic Name",
103
- "key_concepts": ["concept1", "concept2", "concept3"],
104
- "summary": "Brief summary of this segment.",
105
- "quiz_questions": [
106
- {{
107
- "question": "Question text?",
108
- "options": [
109
- {{
110
- "text": "Option A",
111
- "correct": false
112
- }},
113
- {{
114
- "text": "Option B",
115
- "correct": true
116
- }},
117
- {{
118
- "text": "Option C",
119
- "correct": false
120
- }}
121
- ]
122
- }}
123
- ]
124
- }}
125
- ]
126
- }}
127
- IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
128
- - **Do NOT repeat** key concepts across multiple segments unless absolutely necessary.
129
- - **Ensure the quiz questions challenge the reader** and **are not easily guessable**.
130
- - **Tailor the content to fit within the context of the specified course, section, and lesson.**
131
- """
132
-
133
- try:
134
- response = llm.invoke(prompt)
135
- response_text = response.content
136
-
137
- json_match = re.search(r'\{[\s\S]*\}', response_text)
138
- if json_match:
139
- return json.loads(json_match.group(0))
140
- else:
141
- return json.loads(response_text)
142
- except Exception as e:
143
- print(f"Error in Gemini analysis: {e}")
144
- return {
145
- "course_info": {
146
- "course_name": course_name,
147
- "section_name": section_name,
148
- "lesson_name": lesson_name
149
- },
150
- "segments": [
151
- {
152
- "topic_name": "Analysis Error",
153
- "key_concepts": ["Could not process text"],
154
- "summary": "An error occurred during text analysis.",
155
- "quiz_questions": []
156
- }
157
- ]
158
- }
159
-
160
- def format_quiz_for_display(results):
161
- output = []
162
-
163
- if "course_info" in results:
164
- course_info = results["course_info"]
165
- output.append(f"{'='*40}")
166
- output.append(f"COURSE: {course_info.get('course_name', 'N/A')}")
167
- output.append(f"SECTION: {course_info.get('section_name', 'N/A')}")
168
- output.append(f"LESSON: {course_info.get('lesson_name', 'N/A')}")
169
- output.append(f"{'='*40}\n")
170
-
171
- segments = results.get("segments", [])
172
- for i, segment in enumerate(segments):
173
- topic = segment["topic_name"]
174
- segment_num = i + 1
175
- output.append(f"\n\n{'='*40}")
176
- output.append(f"SEGMENT {segment_num}: {topic}")
177
- output.append(f"{'='*40}\n")
178
- output.append("KEY CONCEPTS:")
179
- for concept in segment["key_concepts"]:
180
- output.append(f"• {concept}")
181
- output.append("\nSUMMARY:")
182
- output.append(segment["summary"])
183
- output.append("\nQUIZ QUESTIONS:")
184
- for i, q in enumerate(segment["quiz_questions"]):
185
- output.append(f"\n{i+1}. {q['question']}")
186
- for j, option in enumerate(q['options']):
187
- letter = chr(97 + j).upper()
188
- correct_marker = " ✓" if option["correct"] else ""
189
- output.append(f" {letter}. {option['text']}{correct_marker}")
190
- return "\n".join(output)
191
-
192
-
193
- def save_quiz_json(results):
194
- json_filename = "generated_quiz.json"
195
- with open(json_filename, "w", encoding="utf-8") as f:
196
- json.dump(results, f, indent=2)
197
- return json_filename
198
-
199
- def process_text(transcript_text, google_api_key, course_name="", section_name="", lesson_name=""):
200
- if not transcript_text:
201
- return "No text to analyze", None, None
202
- text_parts = split_text_by_tokens(transcript_text)
203
-
204
- all_results = {
205
- "course_info": {
206
- "course_name": course_name,
207
- "section_name": section_name,
208
- "lesson_name": lesson_name
209
- },
210
- "segments": []
211
- }
212
- segment_counter = 1
213
-
214
- for part in text_parts:
215
- analysis = analyze_segment_with_gemini(part, google_api_key, course_name, section_name, lesson_name)
216
-
217
- if "segments" in analysis:
218
- for segment in analysis["segments"]:
219
- segment["segment_number"] = segment_counter
220
- all_results["segments"].append(segment)
221
- segment_counter += 1
222
-
223
- formatted_quiz = format_quiz_for_display(all_results)
224
-
225
- quiz_filename = "generated_quiz.txt"
226
- with open(quiz_filename, "w", encoding="utf-8") as f:
227
- f.write(formatted_quiz)
228
-
229
- json_filename = save_quiz_json(all_results)
230
-
231
- return formatted_quiz, quiz_filename, json_filename