import os
import json
import re

from dotenv import load_dotenv
import fitz  # PyMuPDF
from langchain_openai import ChatOpenAI  # chat-model wrapper from the langchain-openai package
from langchain_core.messages import HumanMessage, SystemMessage  # structured chat messages

QUESTIONS_PATH = "questions.json"

# Load environment variables
load_dotenv()

def split_text_into_chunks(text: str, chunk_size: int) -> list:
    """
    Splits the text into chunks of at most chunk_size characters,
    breaking only on word boundaries.
    """
    # Collapse all whitespace (newlines, tabs, repeated spaces) into single spaces
    cleaned_text = " ".join(text.split())
    words = cleaned_text.split(" ")

    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        # Only start a new chunk if the current one is non-empty; a single
        # word longer than chunk_size becomes its own oversized chunk.
        if current_chunk and current_length + len(word) + 1 > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += len(word) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
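
# Example (illustrative): chunks never split words, so sizes are approximate.
# >>> split_text_into_chunks("one two three four", 10)
# ['one two', 'three four']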


def distribute_questions_across_chunks(n_chunks: int, n_questions: int) -> list:
    """
    Distributes a specified number of questions across a specified number of
    chunks as evenly as possible; earlier chunks absorb any remainder.
    """
    if n_chunks <= 0:
        return []

    # Give every chunk the same base count, then hand out the remainder
    # one question at a time to the leading chunks.
    base, remainder = divmod(n_questions, n_chunks)
    return [base + 1 if i < remainder else base for i in range(n_chunks)]
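
# Example (illustrative): 5 questions over 3 chunks; the first two chunks
# absorb the remainder.
# >>> distribute_questions_across_chunks(3, 5)
# [2, 2, 1]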


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extracts the full plain text of a PDF using PyMuPDF."""
    text = ""
    try:
        print(f"[DEBUG] Opening PDF: {pdf_path}")
        with fitz.open(pdf_path) as pdf:
            print(f"[DEBUG] Extracting text from PDF: {pdf_path}")
            for page in pdf:
                text += page.get_text()
    except Exception as e:
        print(f"[ERROR] Failed to read PDF: {e}")
        raise RuntimeError("Unable to extract text from PDF.") from e
    return text
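
# Example (illustrative; "sample.pdf" is a hypothetical path):
# raw_text = extract_text_from_pdf("sample.pdf")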


def generate_questions_from_text(text: str, n_questions: int = 5) -> list:
    """Asks the chat model to generate n_questions interview questions about text."""
    openai_api_key = os.getenv("OPENAI_API_KEY")

    if not openai_api_key:
        raise RuntimeError(
            "OpenAI API key not found. Please add it to your .env file as OPENAI_API_KEY."
        )

    # gpt-4 at a moderate temperature gives varied but focused questions;
    # max_tokens caps the length of the model's reply.
    chat = ChatOpenAI(
        openai_api_key=openai_api_key, model="gpt-4", temperature=0.7, max_tokens=750
    )

    messages = [
        SystemMessage(
            content="You are an expert interviewer who generates concise technical interview questions. Do not number the questions. Answer only with the questions, one per line."
        ),
        HumanMessage(
            content=f"Based on the following content, generate {n_questions} technical interview questions:\n{text}"
        ),
    ]

    try:
        print(f"[DEBUG] Requesting {n_questions} questions from OpenAI.")
        response = chat.invoke(messages)
        # The model may separate questions with single or double newlines,
        # so split on any run of newlines and drop empty entries.
        questions = [q.strip() for q in re.split(r"\n+", response.content.strip()) if q.strip()]
    except Exception as e:
        print(f"[ERROR] Failed to generate questions: {e}")
        questions = ["An error occurred while generating questions."]

    return questions
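
# Example (illustrative; requires a valid OPENAI_API_KEY and makes a real API call):
# questions = generate_questions_from_text("Transformers use self-attention ...", n_questions=2)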


def save_questions(questions: list) -> None:
    """Persists the generated questions to QUESTIONS_PATH as pretty-printed JSON."""
    with open(QUESTIONS_PATH, "w", encoding="utf-8") as f:
        json.dump(questions, f, indent=4, ensure_ascii=False)


def generate_and_save_questions_from_pdf(pdf_path: str, total_questions: int = 5) -> list:
    """End-to-end pipeline: extract PDF text, chunk it, generate questions, save them."""
    print(f"[INFO] Generating questions from PDF: {pdf_path}")
    pdf_text = extract_text_from_pdf(pdf_path)

    if not pdf_text.strip():
        raise RuntimeError("The PDF content is empty or could not be read.")

    # ~2,000 characters per chunk keeps each prompt comfortably within the model's context window
    chunk_size = 2000
    chunks = split_text_into_chunks(pdf_text, chunk_size)
    n_chunks = len(chunks)

    questions_distribution = distribute_questions_across_chunks(n_chunks, total_questions)
    combined_questions = []

    for i, (chunk, n_questions) in enumerate(zip(chunks, questions_distribution)):
        print(f"[DEBUG] Processing chunk {i + 1} of {n_chunks}")
        if n_questions > 0:
            questions = generate_questions_from_text(chunk, n_questions=n_questions)
            combined_questions.extend(questions)

    print(f"[INFO] Total questions generated: {len(combined_questions)}")
    save_questions(combined_questions)
    print(f"[INFO] Questions saved to {QUESTIONS_PATH}")
    return combined_questions


if __name__ == "__main__":
    pdf_path = "professional_machine_learning_engineer_exam_guide_english.pdf"

    try:
        generated_questions = generate_and_save_questions_from_pdf(
            pdf_path, total_questions=5
        )
        print(f"Generated Questions:\n{json.dumps(generated_questions, indent=2)}")
    except Exception as e:
        print(f"Failed to generate questions: {e}")