"""Generate technical interview questions from a PDF using GPT-4 via LangChain."""
import os
import json
from dotenv import load_dotenv
import fitz # PyMuPDF
from langchain_openai import ChatOpenAI # Correct import from langchain-openai
from langchain.schema import HumanMessage, SystemMessage # For creating structured chat messages
QUESTIONS_PATH = "questions.json"  # Output file where generated questions are saved as JSON
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file
load_dotenv()
def split_text_into_chunks(text: str, chunk_size: int) -> list:
"""
Splits the text into chunks of a specified maximum size.
"""
# Trim the text to remove leading/trailing whitespace and reduce multiple spaces to a single space
cleaned_text = " ".join(text.split())
words = cleaned_text.split(" ")
chunks = []
current_chunk = []
current_length = 0
for word in words:
if current_length + len(word) + 1 > chunk_size:
chunks.append(" ".join(current_chunk))
current_chunk = [word]
current_length = len(word)
else:
current_chunk.append(word)
current_length += len(word) + 1
if current_chunk:
chunks.append(" ".join(current_chunk))
return chunks
def distribute_questions_across_chunks(n_chunks: int, n_questions: int) -> list:
    """
    Distribute *n_questions* as evenly as possible across *n_chunks* chunks.

    Earlier chunks receive the remainder first, so counts are non-increasing
    (e.g. 5 questions over 3 chunks -> [2, 2, 1]).

    Bug fix: the previous implementation made only a single top-up pass, so for
    n_questions > 2 * n_chunks it silently dropped questions (e.g. (2, 10) gave
    [2, 2] instead of [5, 5]). The counts now always sum to *n_questions*.

    Args:
        n_chunks: Number of chunks to spread the questions over.
        n_questions: Total number of questions to distribute.

    Returns:
        A list of length *n_chunks* whose entries sum to *n_questions*
        ([] when n_chunks <= 0, matching the old behavior for 0 chunks).
    """
    if n_chunks <= 0:
        return []
    base, extra = divmod(n_questions, n_chunks)
    # The first `extra` chunks each absorb one leftover question.
    return [base + 1 if i < extra else base for i in range(n_chunks)]
def extract_text_from_pdf(pdf_path):
    """
    Extract and concatenate the text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The full document text as a single string (may be empty for image-only PDFs).

    Raises:
        RuntimeError: If the file cannot be opened or read; the original
            exception is chained as the cause.
    """
    text = ""
    try:
        print(f"[DEBUG] Opening PDF: {pdf_path}")
        with fitz.open(pdf_path) as pdf:
            print(f"[DEBUG] Extracting text from PDF: {pdf_path}")
            for page in pdf:
                text += page.get_text()
    except Exception as e:
        print(f"Error reading PDF: {e}")
        # Chain the underlying error so the root cause survives the re-raise.
        raise RuntimeError("Unable to extract text from PDF.") from e
    return text
def generate_questions_from_text(text, n_questions=5):
    """
    Generate *n_questions* technical interview questions from *text* via GPT-4.

    Args:
        text: Source material the questions should be based on.
        n_questions: How many questions to request from the model.

    Returns:
        A list of question strings; on API failure, a single-element list
        containing an error message (best-effort, never raises past the call).

    Raises:
        RuntimeError: If OPENAI_API_KEY is not configured.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OpenAI API key not found. Please add it to your .env file as OPENAI_API_KEY."
        )

    llm = ChatOpenAI(
        openai_api_key=api_key, model="gpt-4", temperature=0.7, max_tokens=750
    )

    # System message pins the persona/format; human message carries the content.
    prompt = [
        SystemMessage(
            content="You are an expert interviewer who generates concise technical interview questions. Do not enumerate the questions. Answer only with questions."
        ),
        HumanMessage(
            content=f"Based on the following content, generate {n_questions} technical interview questions:\n{text}"
        ),
    ]

    try:
        print(f"[DEBUG] Sending request to OpenAI with {n_questions} questions.")
        reply = llm.invoke(prompt)
        # Questions come back separated by blank lines; drop empty fragments.
        result = [
            block.strip()
            for block in reply.content.strip().split("\n\n")
            if block.strip()
        ]
    except Exception as e:
        print(f"[ERROR] Failed to generate questions: {e}")
        result = ["An error occurred while generating questions."]
    return result
def save_questions(questions):
    """Persist *questions* to QUESTIONS_PATH as pretty-printed JSON, overwriting any existing file."""
    with open(QUESTIONS_PATH, "w") as outfile:
        json.dump(questions, outfile, indent=4)
def generate_and_save_questions_from_pdf(pdf_path, total_questions=5):
    """
    End-to-end pipeline: extract the PDF's text, split it into ~2000-character
    chunks, generate a share of *total_questions* per chunk, save everything to
    QUESTIONS_PATH, and return the combined question list.

    Args:
        pdf_path: Path to the source PDF.
        total_questions: Total number of questions to generate across all chunks.

    Returns:
        The combined list of generated question strings.

    Raises:
        RuntimeError: If the PDF yields no text.
    """
    print(f"[INFO] Generating questions from PDF: {pdf_path}")
    full_text = extract_text_from_pdf(pdf_path)
    if not full_text.strip():
        raise RuntimeError("The PDF content is empty or could not be read.")

    chunks = split_text_into_chunks(full_text, 2000)
    n_chunks = len(chunks)
    per_chunk_quota = distribute_questions_across_chunks(n_chunks, total_questions)

    all_questions = []
    for i, (chunk, quota) in enumerate(zip(chunks, per_chunk_quota)):
        print(f"[DEBUG] Processing chunk {i + 1} of {n_chunks}")
        # Chunks with a zero quota are skipped entirely — no API call made.
        if quota > 0:
            all_questions.extend(generate_questions_from_text(chunk, n_questions=quota))

    print(f"[INFO] Total questions generated: {len(all_questions)}")
    save_questions(all_questions)
    print(f"[INFO] Questions saved to {QUESTIONS_PATH}")
    return all_questions
if __name__ == "__main__":
    # Demo run against a bundled exam-guide PDF; failures are reported, not raised.
    source_pdf = "professional_machine_learning_engineer_exam_guide_english.pdf"
    try:
        questions = generate_and_save_questions_from_pdf(source_pdf, total_questions=5)
        print(f"Generated Questions:\n{json.dumps(questions, indent=2)}")
    except Exception as e:
        print(f"Failed to generate questions: {e}")