# Flask service: extracts course/transcript information from PDFs and images
# using Groq-hosted language/vision models. (Deployed as a Hugging Face Space.)
from flask import Flask, request, jsonify
import os
import requests
from PIL import Image
import tempfile
from PyPDF2 import PdfReader
from threading import Thread
import io
import fitz  # PyMuPDF — used to render PDF pages to images
from groq import Groq
from queue import Queue
import base64

# Initialize Flask app
app = Flask(__name__)

# Get API tokens from environment variables.
# NOTE(review): HF_TOKEN is read but never used in this file — confirm it is
# needed before removing.
HF_TOKEN = os.environ.get('HF_TOKEN')
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')

# Initialize Groq client (if GROQ_API_KEY is unset this is None and requests
# to the Groq API will fail at call time).
client = Groq(api_key=GROQ_API_KEY)

# Configuration for low memory mode (maintaining original functionality).
# NOTE(review): the flag is printed but not otherwise consulted here.
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Low memory mode: {LOW_MEMORY}")
class TextStreamer:
    """Thread-safe text relay between a producer thread and a consumer.

    A producer thread calls ``put(chunk)`` for each streamed text chunk and
    ``put(None)`` to signal end-of-stream; the consumer simply iterates the
    instance and receives chunks in arrival order.

    Fix over the original: ``__iter__`` busy-waited by polling
    ``queue.empty()`` and spinning on ``continue``, pegging a CPU core while
    waiting for the model. ``Queue.get()`` already blocks, so it is used
    directly. The unused ``self.buffer`` attribute was removed.
    """

    def __init__(self):
        self.queue = Queue()

    def put(self, text):
        """Enqueue a text chunk; pass None to terminate the stream."""
        self.queue.put(text)

    def __iter__(self):
        """Yield chunks until the None sentinel is received."""
        while True:
            text = self.queue.get()  # blocks — no busy-waiting
            if text is None:  # End signal
                break
            yield text
def extract_image_from_pdf(pdf_url, dpi=75):
    """
    Extract the first page of a PDF as an in-memory image.

    Args:
        pdf_url (str): URL of the PDF to download.
        dpi (int): Render resolution; fitz renders at 72 dpi by default,
            so the matrix is scaled by dpi/72.

    Returns:
        PIL.Image.Image: First page rendered as an RGB image, or None on
        any failure (download, parse, or render error).
    """
    try:
        print(f"Attempting to download PDF from: {pdf_url}")
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        print(f"PDF download status code: {response.status_code}")

        print("Opening PDF document...")
        pdf_document = fitz.open(stream=response.content, filetype="pdf")
        try:
            print("Getting first page...")
            first_page = pdf_document[0]

            print("Rendering page to pixmap...")
            pix = first_page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))

            print("Converting to PIL Image...")
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        finally:
            # Close even when page access/rendering raises — the original
            # leaked the document on any error after fitz.open().
            pdf_document.close()
        print("Successfully extracted image from PDF")
        return img
    except Exception as e:
        print(f"Error extracting first page: {e}")
        return None
def predict_image(image_url, text, file_pref):
    """
    Send an image (or the first page of a PDF) plus a text prompt to the
    Groq vision model and return the streamed completion as one string.

    Args:
        image_url (str): URL of an image, or of a PDF when file_pref != 'img'.
        text (str): Prompt to send alongside the image.
        file_pref (str): 'img' to fetch the URL as an image directly;
            any other value treats it as a PDF whose first page is rendered.

    Returns:
        str: Concatenated streamed model output (empty if the stream errored).

    Raises:
        ValueError: If the image cannot be fetched/extracted, or any other
            error occurs during prediction.
    """
    try:
        if file_pref == 'img':
            response = requests.get(image_url, timeout=30)
            response.raise_for_status()
            image = Image.open(io.BytesIO(response.content)).convert("RGB")
        else:
            print("Extracting image from PDF...")
            image = extract_image_from_pdf(image_url)
            if image is None:
                # The original fell through and crashed with an opaque
                # AttributeError on image.save(); fail with a clear message.
                raise ValueError("Could not extract an image from the PDF")

        # Re-encode as a base64 data URL so the image can be inlined in the
        # chat request instead of requiring a publicly reachable URL.
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        image_url = f"data:image/png;base64,{img_str}"
        print(f"Image URL being sent to Groq: {image_url[:100]}...")

        streamer = TextStreamer()

        def generate_response():
            # Producer thread: push streamed model chunks into the queue.
            try:
                completion = client.chat.completions.create(
                    model="llama-3.2-11b-vision-preview",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "image_url",
                                    "image_url": {"url": image_url}
                                },
                                {
                                    "type": "text",
                                    "text": text
                                }
                            ]
                        }
                    ],
                    temperature=0.7,
                    max_tokens=4096,
                    top_p=1,
                    stream=True
                )
                for chunk in completion:
                    if chunk.choices[0].delta.content:
                        streamer.put(chunk.choices[0].delta.content)
            except Exception as e:
                print(f"Error in generate_response: {e}")
            finally:
                # Always signal end-of-stream so the consumer never hangs.
                streamer.put(None)

        thread = Thread(target=generate_response)
        thread.start()
        # ''.join avoids the original's quadratic += concatenation.
        result = "".join(streamer)
        thread.join()
        return result
    except Exception as e:
        raise ValueError(f"Error during prediction: {str(e)}")
def extract_text_from_pdf(pdf_url):
    """
    Download a PDF and return the concatenated text of all of its pages.

    Args:
        pdf_url (str): URL of the PDF.

    Returns:
        str: Extracted text; image-only pages contribute an empty string.

    Raises:
        ValueError: On download or parse failure.
    """
    try:
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        # PdfReader accepts a file-like object, so parse entirely in memory.
        # The original wrote a NamedTemporaryFile(delete=False) and removed
        # it afterwards — leaking the file whenever parsing raised first.
        reader = PdfReader(io.BytesIO(response.content))
        # extract_text() may return None for pages with no text layer.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        raise ValueError(f"Error extracting text from PDF: {str(e)}")
def predict_text(text):
    """
    Send a plain-text prompt to the Groq chat model and return the streamed
    completion as a single string.

    Args:
        text (str): Prompt for the model.

    Returns:
        str: Concatenated streamed output (empty if the stream errored).
    """
    streamer = TextStreamer()

    def generate_response():
        # Producer thread: push streamed model chunks into the queue.
        try:
            completion = client.chat.completions.create(
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                messages=[
                    {
                        "role": "user",
                        "content": text
                    }
                ],
                temperature=0.7,
                max_tokens=4096,
                top_p=1,
                stream=True
            )
            for chunk in completion:
                if chunk.choices[0].delta.content:
                    streamer.put(chunk.choices[0].delta.content)
        except Exception as e:
            print(f"Error in generate_response: {e}")
        finally:
            # Always signal end-of-stream so the consumer never hangs.
            streamer.put(None)

    thread = Thread(target=generate_response)
    thread.start()
    # ''.join avoids the original's quadratic += concatenation over chunks.
    result = "".join(streamer)
    thread.join()
    return result
# Prompt constants sent verbatim to the language models.

# Syllabus PDF text -> fixed field-by-field course summary.
PROMPT = (
    "Extract the following information as per this format:\n"
    "'Course Code:'\n"
    "'Course Name:'\n"
    "'Course Description:'\n"
    "'Course Credits:'\n"
    "'Course Learning Outcomes:'\n"
    "'Delivery Method:'\n"
    "'Prerequisite(s):'\n"
    "'Co-requisite(s):'\n"
    "'Materials:'\n"
    "'Topical Outline:'\n"
    "Do not add anything else except the required information from this text."
)

# Skills categorization per the Lightcast Open Skills Taxonomy; appended to
# the extracted course info in extract_info().
PROMPT_SKILLS = (
    "Provide skills based on the Lightcast Open Skills Taxonomy in categories as:\n"
    "'Primary Skills' (the degree program or certification),\n"
    "'Secondary Skills', and\n"
    "'Tertiary Skills'."
)

# Transcript image -> the "student" JSON section only.
PROMPT_IMAGE_STUDENT = (
    "You are a highly intelligent assistant designed to analyze images and extract structured information from them. "
    "Your task is to analyze the given image of a student's academic record and generate a response in the exact JSON format provided below. "
    "If any specific information is missing or unavailable in the image, replace the corresponding field with null. "
    "Ensure the format is consistent, strictly adhering to the structure shown below.\n\n"
    "Required JSON Format:\n\n"
    "{\n"
    '    "student": {\n'
    '        "name": "string",\n'
    '        "id": "string",\n'
    '        "dob": "string",\n'
    '        "original_start_date": "string",\n'
    '        "cumulative_gpa": "string",\n'
    '        "program": "string",\n'
    '        "status": "string"\n'
    '    }\n'
    "}\n\n"
    "Instructions:\n\n"
    "1. Extract the student's general information as displayed in the image.\n"
    "2. Use null for any missing or unavailable information.\n"
    "3. Format the extracted data exactly as shown above. Do not deviate from this structure.\n"
    "4. Ensure accurate field names and proper nesting.\n"
    "5. Return only the 'student' section as JSON.\n"
)

# Transcript image -> the "courses" JSON section only.
PROMPT_IMAGE_COURSES = (
    "You are a highly intelligent assistant designed to analyze images and extract structured information from them. "
    "Your task is to analyze the given image of a student's academic record and generate a response in the exact JSON format provided below. "
    "If any specific information is missing or unavailable in the image, replace the corresponding field with null. "
    "Ensure the format is consistent, strictly adhering to the structure shown below.\n\n"
    "Required JSON Format:\n\n"
    "{\n"
    '    "courses": [\n'
    '        {\n'
    '            "transfer_institution": "string",\n'
    '            "course_code": "string",\n'
    '            "course_name": "string",\n'
    '            "credits_attempted": number,\n'
    '            "credits_earned": number,\n'
    '            "grade": "string",\n'
    '            "quality_points": number,\n'
    '            "semester_code": "string",\n'
    '            "semester_dates": "string"\n'
    '        }\n'
    "        // Additional courses can be added here\n"
    "    ]\n"
    "}\n\n"
    "Instructions:\n\n"
    "1. Extract the course details as displayed in the image.\n"
    "2. Use null for any missing or unavailable information.\n"
    "3. Format the extracted data exactly as shown above. Do not deviate from this structure.\n"
    "4. Ensure accurate field names and proper nesting.\n"
    "5. Return only the 'courses' section as JSON.\n"
)
@app.route("/")
def home():
    """Landing endpoint pointing callers at /extract.

    NOTE(review): the route decorator was absent in the pasted source even
    though app.run() is called below, so this view was never registered —
    restored here; confirm against the deployed version.
    """
    return jsonify({"message": "Welcome to the PDF Extraction API. Use the /extract endpoint to extract information."})
@app.route("/favicon.ico")
def favicon():
    """Return an empty 204 so browser favicon requests don't produce 404s.

    NOTE(review): route decorator restored (missing in the pasted source).
    """
    return "", 204
@app.route("/extract", methods=["POST"])
def extract_info():
    """
    Extract structured info from a syllabus PDF and/or a transcript image.

    Expected JSON body keys (all optional thanks to .get()):
        url (str|None): PDF URL for course-text extraction.
        skills (bool): Whether to also derive a skills breakdown.
        img_url (str|None): Image/PDF URL for the transcript analysis.
        file_pref (str): 'img' or anything else ('pdf') — how img_url is read.

    Returns:
        JSON {"extracted_info": str} on success,
        {"error": str} with status 400 (no body) or 500 (processing failure).

    NOTE(review): route decorator restored (missing in the pasted source).
    """
    data = request.json
    if not data:
        return jsonify({"error": "Please provide a PDF URL in the request body."}), 400
    try:
        # .get() instead of data["..."]: the original raised KeyError (-> 500
        # with a confusing message) whenever an optional key was omitted.
        pdf_url = data.get("url")
        if pdf_url is not None:
            pdf_text = extract_text_from_pdf(pdf_url)
            prompt = f"{PROMPT}\n\n{pdf_text}"
            response = predict_text(prompt)
        else:
            response = ''

        # Skills need the extracted course info as context.
        if data.get("skills") and response:
            prompt_skills = f"{PROMPT_SKILLS} using this information only -- {response}"
            response_skills = predict_text(prompt_skills)
        else:
            response_skills = ''

        img_url = data.get("img_url")
        if img_url is not None:
            prompt_student = f"{PROMPT_IMAGE_STUDENT}\n"
            prompt_courses = f"{PROMPT_IMAGE_COURSES}\n"
            file_pref = data.get("file_pref")
            response_student = predict_image(img_url, prompt_student, file_pref)
            response_courses = predict_image(img_url, prompt_courses, file_pref)
            response_image = response_student + response_courses
        else:
            response_image = ''

        return jsonify({"extracted_info": response + "\n" + response_skills + "\n" + response_image})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 (the port Spaces-style hosting
    # expects — presumably; confirm against deployment config).
    app.run(host="0.0.0.0", port=7860)