# Vision_tester / app.py
# Source: Hugging Face Space by Daemontatox (commit 828a2ec, verified; ~20.9 kB)
import os
import io
import time
import base64
import logging
import fitz # PyMuPDF
from PIL import Image
import gradio as gr
from openai import OpenAI # Use the OpenAI client that supports multimodal messages
# Load the API key from the environment (configured as a deployment secret).
HF_API_KEY = os.getenv("OPENAI_TOKEN")
if not HF_API_KEY:
    # FIX: the original message said "HF_API_KEY environment variable not set",
    # but the variable actually read is OPENAI_TOKEN — name the real env var so
    # operators know which secret to configure.
    raise ValueError("OPENAI_TOKEN environment variable not set")

# OpenAI-compatible client pointed at the OpenRouter inference endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=HF_API_KEY
)

# Module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# -------------------------------
# Document State and File Processing
# -------------------------------
class DocumentState:
    """Mutable holder for the currently loaded document.

    Attributes:
        current_doc_images: list of PIL images (one per PDF page, or a
            single uploaded image).
        current_doc_text: extracted text, empty for pure images.
        doc_type: 'pdf', 'image', or None when nothing is loaded.
    """

    def __init__(self):
        self._reset()

    def clear(self):
        """Drop any loaded document so a new one can be uploaded."""
        self._reset()

    def _reset(self):
        # Shared by the constructor and clear(): one place defines "empty".
        self.current_doc_images = []
        self.current_doc_text = ""
        self.doc_type = None


# Process-wide singleton shared by the upload, generate, and clear handlers.
doc_state = DocumentState()
def process_pdf_file(file_path):
    """Convert each PDF page to a PIL image and extract its text using PyMuPDF.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        (images, text): a list of RGB PIL images (one per successfully
        rendered page, each capped at 1600 px on the longest side) and the
        concatenated page text with "Page N:" headers.

    Raises:
        ValueError: if no page could be rendered at all.
        Exception: re-raises any other failure after logging it.
    """
    try:
        doc = fitz.open(file_path)
        # FIX: close the document in a finally block — the original only
        # closed it on the fully successful path, leaking the handle if an
        # exception escaped mid-processing.
        try:
            images = []
            text = ""
            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    page_text = page.get_text("text")
                    if page_text.strip():
                        text += f"Page {page_num + 1}:\n{page_text}\n\n"
                    # Render at 3x zoom for legible OCR-quality images.
                    zoom = 3
                    mat = fitz.Matrix(zoom, zoom)
                    pix = page.get_pixmap(matrix=mat, alpha=False)
                    img_data = pix.tobytes("png")
                    img = Image.open(io.BytesIO(img_data)).convert("RGB")
                    # Downscale oversized renders to keep the payload small.
                    max_size = 1600
                    if max(img.size) > max_size:
                        ratio = max_size / max(img.size)
                        new_size = tuple(int(dim * ratio) for dim in img.size)
                        img = img.resize(new_size, Image.Resampling.LANCZOS)
                    images.append(img)
                except Exception as e:
                    # A single bad page should not abort the whole document.
                    logger.error(f"Error processing page {page_num}: {str(e)}")
                    continue
        finally:
            doc.close()
        if not images:
            raise ValueError("No valid images could be extracted from the PDF")
        return images, text
    except Exception as e:
        logger.error(f"Error processing PDF file: {str(e)}")
        raise
def process_uploaded_file(file):
    """Load an uploaded PDF or image into the shared document state.

    Args:
        file: the Gradio upload value — either a dict with a "name" key or
            a file-like object exposing ``.name``.

    Returns:
        A human-readable status string for the upload-status textbox.
    """
    try:
        doc_state.clear()
        if file is None:
            return "No file uploaded. Please upload a file."

        # Gradio versions differ: the upload may arrive as a dict payload
        # or as a tempfile wrapper with a .name attribute.
        file_path = file["name"] if isinstance(file, dict) else file.name
        extension = file_path.lower().split('.')[-1]
        image_extensions = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'}

        if extension == 'pdf':
            doc_state.doc_type = 'pdf'
            try:
                doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
            except Exception as e:
                return f"Error processing PDF: {str(e)}. Please try a different PDF file."
            return (
                f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. "
                "You can now ask questions about the content."
            )

        if extension in image_extensions:
            doc_state.doc_type = 'image'
            try:
                img = Image.open(file_path).convert("RGB")
                # Cap the longest side at 1600 px, matching the PDF path.
                max_size = 1600
                largest = max(img.size)
                if largest > max_size:
                    scale = max_size / largest
                    img = img.resize(
                        tuple(int(dim * scale) for dim in img.size),
                        Image.Resampling.LANCZOS,
                    )
                doc_state.current_doc_images = [img]
            except Exception as e:
                return f"Error processing image: {str(e)}. Please try a different image file."
            return "Image loaded successfully. You can now ask questions about the content."

        return f"Unsupported file type: {extension}. Please upload a PDF or image file (PNG, JPG, JPEG, GIF, BMP, WEBP)."
    except Exception as e:
        logger.error(f"Error in process_uploaded_file: {str(e)}")
        return "An error occurred while processing the file. Please try again."
# -------------------------------
# Bot Streaming Function Using the Multimodal API
# -------------------------------
def bot_streaming(prompt_option, max_new_tokens=8192):
    """Build a multimodal message payload and stream the model's reply.

    The payload contains a text segment (the selected system prompt plus any
    extracted document text) and, if a document is loaded, the first page or
    image encoded as a base64 PNG data URI.

    Args:
        prompt_option: Key into the predetermined prompt table (from the UI
            dropdown); an unknown key yields "Invalid prompt selected." as
            the text segment.
        max_new_tokens: Upper bound on generated tokens passed to the API.

    Yields:
        The accumulated response text after each streamed chunk, so the
        output textbox fills in progressively.
    """
    try:
        # Predetermined prompts (adjust as needed).
        prompts = {
            "Software Tester": (
                """
You are TestCraft AI, a specialized large language model designed to be the ultimate software testing expert. Your primary function is to generate comprehensive, effective, and insightful test cases based on provided input, primarily in the form of images (screenshots, UI mockups, diagrams) and PDF documents (requirements specifications, user stories, design documents). You are not a general-purpose chatbot; your focus is exclusively on software testing.
**Your Capabilities:**
* **Input Interpretation:** You can accurately interpret the content of images and PDFs. This includes:
* **OCR (Optical Character Recognition):** Extract text from images and PDFs.
* **Object Detection:** Identify UI elements (buttons, text fields, dropdowns, checkboxes, images, tables, etc.) in images.
* **Layout Analysis:** Understand the structure and relationships between elements in images and documents (e.g., hierarchical relationships, proximity, alignment).
* **Document Structure Understanding:** Identify sections, headings, paragraphs, lists, tables, and figures within PDFs.
* **Requirement Extraction:** Identify explicit and implicit requirements, user stories, and acceptance criteria from textual content.
* **Diagram Interpretation:** If the image or PDF contains diagrams (flowcharts, state diagrams, etc.), understand their logic and transitions.
* **Test Case Generation:** You can generate a wide variety of test cases, including but not limited to:
* **Functional Tests:** Verify that features work as expected based on the requirements and UI.
* **UI/UX Tests:** Assess the usability, accessibility, and visual correctness of the user interface.
* **Boundary Value Tests:** Test input fields with values at the minimum, maximum, and just inside/outside the valid range.
* **Equivalence Partitioning Tests:** Group similar inputs and test one representative value from each group.
* **Error Handling Tests:** Verify how the application handles invalid input, unexpected conditions, and errors.
* **Accessibility Tests:** Check compliance with accessibility guidelines (e.g., WCAG) regarding text alternatives, keyboard navigation, color contrast, etc.
* **Performance Tests (Basic):** Generate basic performance-related test ideas (e.g., "Verify response time for button click is less than 2 seconds"). *Note: You cannot execute performance tests, only suggest them.*
* **Security Tests (Basic):** Generate basic security-related test ideas (e.g., "Verify input fields are sanitized against XSS attacks"). *Note: You cannot execute security tests, only suggest them.*
* **Compatibility Tests (Basic):** Generate basic compatibility testing ideas, if information about target platforms is available (e.g. browsers, OS).
* **Test Case Format:** Output test cases in a clear, structured, and consistent format. Each test case MUST include:
* **Test Case ID:** A unique identifier (e.g., TC-001, TC-002).
* **Test Case Title:** A brief, descriptive name for the test case.
* **Test Steps:** A numbered sequence of actions to perform. Be precise and unambiguous. Use user-centric language (e.g., "Click the 'Submit' button," not "Interact with element ID XYZ").
* **Expected Result:** The anticipated outcome of each step and the overall test case. Be specific.
* **Test Data (if applicable):** Specific input values or data to be used.
* **Priority (Optional):** High, Medium, or Low, based on your assessment of the criticality of the feature being tested.
* **Type (Optional):** Functional, UI, Accessibility, Performance, etc.
* **Requirement/User Story Reference (if applicable):** Link the test case back to a specific requirement or user story extracted from the input.
* **Prioritization and Rationale:** You should be able to prioritize test cases based on risk, importance, and likelihood of finding defects. Explain *why* you assigned a particular priority. If you make any assumptions, state them clearly.
* **Contextual Understanding:** You strive to understand the *purpose* of the software being tested. If the input provides clues about the application's domain (e.g., e-commerce, banking, healthcare), tailor your test cases accordingly.
* **Continuous Learning (Hypothetical):** *While you cannot truly learn in the traditional sense, state that you are designed to improve your test case generation over time based on feedback and new information.* This sets the expectation of ongoing refinement.
**Instructions for Interaction:**
1. **Provide Input:** The user will provide one or more images (PNG, JPG, etc.) or PDF documents.
2. **Specify Test Scope (Optional):** The user may optionally specify the scope of testing (e.g., "Focus on the login functionality," "Generate UI tests only," "Test accessibility"). If no scope is provided, generate a comprehensive set of test cases.
3. **Generate Test Cases:** You will generate test cases based on the input and any specified scope.
4. **Provide Explanations:** Explain your reasoning behind the generated test cases, including any assumptions made, prioritization logic, and references to the input.
5. **Handle Ambiguity:** If the input is ambiguous or incomplete, you will:
* **Make Reasonable Assumptions:** State your assumptions clearly.
* **Ask Clarifying Questions:** Present the user with specific, concise questions to resolve ambiguities. *Format these as a separate section labeled "Clarifying Questions."* Do *not* proceed with test case generation until the questions are answered.
6. **Error Handling:** If you encounter an error (e.g., unable to process an image), provide a clear and informative error message.
**Example Output (Illustrative):**
**(Assuming input is a screenshot of a login form)**
**Test Cases:**
| Test Case ID | Test Case Title | Test Steps | Expected Result | Test Data | Priority | Type | Requirement Reference |
|--------------|--------------------------|-----------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|----------------------|----------|-------------|-----------------------|
| TC-001 | Valid Login | 1. Enter valid username. 2. Enter valid password. 3. Click the 'Login' button. | User is successfully logged in and redirected to the dashboard. | Username: testuser | High | Functional | Login-001 |
| | | | | Password: password123 | | | |
| TC-002 | Invalid Username | 1. Enter invalid username. 2. Enter valid password. 3. Click the 'Login' button. | Error message displayed: "Invalid username or password." User remains on the login page. | Username: invaliduser | High | Functional | Login-001 |
| | | | | Password: password123 | | | |
| TC-003 | Empty Username Field | 1. Leave the username field blank. 2. Enter valid password. 3. Click 'Login'. | Error message displayed: "Username is required." User remains on the login page. | Password: password123 | High | Functional | Login-001 |
| TC-004 | Password Field Masking | 1. Enter characters into the password field. | Characters are masked (e.g., displayed as dots or asterisks). | Any characters | Medium | UI | Login-002 |
| TC-005 | Forgot Password Link | 1. Click the "Forgot Password" link. | User is redirected to the "Forgot Password" page. | N/A | Medium | Functional | Login-003 |
| TC-006 | Check color contrast | 1. Inspect the text and background colors. | Text meets WCAG AA standard for color contrast. | N/A | High | Accessibility | Login-004 |
**Assumptions:**
* The dashboard is the expected landing page after successful login.
* The "Forgot Password" link exists (it might be present in the provided image).
* The system is using the most current WCAG standards
**Rationale:**
* TC-001 and TC-002 are high priority because they test the core login functionality.
* TC-003 checks for required field validation.
* TC-004 is a UI test to ensure password security.
* TC-006 ensure that the text is readable by users.
**Clarifying Questions:**
* None at this time.
---
**Key Design Choices and Explanations:**
* **TestCraft AI Persona:** Giving the model a specific name and role helps to reinforce its purpose and limit its responses to the testing domain.
* **Comprehensive Capabilities:** The prompt explicitly lists the required skills (OCR, object detection, etc.) to ensure the model is capable of handling the input.
* **Structured Output:** The required test case format is clearly defined, promoting consistency and readability.
* **Prioritization and Rationale:** The model is explicitly instructed to prioritize and explain its reasoning, making the output more useful and insightful.
* **Contextual Understanding:** The model is encouraged to understand the *purpose* of the software, leading to more relevant test cases.
* **Ambiguity Handling:** The model is instructed to handle incomplete or ambiguous input gracefully by making assumptions and asking clarifying questions.
* **Optional Fields:** Priority and type fields are added in the test case structure.
* **Basic Testing Types:** Includes basic Performance and Security Testing.
**Potential Limitations and Mitigation Strategies:**
* **Limited "Real-World" Interaction:** The model cannot interact with a live application. It can only generate test cases based on static input. *Mitigation:* Clearly state this limitation.
* **Performance and Security Testing:** The model's capabilities in these areas are limited to generating basic test *ideas*. It cannot execute these tests. *Mitigation:* Explicitly state this limitation.
* **OCR and Object Detection Accuracy:** The accuracy of OCR and object detection may vary depending on the quality of the input images. *Mitigation:* Provide clear error messages if processing fails. Encourage users to provide high-quality images.
* **Complex Logic:** Interpreting complex business logic from images and PDFs may be challenging. *Mitigation:* The model should ask clarifying questions when necessary. Focus on clear and well-structured input documents.
* **"Hallucination":** Like all LLMs, there's a risk of the model generating incorrect or nonsensical information. *Mitigation:* Thorough testing and validation of the model's output are crucial. Encourage user feedback to identify and correct errors.
This comprehensive system prompt provides a strong foundation for building a powerful and effective software testing model. Remember to thoroughly test and refine the model's output based on real-world usage and feedback.
"""
            )
        }
        # Select the appropriate prompt; unknown keys fall back to an error text.
        selected_prompt = prompts.get(prompt_option, "Invalid prompt selected.")
        context = ""
        if doc_state.current_doc_images and doc_state.current_doc_text:
            context = "\nDocument context:\n" + doc_state.current_doc_text
        full_prompt = selected_prompt + context

        # Build the message payload in the OpenAI multimodal format: the
        # content field is a list — one text object, plus (if available) an
        # image_url object.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": full_prompt
                    }
                ]
            }
        ]

        # If an image is available, encode it as a base64 PNG data URI and
        # append it as an image_url entry (many APIs accept data URIs in
        # place of a public URL).
        if doc_state.current_doc_images:
            buffered = io.BytesIO()
            doc_state.current_doc_images[0].save(buffered, format="PNG")
            img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
            data_uri = f"data:image/png;base64,{img_b64}"
            messages[0]["content"].append({
                "type": "image_url",
                "image_url": {"url": data_uri}
            })

        # Call the inference API with streaming enabled.
        stream = client.chat.completions.create(
            model="google/gemini-2.0-pro-exp-02-05:free",
            messages=messages,
            max_tokens=max_new_tokens,
            stream=True
        )
        buffer = ""
        for chunk in stream:
            # FIX: some streamed chunks carry no choices (keep-alive/comment
            # chunks from OpenRouter) and delta.content can be None on
            # role-only or final chunks; the original did `buffer += delta`
            # unconditionally, raising TypeError and aborting the stream.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if not delta:
                continue
            buffer += delta
            time.sleep(0.01)  # small pause so the UI renders smoothly
            yield buffer
    except Exception as e:
        logger.error(f"Error in bot_streaming: {str(e)}")
        yield "An error occurred while processing your request. Please try again."
def clear_context():
    """Wipe the shared document state (images, text, and type).

    Returns:
        A status string shown in the upload-status textbox.
    """
    doc_state.clear()
    return "Document context cleared. You can upload a new document."
# -------------------------------
# Create the Gradio Interface
# -------------------------------
# NOTE(review): the source paste lost its indentation, so the exact nesting of
# the buttons/output under the second Row is reconstructed — confirm against
# the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown("# Document Analyzer with Predetermined Prompts")
    gr.Markdown("Upload a PDF or image (PNG, JPG, JPEG, GIF, BMP, WEBP) and select a prompt to analyze its contents.")

    # Upload row: file picker plus a read-only status box.
    with gr.Row():
        file_upload = gr.File(
            label="Upload Document",
            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp"]
        )
        upload_status = gr.Textbox(label="Upload Status", interactive=False)

    # Prompt selection row.
    with gr.Row():
        prompt_dropdown = gr.Dropdown(
            label="Select Prompt",
            choices=["Software Tester"],
            value="Software Tester"
        )

    generate_btn = gr.Button("Generate")
    clear_btn = gr.Button("Clear Document Context")
    output_text = gr.Textbox(label="Output", interactive=False)

    # Event wiring: upload -> status, generate -> streamed output,
    # clear -> status.
    file_upload.change(fn=process_uploaded_file, inputs=[file_upload], outputs=[upload_status])
    generate_btn.click(fn=bot_streaming, inputs=[prompt_dropdown], outputs=[output_text])
    clear_btn.click(fn=clear_context, outputs=[upload_status])

demo.launch(debug=True)