Daemontatox committed
Commit edaf4b6 · verified · 1 Parent(s): c7c8ce5

Update app.py

Files changed (1):
  1. app.py +23 -31
app.py CHANGED
@@ -11,9 +11,9 @@ from openai import OpenAI  # Use the OpenAI client that supports multimodal messages
 # Load API key from environment variable (secrets)
 HF_API_KEY = os.getenv("OPENAI_TOKEN")
 if not HF_API_KEY:
-    raise ValueError("HF_API_KEY environment variable not set")
+    raise ValueError("OPENAI_TOKEN environment variable not set")
 
-# Create the client pointing to the Hugging Face Inference endpoint
+# Create the client pointing to the inference endpoint (e.g., OpenRouter)
 client = OpenAI(
     base_url="https://openrouter.ai/api/v1",
     api_key=HF_API_KEY
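
Note: the corrected error message now names the variable that is actually read (OPENAI_TOKEN), though the Python variable holding it is still called HF_API_KEY. A minimal sketch of the same setup with a small lookup helper, assuming one wanted to accept more than one key name; the helper and the OPENROUTER_API_KEY fallback are illustrative, not part of this app:

    import os
    from openai import OpenAI

    def read_api_key(*names: str) -> str:
        # Illustrative helper: return the first non-empty variable among `names`.
        for name in names:
            value = os.getenv(name)
            if value:
                return value
        raise ValueError(f"None of {names} are set")

    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=read_api_key("OPENAI_TOKEN", "OPENROUTER_API_KEY"),
    )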
@@ -50,15 +50,13 @@ def process_pdf_file(file_path):
             page = doc[page_num]
             page_text = page.get_text("text")
             if page_text.strip():
-                text += f"Page {page_num + 1}:\n{page_text}\n\n"
-
+                text += f"Page {page_num+1}:\n{page_text}\n\n"
             # Render page as an image with a zoom factor
             zoom = 3
             mat = fitz.Matrix(zoom, zoom)
             pix = page.get_pixmap(matrix=mat, alpha=False)
             img_data = pix.tobytes("png")
             img = Image.open(io.BytesIO(img_data)).convert("RGB")
-
             # Resize if image is too large
             max_size = 1600
             if max(img.size) > max_size:
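
Note: the hunk cuts off just before the resize branch. For context, a self-contained sketch of the full render-and-downscale step, assuming the resize uses Pillow; the helper name and the thumbnail() call are assumptions, since the actual resize code lies outside this hunk:

    import io
    import fitz  # PyMuPDF
    from PIL import Image

    def render_page(page: fitz.Page, zoom: int = 3, max_size: int = 1600) -> Image.Image:
        # Render at `zoom`x resolution, then downscale if either side exceeds max_size.
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
        if max(img.size) > max_size:
            img.thumbnail((max_size, max_size), Image.LANCZOS)  # keeps aspect ratio
        return img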
@@ -140,31 +138,24 @@ predetermined_prompts = {
 # -------------------------------
 def chat_respond(user_message, history, prompt_option):
     """
-    Append the user message (or, if starting a new conversation and no message is provided,
-    use the predetermined prompt) to the conversation history; build the API call using
-    the full conversation history (and the image if available); stream back the assistant response
-    while updating the history.
-
+    Append the user message to the conversation history and call the API.
+    In case of an API error (such as unauthorized access), return an error message.
     The history is a list of [user_text, assistant_text] pairs.
     """
-    # If this is the first message, add the predetermined prompt text.
+    # If this is the first message and no message is provided, use the predetermined prompt.
     if history == []:
-        # If user_message is empty, use the predetermined prompt.
         if not user_message.strip():
             user_message = predetermined_prompts.get(prompt_option, "Hello")
         else:
-            # Optionally, prepend the predetermined prompt.
             user_message = predetermined_prompts.get(prompt_option, "") + "\n" + user_message
 
-    # Append the new user message with an empty assistant response.
     history = history + [[user_message, ""]]
 
-    # Build the messages list (for the multimodal API) from the conversation history.
+    # Build the messages list for the multimodal API from the conversation history.
     messages = []
     for i, (user_msg, assistant_msg) in enumerate(history):
-        # For the user message:
         user_content = [{"type": "text", "text": user_msg}]
-        # For the very first user message, if an image was uploaded, append the image.
+        # For the very first user message, attach the image if available.
         if i == 0 and doc_state.current_doc_images:
             buffered = io.BytesIO()
             doc_state.current_doc_images[0].save(buffered, format="PNG")
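
Note: between this hunk and the next, the PNG bytes are evidently base64-encoded into the data_uri used below. A sketch of that step as a standalone helper; the function name is illustrative, and the content-part shape matches the image_url entry visible in the next hunk:

    import base64
    import io
    from PIL import Image

    def image_content_part(img: Image.Image) -> dict:
        # Encode a PIL image as a base64 PNG data URI, OpenAI-style content part.
        buffered = io.BytesIO()
        img.save(buffered, format="PNG")
        b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
        return {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}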
@@ -175,34 +166,31 @@ def chat_respond(user_message, history, prompt_option):
                 "image_url": {"url": data_uri}
             })
         messages.append({"role": "user", "content": user_content})
-        # For the assistant response, if available.
         if assistant_msg:
             messages.append({
                 "role": "assistant",
                 "content": [{"type": "text", "text": assistant_msg}]
             })
 
-    # Call the inference API with streaming enabled.
+    # Try to call the API with streaming enabled.
     try:
         stream = client.chat.completions.create(
-            model="google/gemini-2.0-pro-exp-02-05:free",
+            model="google/gemini-2.0-flash-lite-preview-02-05:free",
             messages=messages,
             max_tokens=8192,
             stream=True
         )
     except Exception as e:
         logger.error(f"Error calling the API: {str(e)}")
-        history[-1][1] = "An error occurred while processing your request. Please try again."
+        history[-1][1] = "An error occurred while processing your request. Please check your API credentials."
         yield history, history
+        return
 
-    # Stream and update the assistant's reply token by token.
     buffer = ""
     for chunk in stream:
         delta = chunk.choices[0].delta.content
         buffer += delta
-        # Update the assistant part of the latest message in the history.
         history[-1][1] = buffer
-        # Yield the updated chat history (for the Chatbot component) and the state.
         yield history, history
         time.sleep(0.01)
 
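
Note: the added return is the key fix in this hunk. Without it, the generator falls through to `for chunk in stream` after the error yield and raises NameError, since stream was never assigned. A further hardening worth sketching: in OpenAI-style streams, delta.content can be None (for example on role-only or final chunks), so the unguarded `buffer += delta` can raise TypeError. The `or ""` guard below is a suggestion, not part of this commit:

    def stream_reply(stream, history):
        # Hypothetical extraction of the streaming loop, with a None guard.
        buffer = ""
        for chunk in stream:
            delta = chunk.choices[0].delta.content or ""  # content may be None
            buffer += delta
            history[-1][1] = buffer  # update the latest assistant slot
            yield history, history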
 
@@ -212,11 +200,11 @@ def chat_respond(user_message, history, prompt_option):
 # Create the Gradio Interface
 # -------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Vision Software Testing Chatbot")
+    gr.Markdown("# Document Analyzer & Software Testing Chatbot")
     gr.Markdown(
         "Upload a PDF or an image (PNG, JPG, JPEG, GIF, BMP, WEBP). Then choose a prompt from the dropdown. "
         "For example, select **Software Tester** to have the bot analyze an image of a software interface "
-        "and generate test cases. Chat with the bot in the conversation below."
+        "and generate test cases. You can also chat with the model—the conversation history is preserved."
     )
 
     with gr.Row():
@@ -230,6 +218,7 @@ with gr.Blocks() as demo:
230
  prompt_dropdown = gr.Dropdown(
231
  label="Select Prompt",
232
  choices=[
 
233
  "Software Tester"
234
  ],
235
  value="Software Tester"
@@ -244,16 +233,19 @@ with gr.Blocks() as demo:
 
     # State to hold the conversation history
     chat_state = gr.State([])
-
+
     # When a file is uploaded, process it.
     file_upload.change(fn=process_uploaded_file, inputs=file_upload, outputs=upload_status)
 
-    # Clear both the document context and chat history.
+    # Clear both the document context and the chat history.
     clear_btn.click(fn=clear_context, outputs=[upload_status, chat_state])
 
     # When the user clicks Send, process the message and update the chat.
-    send_btn.click(fn=chat_respond,
-                   inputs=[user_input, chat_state, prompt_dropdown],
-                   outputs=[chatbot, chat_state])
+    send_btn.click(
+        fn=chat_respond,
+        inputs=[user_input, chat_state, prompt_dropdown],
+        outputs=[chatbot, chat_state],
+        stream=True
+    )
 
 demo.launch(debug=True)
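
Note: chat_respond is a generator, and Gradio streams each yielded (chatbot, state) pair to the UI on its own. As far as I know, Gradio event listeners such as .click() do not accept a stream keyword, so the stream=True added here may raise a TypeError depending on the installed version. A minimal wiring sketch without it, assuming Gradio 4.x:

    send_btn.click(
        fn=chat_respond,
        inputs=[user_input, chat_state, prompt_dropdown],
        outputs=[chatbot, chat_state],
    )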
 