Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -19,7 +19,6 @@ from transformers import (
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
-from pdf2image import convert_from_path
 
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
@@ -85,16 +84,6 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
 
-# Function to convert PDF to image
-def pdf_to_image(pdf_path):
-    """
-    Converts a single-page PDF to a PIL image.
-    """
-    images = convert_from_path(pdf_path)
-    if not images:
-        raise ValueError("Failed to convert PDF to image.")
-    return images[0]  # Return the first page
-
 # Function to generate text responses based on image input
 @spaces.GPU
 def generate_image(model_name: str,
@@ -240,37 +229,7 @@ def generate_video(model_name: str,
         time.sleep(0.01)
         yield buffer, buffer
 
-#
-@spaces.GPU
-def generate_pdf(model_name: str,
-                 text: str,
-                 pdf_path: str,
-                 max_new_tokens: int = 1024,
-                 temperature: float = 0.6,
-                 top_p: float = 0.9,
-                 top_k: int = 50,
-                 repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for single-page PDF input by converting it to an image.
-    """
-    try:
-        image = pdf_to_image(pdf_path)
-    except Exception as e:
-        yield f"Error converting PDF to image: {str(e)}", f"Error converting PDF to image: {str(e)}"
-        return
-    yield from generate_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty)
-
-# Function to save the output text to a Markdown file
-def save_to_md(output_text):
-    """
-    Saves the output text to a Markdown file and returns the file path for download.
-    """
-    file_path = f"result_{uuid.uuid4()}.md"
-    with open(file_path, "w") as f:
-        f.write(output_text)
-    return file_path
-
-# Define examples for image, video, and PDF inference
+# Define examples for image and video inference
 image_examples = [
     ["Solve the problem to find the value.", "images/1.jpg"],
     ["Explain the scene.", "images/6.JPG"],
@@ -283,12 +242,6 @@ image_examples = [
 video_examples = [
     ["Explain the video in detail.", "videos/1.mp4"],
     ["Explain the video in detail.", "videos/2.mp4"]
-
-]
-
-pdf_examples = [
-    ["Explain the content briefly.", "pdfs/1.pdf"],
-    ["What is the content about?", "pdfs/2.pdf"]
 ]
 
 # Added CSS to style the output area as a "Canvas"
@@ -333,15 +286,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                                        elem_classes="submit-btn")
                 gr.Examples(examples=video_examples,
                             inputs=[video_query, video_upload])
-            with gr.TabItem("PDF Inference"):
-                pdf_query = gr.Textbox(
-                    label="Query Input",
-                    placeholder="Enter your query here...")
-                pdf_upload = gr.File(label="Single Page PDF", type="filepath")
-                pdf_submit = gr.Button("Submit",
-                                       elem_classes="submit-btn")
-                gr.Examples(examples=pdf_examples,
-                            inputs=[pdf_query, pdf_upload])
 
         with gr.Accordion("Advanced options", open=False):
             max_new_tokens = gr.Slider(label="Max new tokens",
@@ -411,20 +355,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                              repetition_penalty
                          ],
                          outputs=[output, markdown_output])
-        pdf_submit.click(fn=generate_pdf,
-                         inputs=[
-                             model_choice, pdf_query, pdf_upload,
-                             max_new_tokens, temperature, top_p, top_k,
-                             repetition_penalty
-                         ],
-                         outputs=[output, markdown_output])
-
-    # Uncomment the following lines to enable download functionality(ps:no needed for now)
-    #download_btn.click(
-    #    fn=save_to_md,
-    #    inputs=output,
-    #    outputs=None
-    #)
 
 if __name__ == "__main__":
     demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
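This commit removes the whole PDF path (pdf_to_image, generate_pdf, the "PDF Inference" tab and its click wiring), leaving only image and video inference. A single-page PDF can still be run through the surviving image pipeline if it is converted to a PIL image on the caller's side first. Below is a minimal sketch of that pre-processing step, assuming pdf2image (and its poppler backend) is installed in the caller's environment; the helper name pdf_first_page_to_image and the illustrated generate_image call simply mirror the code deleted here and are not part of the app after this change.

# Hypothetical client-side helper mirroring the removed pdf_to_image();
# assumes pdf2image and its poppler dependency are available locally.
from pdf2image import convert_from_path
from PIL import Image

def pdf_first_page_to_image(pdf_path: str) -> Image.Image:
    """Convert a single-page PDF to a PIL image (first page only)."""
    pages = convert_from_path(pdf_path)
    if not pages:
        raise ValueError("Failed to convert PDF to image.")
    return pages[0]

# The resulting image can then be fed to the remaining image pipeline,
# e.g. (sketch only, reusing the signature shown in the removed generate_pdf):
# for partial, final in generate_image(model_name, "Explain the content briefly.",
#                                      pdf_first_page_to_image("pdfs/1.pdf")):
#     print(partial)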