import gradio as gr
import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration
from PIL import Image
import spaces
import tempfile
import requests
from PyPDF2 import PdfReader

# Check if we're running in a Hugging Face Space and if SPACES_ZERO_GPU is enabled
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Determine the device (GPU if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")

# Get the Hugging Face token from the environment
HF_TOKEN = os.environ.get("HF_TOKEN")

# Load the model and processor
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=HF_TOKEN,  # `use_auth_token` is deprecated in recent transformers releases
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,  # Let accelerate place the weights when CUDA is available
)
# With device_map="auto", accelerate has already placed the weights, and calling
# .to() on a dispatched model is unsupported; only move the model explicitly on CPU.
if device == "cpu":
    model.to(device)
processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)

# @spaces.GPU  # Use the free GPU provided by Hugging Face Spaces
# def predict(image, text):
#     # Prepare the input messages
#     messages = [
#         {"role": "user", "content": [
#             {"type": "image"},              # Specify that an image is provided
#             {"type": "text", "text": text}  # Add the user-provided text input
#         ]}
#     ]
#     # Create the input text using the processor's chat template
#     input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
#     # Process the inputs and move them to the appropriate device
#     inputs = processor(image, input_text, return_tensors="pt").to(device)
#     # Generate a response from the model
#     outputs = model.generate(**inputs, max_new_tokens=100)
#     # Decode the output to return the final response
#     response = processor.decode(outputs[0], skip_special_tokens=True)
#     return response

def extract_text_from_pdf(pdf_url):
    """Download a PDF from a URL and return the text extracted from all pages."""
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False) as temp_pdf:
            temp_pdf.write(response.content)
            temp_pdf_path = temp_pdf.name
        reader = PdfReader(temp_pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""  # extract_text() can return None for image-only pages
        os.remove(temp_pdf_path)
        return text
    except Exception as e:
        raise ValueError(f"Error extracting text from PDF: {str(e)}")
        # raise HTTPException(status_code=400, detail=f"Error extracting text from PDF: {str(e)}")

@spaces.GPU
def predict_text(text, url="https://arinsight.co/2024_FA_AEC_1200_GR1_GR2.pdf"):
    # Use the url argument instead of re-hardcoding the PDF address
    pdf_text = extract_text_from_pdf(url)
    text_combined = text + "\n\nExtracted Text from PDF:\n" + pdf_text
    # Prepare the input messages (text-only; the image branch is disabled above)
    messages = [{"role": "user", "content": [{"type": "text", "text": text_combined}]}]
    # Create the input text using the processor's chat template
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Process the inputs and move them to the model's device (not a hard-coded "cuda")
    inputs = processor(text=input_text, return_tensors="pt").to(device)
    # Generate a response from the model
    outputs = model.generate(**inputs, max_new_tokens=1024)
    # Decode the output to return the final response
    response = processor.decode(outputs[0], skip_special_tokens=True)
    return response
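# Long PDFs can push the combined prompt past the model's context window. A
# minimal optional guard in the same spirit (a sketch only: the helper name and
# the 8192-token budget are assumptions, not values from this Space):
def truncate_to_token_budget(text, max_tokens=8192):
    token_ids = processor.tokenizer.encode(text)
    if len(token_ids) <= max_tokens:
        return text
    # Keep the first max_tokens tokens and decode back to a string
    return processor.tokenizer.decode(token_ids[:max_tokens], skip_special_tokens=True)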
# Define the Gradio interface
interface = gr.Interface(
    fn=predict_text,
    inputs=[
        # gr.Image(type="pil", label="Image Input"),  # Image input (disabled in this text-only demo)
        gr.Textbox(label="Text Input")  # Textbox input with label
    ],
    outputs=gr.Textbox(label="Generated Response"),  # Output with a descriptive label
    title="Llama 3.2 11B Vision Instruct Demo",  # Title of the interface
    description="This demo uses Meta's Llama 3.2 11B Vision Instruct model to generate responses from a text prompt combined with text extracted from a PDF.",  # Short description
    theme="compact",  # Compact theme for a cleaner look
)

# Launch the interface
interface.launch(debug=True)
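# Quick local smoke test (assumes HF_TOKEN is set and access to the gated
# checkpoint has been granted; bypasses the UI and calls the handler directly):
# print(predict_text("Summarize the key points of this document."))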