g0th committed on
Commit 62f0ba5 · verified · 1 Parent(s): af446f8

Update app.py

Files changed (1)
  1. app.py +32 -43
app.py CHANGED
@@ -1,18 +1,23 @@
 import os
 import json
-import requests
 from PIL import Image
 import torch
 import gradio as gr
+from transformers import (
+    BlipImageProcessor,
+    AutoTokenizer,
+    Llama4ForConditionalGeneration,
+)
 from ppt_parser import transfer_to_structure
-from transformers import AutoProcessor, Llama4ForConditionalGeneration
 
-# ✅ Hugging Face token
+# ✅ Load Hugging Face token
 hf_token = os.getenv("HF_TOKEN")
 model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 
-# ✅ Load model & processor
-processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
+# ✅ Load image processor, tokenizer, and model manually
+image_processor = BlipImageProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+
 model = Llama4ForConditionalGeneration.from_pretrained(
     model_id,
     token=hf_token,
@@ -21,7 +26,7 @@ model = Llama4ForConditionalGeneration.from_pretrained(
     torch_dtype=torch.bfloat16,
 )
 
-# ✅ Global storage
+# ✅ Global state
 extracted_text = ""
 image_paths = []
 
@@ -41,7 +46,6 @@ def extract_text_from_pptx_json(parsed_json: dict) -> str:
             text += para.get("text", "") + "\n"
     return text.strip()
 
-# ✅ Handle uploaded PPTX
 def handle_pptx_upload(pptx_file):
     global extracted_text, image_paths
     tmp_path = pptx_file.name
@@ -50,57 +54,42 @@ def handle_pptx_upload(pptx_file):
     extracted_text = extract_text_from_pptx_json(parsed_json)
     return extracted_text or "No readable text found in slides."
 
-# ✅ Multimodal Q&A using Scout
 def ask_llama(question):
     global extracted_text, image_paths
 
     if not extracted_text and not image_paths:
-        return "Please upload and extract a PPTX first."
-
-    # 🧠 Build multimodal chat messages
-    messages = [
-        {
-            "role": "user",
-            "content": [],
-        }
-    ]
-
-    # Add up to 2 images to prevent OOM
-    for path in image_paths[:2]:
-        messages[0]["content"].append({"type": "image", "image": Image.open(path)})
-
-    messages[0]["content"].append({
-        "type": "text",
-        "text": f"{extracted_text}\n\nQuestion: {question}"
-    })
-
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(model.device)
-
-    outputs = model.generate(**inputs, max_new_tokens=256)
-
-    response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
+        return "Please upload and extract a PPTX file first."
+
+    # ✅ Use the first image only (you can expand to multiple with batching)
+    image = Image.open(image_paths[0]).convert("RGB")
+    vision_inputs = image_processor(images=image, return_tensors="pt").to(model.device)
+
+    prompt = f"<|user|>\n{extracted_text}\n\nQuestion: {question}<|end|>\n<|assistant|>\n"
+    text_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    with torch.no_grad():
+        output = model.generate(
+            input_ids=text_inputs["input_ids"],
+            pixel_values=vision_inputs["pixel_values"],
+            max_new_tokens=256,
+        )
+
+    response = tokenizer.decode(output[0][text_inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
     return response.strip()
 
 # ✅ Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧠 Multimodal Llama 4 Scout Study Assistant")
+    gr.Markdown("## 🧠 Llama-4-Scout Multimodal Study Assistant")
 
     pptx_input = gr.File(label="📂 Upload PPTX File", file_types=[".pptx"])
-    extract_btn = gr.Button("📜 Extract Text + Images")
+    extract_btn = gr.Button("📜 Extract Text + Slides")
 
     extracted_output = gr.Textbox(label="📄 Slide Text", lines=10, interactive=False)
-
     extract_btn.click(handle_pptx_upload, inputs=[pptx_input], outputs=[extracted_output])
 
     question = gr.Textbox(label="❓ Ask a Question")
-    ask_btn = gr.Button("💬 Ask Llama 4 Scout")
-    ai_answer = gr.Textbox(label="🤖 Answer", lines=6)
+    ask_btn = gr.Button("💬 Ask Scout")
+    ai_answer = gr.Textbox(label="🤖 Llama Answer", lines=6)
 
     ask_btn.click(ask_llama, inputs=[question], outputs=[ai_answer])
 
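Note: the diff stops at the last changed line (new line 95), so the tail of app.py, including how the Blocks app is actually started, is not part of this commit view. As a minimal sketch only, assuming nothing but a launch call follows the `ask_btn.click(...)` wiring, the file would typically close with something like the lines below (illustrative, not taken from the commit):

# Hypothetical tail of app.py (not shown in the commit above)
if __name__ == "__main__":
    demo.launch()  # default Gradio launch; no extra arguments are needed on Spaces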