#!/usr/bin/env python
import os
import re
import tempfile
from collections.abc import Iterator
from threading import Thread
import cv2
import gradio as gr
import spaces
import torch
from loguru import logger
from PIL import Image
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
import pandas as pd
import PyPDF2
##################################################
# Basic settings
##################################################
MAX_CONTENT_CHARS = 8000  # maximum number of characters passed to the model as text
model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
)
MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
##################################################
# 1) CSV, TXT, and PDF analysis functions (with empty-file guards)
##################################################
def analyze_csv_file(path: str) -> str:
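    """Read a CSV file and return its contents as plain text, truncated to MAX_CONTENT_CHARS.

    Example (path is illustrative): analyze_csv_file("data.csv") returns
    "**[CSV File: data.csv]**" followed by the table rendered as text.
    """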
    try:
        df = pd.read_csv(path)
        df_str = df.to_string().strip()
        if not df_str:
            df_str = "(CSV is empty)"
        if len(df_str) > MAX_CONTENT_CHARS:
            df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
        return f"**[CSV File: {os.path.basename(path)}]**\n\n{df_str}"
    except Exception as e:
        return f"Failed to read CSV ({os.path.basename(path)}): {str(e)}"
def analyze_txt_file(path: str) -> str:
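    """Read a UTF-8 text file and return its contents, truncated to MAX_CONTENT_CHARS."""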
    try:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read().strip()
        if not text:
            text = "(TXT is empty)"
        if len(text) > MAX_CONTENT_CHARS:
            text = text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
        return f"**[TXT File: {os.path.basename(path)}]**\n\n{text}"
    except Exception as e:
        return f"Failed to read TXT ({os.path.basename(path)}): {str(e)}"
def pdf_to_markdown(pdf_path: str) -> str:
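    """Extract text from a PDF with PyPDF2 and format it as per-page Markdown sections."""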
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            chunks = []
            for page_num, page in enumerate(reader.pages, start=1):
                ptext = (page.extract_text() or "").strip()
                if ptext:
                    chunks.append(f"## Page {page_num}\n\n{ptext}\n")
        full_text = "\n".join(chunks).strip()
        if not full_text:
            full_text = "(PDF is empty)"
        if len(full_text) > MAX_CONTENT_CHARS:
            full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
        return f"**[PDF File: {os.path.basename(pdf_path)}]**\n\n{full_text}"
    except Exception as e:
        return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
##################################################
# 2) Image/video upload limits
##################################################
def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
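    """Count images and videos among newly uploaded paths (.mp4 counts as video, everything else as image)."""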
    image_count = 0
    video_count = 0
    for path in paths:
        if path.endswith(".mp4"):
            video_count += 1
        else:
            image_count += 1
    return image_count, video_count
def count_files_in_history(history: list[dict]) -> tuple[int, int]:
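    """Count images and videos referenced by file paths in earlier user turns."""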
    image_count = 0
    video_count = 0
    for item in history:
        # Skip assistant turns and plain-string (text-only) content.
        if item["role"] != "user" or isinstance(item["content"], str):
            continue
        file_path = item["content"][0]
        if file_path.endswith(".mp4"):
            video_count += 1
        else:
            image_count += 1
    return image_count, video_count
def validate_media_constraints(message: dict, history: list[dict]) -> bool:
"""
์ด๋ฏธ์ง€/๋น„๋””์˜ค ๊ฐœ์ˆ˜ ์ œํ•œ
"""
media_files = []
for f in message["files"]:
if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
media_files.append(f)
new_image_count, new_video_count = count_files_in_new_message(media_files)
history_image_count, history_video_count = count_files_in_history(history)
image_count = history_image_count + new_image_count
video_count = history_video_count + new_video_count
# ๋น„๋””์˜ค 1๊ฐœ ์ดˆ๊ณผ ๋ถˆ๊ฐ€
if video_count > 1:
gr.Warning("Only one video is supported.")
return False
# ๋น„๋””์˜ค+์ด๋ฏธ์ง€ ํ˜ผํ•ฉ ๋ถˆ๊ฐ€
if video_count == 1:
if image_count > 0:
gr.Warning("Mixing images and videos is not allowed.")
return False
if "<image>" in message["text"]:
gr.Warning("Using <image> tags with video files is not supported.")
return False
# ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜ ์ œํ•œ
if video_count == 0 and image_count > MAX_NUM_IMAGES:
gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
return False
# <image> ํƒœ๊ทธ ์ˆ˜์™€ ์ด๋ฏธ์ง€ ํŒŒ์ผ ์ˆ˜ ์ผ์น˜
if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
gr.Warning("The number of <image> tags in the text does not match the number of images.")
return False
return True
##################################################
# 3) Video processing
##################################################
def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
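    """Sample a video at roughly three frames per second.

    Returns a list of (PIL.Image, timestamp-in-seconds) pairs, e.g.
    [(<frame>, 0.0), (<frame>, 0.33), ...] for a 30 fps clip.
    """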
    vidcap = cv2.VideoCapture(video_path)
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0  # assume 30 fps if the FPS metadata is missing
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_interval = max(int(fps / 3), 1)  # guard against a zero step for low-FPS videos
    frames = []
    for i in range(0, total_frames, frame_interval):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames
def process_video(video_path: str) -> list[dict]:
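    """Convert a video into alternating "Frame <t>:" labels and frame images saved as temporary PNGs."""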
    content = []
    frames = downsample_video(video_path)
    for pil_image, timestamp in frames:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            pil_image.save(temp_file.name)
            content.append({"type": "text", "text": f"Frame {timestamp}:"})
            content.append({"type": "image", "url": temp_file.name})
    return content
##################################################
# 4) Interleaved <image> handling
##################################################
def process_interleaved_images(message: dict) -> list[dict]:
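    """Split the message text on <image> tags and interleave the text segments with the uploaded images."""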
    # Index only image files so <image> tags stay aligned even when documents were uploaded too.
    image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
    parts = re.split(r"(<image>)", message["text"])
    content = []
    image_index = 0
    for part in parts:
        if part == "<image>":
            content.append({"type": "image", "url": image_files[image_index]})
            image_index += 1
        elif part.strip():
            content.append({"type": "text", "text": part.strip()})
        elif part:
            # Keep whitespace-only fragments so spacing between segments survives.
            content.append({"type": "text", "text": part})
    return content
##################################################
# 5) CSV/PDF/TXT are passed as extracted text; images/videos are passed as file paths
##################################################
def process_new_user_message(message: dict) -> list[dict]:
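    """Build the user-turn content list: document files become extracted text, media files keep their paths."""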
    user_text = (message["text"] or "").strip() or "(No text)"
    if not message["files"]:
        return [{"type": "text", "text": user_text}]
    video_files = [f for f in message["files"] if f.endswith(".mp4")]
    image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
    csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
    txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
    pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
    content_list = [{"type": "text", "text": user_text}]
    # CSV
    for csv_path in csv_files:
        csv_analysis = analyze_csv_file(csv_path)
        if not csv_analysis.strip():
            csv_analysis = "(No CSV content?)"
        content_list.append({"type": "text", "text": csv_analysis})
    # TXT
    for txt_path in txt_files:
        txt_analysis = analyze_txt_file(txt_path)
        if not txt_analysis.strip():
            txt_analysis = "(No TXT content?)"
        content_list.append({"type": "text", "text": txt_analysis})
    # PDF
    for pdf_path in pdf_files:
        pdf_md = pdf_to_markdown(pdf_path)
        if not pdf_md.strip():
            pdf_md = "(No PDF content?)"
        content_list.append({"type": "text", "text": pdf_md})
    if video_files:
        # Only the first video is processed.
        content_list += process_video(video_files[0])
        return content_list
    if "<image>" in user_text:
        # <image> tags take precedence; on this path only the tagged text and images are sent.
        return process_interleaved_images(message)
    # Plain images: append each one after the text.
    for img_path in image_files:
        content_list.append({"type": "image", "url": img_path})
    return content_list
##################################################
# 6) Convert chat history to LLM messages (non-image file paths are dropped)
##################################################
def process_history(history: list[dict]) -> list[dict]:
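    """Convert Gradio chat history into the model's message format, grouping consecutive user items into one turn."""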
    messages = []
    current_user_content = []
    for item in history:
        if item["role"] == "assistant":
            if current_user_content:
                messages.append({"role": "user", "content": current_user_content})
                current_user_content = []
            messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
        else:
            # User turn: either plain text or a (file_path,) tuple.
            content = item["content"]
            if isinstance(content, str):
                current_user_content.append({"type": "text", "text": content})
            else:
                fpath = content[0]
                # Keep only images and .mp4 files; drop everything else.
                if re.search(r"\.(png|jpg|jpeg|gif|webp)$", fpath, re.IGNORECASE) or fpath.endswith(".mp4"):
                    current_user_content.append({"type": "image", "url": fpath})
    # Flush a trailing user turn that has no assistant reply yet.
    if current_user_content:
        messages.append({"role": "user", "content": current_user_content})
    return messages
##################################################
# 7) Main inference (guards against empty token input)
##################################################
@spaces.GPU(duration=120)
def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
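    """Validate the request, assemble the prompt, and stream the model's reply incrementally."""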
    if not validate_media_constraints(message, history):
        yield ""
        return
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
    messages.extend(process_history(history))
    user_content = process_new_user_message(message)
    messages.append({"role": "user", "content": user_content})
    # 1) Render the chat template as text (tokenize=False), then check the token length.
    # NOTE: this text-only path renders image entries as placeholder tokens; pixel data
    # is not passed to the model here.
    raw_text = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    token_ids = processor.tokenizer.encode(raw_text, add_special_tokens=False)
    if len(token_ids) == 0:
        # Empty input: add a placeholder phrase so generation has something to work with.
        raw_text += " (No content?)"
        token_ids = processor.tokenizer.encode(raw_text, add_special_tokens=False)
    # 2) Tokenize for real.
    inputs = processor.tokenizer(
        raw_text,
        return_tensors="pt",
        padding=True,
    )
    # Move tensors to the model device; token IDs and attention masks must stay integer-typed,
    # so do not cast them to bfloat16.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # 3) Streaming generation on a background thread.
    streamer = TextIteratorStreamer(processor.tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = {
        "inputs": inputs["input_ids"],
        "attention_mask": inputs.get("attention_mask"),
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.95,
    }
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}
    t = Thread(target=model.generate, kwargs=gen_kwargs)
    t.start()
    output = ""
    for chunk in streamer:
        output += chunk
        yield output
##################################################
# 8) Examples
##################################################
examples = [
    [
        {
            "text": "Summarize and analyze the contents of the PDF file.",
            "files": ["assets/additional-examples/pdf.pdf"],
        }
    ],
    [
        {
            "text": "Summarize and analyze the contents of the CSV file.",
            "files": ["assets/additional-examples/sample-csv.csv"],
        }
    ],
    [
        {
            "text": "Write matplotlib code that draws the same bar chart.",
            "files": ["assets/additional-examples/barchart.png"],
        }
    ],
    [
        {
            "text": "What is unusual about this video?",
            "files": ["assets/additional-examples/tmp.mp4"],
        }
    ],
    [
        {
            "text": "I already have this supplement <image> and I am thinking of buying this product <image>. Is there anything to watch out for when taking them together?",
            "files": ["assets/additional-examples/pill1.png", "assets/additional-examples/pill2.png"],
        }
    ],
    [
        {
            "text": "Write a poem inspired by the visual elements of the images.",
            "files": ["assets/sample-images/06-1.png", "assets/sample-images/06-2.png"],
        }
    ],
    [
        {
            "text": "Compose a short piece of music based on the visual elements of the images.",
            "files": [
                "assets/sample-images/07-1.png",
                "assets/sample-images/07-2.png",
                "assets/sample-images/07-3.png",
                "assets/sample-images/07-4.png",
            ],
        }
    ],
    [
        {
            "text": "Make up a short story about what might have happened in this house.",
            "files": ["assets/sample-images/08.png"],
        }
    ],
    [
        {
            "text": "Create a short story based on the sequence of the images.",
            "files": [
                "assets/sample-images/09-1.png",
                "assets/sample-images/09-2.png",
                "assets/sample-images/09-3.png",
                "assets/sample-images/09-4.png",
                "assets/sample-images/09-5.png",
            ],
        }
    ],
    [
        {
            "text": "Imagine and describe the creatures that might live in this world.",
            "files": ["assets/sample-images/10.png"],
        }
    ],
    [
        {
            "text": "Read the text written in the image.",
            "files": ["assets/additional-examples/1.png"],
        }
    ],
    [
        {
            "text": "When was this ticket issued, and what is its price?",
            "files": ["assets/additional-examples/2.png"],
        }
    ],
    [
        {
            "text": "Transcribe the text in the image exactly as written, formatted as Markdown.",
            "files": ["assets/additional-examples/3.png"],
        }
    ],
    [
        {
            "text": "Solve this integral.",
            "files": ["assets/additional-examples/4.png"],
        }
    ],
    [
        {
            "text": "Describe this image with a brief caption.",
            "files": ["assets/sample-images/01.png"],
        }
    ],
    [
        {
            "text": "What message is written on this sign?",
            "files": ["assets/sample-images/02.png"],
        }
    ],
    [
        {
            "text": "Compare the two images and describe their similarities and differences.",
            "files": ["assets/sample-images/03.png"],
        }
    ],
    [
        {
            "text": "List every object visible in the image along with its color.",
            "files": ["assets/sample-images/04.png"],
        }
    ],
    [
        {
            "text": "Describe the mood of the scene.",
            "files": ["assets/sample-images/05.png"],
        }
    ],
]
demo = gr.ChatInterface(
    fn=run,
    type="messages",
    chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
    textbox=gr.MultimodalTextbox(
        file_types=[
            ".png", ".jpg", ".jpeg", ".gif", ".webp",
            ".mp4", ".csv", ".txt", ".pdf",
        ],
        file_count="multiple",
        autofocus=True,
    ),
    multimodal=True,
    additional_inputs=[
        gr.Textbox(
            label="System Prompt",
            value="You are a deeply thoughtful AI. Consider problems thoroughly and derive correct solutions through systematic reasoning. Please answer in Korean.",
        ),
        gr.Slider(label="Max New Tokens", minimum=100, maximum=8000, step=50, value=2000),
    ],
    stop_btn=False,
    title="Gemma 3 27B IT",
    examples=examples,
    run_examples_on_click=False,
    cache_examples=False,
    css_paths="style.css",
    delete_cache=(1800, 1800),
)
if __name__ == "__main__":
    demo.launch()