# GLM-4.1V "BILL OF MATERIAL" PDF table-extraction demo (Hugging Face Space).
import argparse | |
import copy | |
import os | |
import re | |
import subprocess | |
import tempfile | |
import base64 | |
from pathlib import Path | |
import fitz | |
import gradio as gr | |
import time | |
import html | |
from openai import OpenAI | |
from s3_uploads import upload_to_s3 | |
from environs import env | |
# Module-level cancel flag: the streaming loops poll it so the UI can stop
# an in-flight generation.
stop_generation = False


def stream_from_vllm(messages):
    """Yield chat-completion deltas for *messages* from the hosted GLM-4.1V model.

    Opens a streaming chat completion against the Hugging Face router and
    yields each chunk's delta until the server finishes or the module-level
    ``stop_generation`` flag is set.
    """
    global stop_generation
    client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=env.str("HF_API_KEY"),
    )
    stream = client.chat.completions.create(
        model="THUDM/GLM-4.1V-9B-Thinking:novita",
        messages=messages,
        temperature=0.01,
        stream=True,
        max_tokens=8000,
    )
    for chunk in stream:
        if stop_generation:
            break
        choices = chunk.choices
        if choices and choices[0].delta:
            yield choices[0].delta
class GLM4VModel:
    """Prepares multimodal chat payloads for GLM-4.1V and renders streamed replies.

    The class is stateless: all methods are helpers for converting local files
    into chat-API content parts, building message lists from UI history, and
    incrementally rendering the model's streamed output as markdown.
    """

    # File-suffix -> MIME type map used when inlining images as data URIs.
    _MIME_BY_EXT = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
        ".webp": "image/webp",
    }

    # Suffix sets routing files to video vs. image content parts.
    _VIDEO_EXTS = {".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".m4v"}
    _IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"}

    def _strip_html(self, text: str) -> str:
        """Return *text* with all HTML/XML tags removed and whitespace trimmed."""
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        """Wrap plain *text* in the chat-API 'content parts' list structure."""
        return [{"type": "text", "text": text}]

    def _image_to_base64(self, image_path):
        """Encode the image at *image_path* as a base64 data URI.

        Unknown suffixes fall back to ``image/jpeg`` (same fallback as before).
        """
        encoded = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")
        mime = self._MIME_BY_EXT.get(Path(image_path).suffix.lower(), "image/jpeg")
        return f"data:{mime};base64,{encoded}"

    def _pdf_to_imgs(self, pdf_path):
        """Rasterise each page of *pdf_path* to a 180-dpi PNG in the temp dir.

        Returns the list of generated image paths, one per page.
        """
        doc = fitz.open(pdf_path)
        imgs = []
        try:
            for i in range(doc.page_count):
                pix = doc.load_page(i).get_pixmap(dpi=180)
                img_p = os.path.join(tempfile.gettempdir(), f"{Path(pdf_path).stem}_{i}.png")
                pix.save(img_p)
                imgs.append(img_p)
        finally:
            # Close the document even if a page fails to render (was leaked
            # on error before).
            doc.close()
        return imgs

    def _ppt_to_imgs(self, ppt_path):
        """Convert a PowerPoint deck to one PNG per slide.

        Uses headless LibreOffice to produce a PDF, then rasterises it.
        Raises ``subprocess.CalledProcessError`` if the conversion fails.
        """
        tmp = tempfile.mkdtemp()
        subprocess.run(
            ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmp, ppt_path],
            check=True,
        )
        pdf_path = os.path.join(tmp, Path(ppt_path).stem + ".pdf")
        return self._pdf_to_imgs(pdf_path)

    def _files_to_content(self, media):
        """Convert local media files into chat-API content parts.

        Videos and images are uploaded to S3 and referenced by URL; PDFs and
        PowerPoint decks are first rendered to per-page images. Files with
        unsupported suffixes are silently skipped.
        """
        out = []
        for f in media or []:
            ext = Path(f).suffix.lower()
            if ext in self._VIDEO_EXTS:
                out.append({"type": "video_url", "video_url": {"url": upload_to_s3(f)}})
            elif ext in self._IMAGE_EXTS:
                out.append({"type": "image_url", "image_url": {"url": upload_to_s3(f)}})
            elif ext in (".ppt", ".pptx"):
                for p in self._ppt_to_imgs(f):
                    out.append({"type": "image_url", "image_url": {"url": upload_to_s3(p)}})
            elif ext == ".pdf":
                for p in self._pdf_to_imgs(f):
                    out.append({"type": "image_url", "image_url": {"url": upload_to_s3(p)}})
        return out

    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = True):
        """Render the partial model output as markdown.

        The reasoning trace is shown inside a collapsible <details> block only
        when *skip_think* is False; by default only the answer is rendered.
        """
        think_html = ""
        answer_md = ""
        if reasoning_content and not skip_think:
            reasoning_content_clean = reasoning_content.strip()
            think_html = (
                "### 💭 Thinking\n"
                "<details open>\n"
                "<summary>Click to expand</summary>\n\n"
                f"{reasoning_content_clean}\n"
                "</details>\n"
            )
        if content:
            answer_md = content.strip()
        return think_html + "\n\n" + answer_md

    def _build_messages(self, raw_hist, sys_prompt):
        """Turn UI chat history into chat-API messages.

        Assistant turns are stripped of the rendered <details> reasoning block
        and any residual HTML so only clean answer text is sent back to the
        model; empty assistant turns are dropped.
        """
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": h["content"]})
            else:
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs

    def stream_generate(self, raw_hist, sys_prompt: str, *, skip_special_tokens: bool = False):
        """Stream rendered markdown fragments for the given history and prompt.

        Accumulates reasoning and answer text from each delta (object or dict
        form) and yields the re-rendered markdown after every chunk. Streaming
        errors are surfaced as a final rendered fragment rather than raised.
        ``skip_special_tokens`` is accepted for interface compatibility but is
        currently unused.
        """
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""
        try:
            for delta in stream_from_vllm(msgs):
                if stop_generation:
                    break
                if isinstance(delta, dict):
                    # Some transports deliver plain dicts instead of objects;
                    # a single dict chunk may carry both fields.
                    reasoning_buffer += delta.get("reasoning_content") or ""
                    content_buffer += delta.get("content") or ""
                elif getattr(delta, "reasoning_content", None):
                    reasoning_buffer += delta.reasoning_content
                elif getattr(delta, "content", None):
                    content_buffer += delta.content
                # (The original had an unreachable duplicate "content" check
                # inside the dict branch; removed.)
                yield self._stream_fragment(reasoning_buffer, content_buffer)
        except Exception as e:
            error_msg = f"Error during streaming: {str(e)}"
            yield self._stream_fragment("", error_msg)
# Single shared model-helper instance used by the Gradio callback below.
glm4v = GLM4VModel()

# System prompt constraining the model to emit only the bill-of-material
# table, as a markdown table. NOTE(review): the "METERIAL" spelling is kept
# verbatim — presumably it matches the literal header text on the source
# drawings; confirm with a sample drawing before correcting it.
sys_prompt = """Instructions:
Extract only "BILL OF METERIAL" table containing columns same as it is!
colums: (POSITION, DESCRIPTION, N PIECES, MATERIAL (like SA 516 Gr.70N or SA 105 N), DIMENSIONS(like 1700 I.D. X 2045H 50 THK.), WT.Kgs
Ignore title blocks, revision notes, drawing numbers, and general annotations outside the "BILL OF METERIAL".
If a page contains multiple tables, extract only those explicitly related to BILL OF METERIAL.
Preserve the row and column structure as files.
Do not include any surrounding decorative lines or borders—only clean tabular data.
output format: markdown table format with following columns (POSITION, DESCRIPTION, N PIECES, MATERIAL, DIMENSIONS(like 1700 I.D. X 2045H 50 THK.) and WT.Kgs)"""
def extract_table_from_file(file):
    """Gradio callback: stream the extracted BILL OF MATERIAL table as markdown.

    Yields an interim status line, then each partial markdown render as the
    model streams its answer; errors are yielded as an HTML-escaped message.
    """
    if file is None:
        # This function is a generator, so the message must be *yielded*;
        # the original ``return "Please upload a file."`` never reached the UI.
        yield "Please upload a file."
        return
    # gr.File(type="filepath") passes a plain path string; older Gradio
    # versions pass a tempfile-like object with a ``.name`` attribute —
    # ``file.name`` alone raised AttributeError on a str.
    path = file if isinstance(file, str) else file.name
    payload = glm4v._files_to_content([path])
    raw_hist = [{"role": "user", "content": payload}]
    full_response = ""
    yield "<h2>🌀 Processing...</h2>\n"
    try:
        for chunk in glm4v.stream_generate(raw_hist, sys_prompt):
            full_response = chunk
            yield full_response
    except Exception as e:
        yield f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
# --- Gradio UI -----------------------------------------------------------
theme = gr.themes.Ocean(
    primary_hue="gray",
)

with gr.Blocks(title="demo", theme=theme) as demo:
    gr.Markdown(
        # Fixed: closing tag was "</div" (missing ">"), producing broken HTML.
        "<div style='text-align:center; margin-bottom:20px;'><h1> PDF Extraction Demo</h1></div>"
    )
    with gr.Row():
        with gr.Column():
            up = gr.File(label="Upload File", type="filepath")
            # NOTE(review): format_selector is not wired into the click
            # handler — output is always markdown regardless of the choice;
            # confirm whether CSV/JSON export was ever implemented.
            format_selector = gr.Radio(choices=["CSV", "JSON"], label="Output Format", value="CSV")
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column():
            output_markdown = gr.Markdown(label="Extracted Table")
    submit_btn.click(
        extract_table_from_file,
        inputs=[up],
        outputs=[output_markdown],
    )
if __name__ == "__main__":
    # Launch the Gradio app only when run as a script (not on import).
    demo.launch()