|
import streamlit as st |
|
import pdfplumber |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
|
|
@st.cache_resource
def load_model():
    """Load and cache the tokenizer/model pair used for PDF question answering.

    Wrapped in ``st.cache_resource`` so the (slow) download and
    initialization run once per server process rather than on every
    Streamlit rerun.

    Returns:
        tuple: ``(tokenizer, model)`` for the "OFA-Sys/mineru-base"
        checkpoint.  NOTE(review): verify this checkpoint name exists on
        the Hub — cannot confirm from here.
    """
    checkpoint = "OFA-Sys/mineru-base"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )
|
|
|
tokenizer, model = load_model()


# NOTE(review): the "π" below looks like a mojibake'd emoji (e.g. 📄) —
# confirm the intended glyph before changing it.
st.title("π MinerU: Ask Questions from PDF")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
question = st.text_input("Enter your question:")

if uploaded_file and question:
    # Extract text from every page.  pdfplumber's extract_text() returns
    # None for pages without a text layer, so skip those instead of
    # crashing on `str + None`.  Collect pieces and join once (avoids
    # quadratic += concatenation); "\n" keeps page-boundary words apart.
    with pdfplumber.open(uploaded_file) as pdf:
        pieces = []
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                pieces.append(page_text)
        text = "\n".join(pieces)

    if not text:
        # e.g. a scanned-image PDF with no text layer — nothing to query.
        st.warning("No extractable text found in this PDF.")
    else:
        # Truncate the context so the prompt stays within the model's
        # input budget (tokenizer truncation below is a second guard).
        input_text = f"question: {question} context: {text[:3000]}"

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs, max_new_tokens=128)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        st.markdown(f"### Answer:\n{answer}")
|
|