ignaciaginting's picture
Create app.py
721bdd5 verified
raw
history blame
1.09 kB
import streamlit as st
import pdfplumber
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load the model
@st.cache_resource
def load_model(model_name: str = "OFA-Sys/mineru-base"):
    """Load the tokenizer and seq2seq model used to answer questions.

    Decorated with ``st.cache_resource`` so the (expensive) load is meant to
    happen once and be reused across Streamlit reruns rather than on every
    widget interaction.

    Args:
        model_name: Hub identifier or local path passed to
            ``from_pretrained`` for both the tokenizer and the model.
            Defaults to the checkpoint this app was written for.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model
tokenizer, model = load_model()

# Character budget for the PDF context placed in the prompt. This is only a
# coarse cap; the tokenizer call below also requests truncation.
MAX_CONTEXT_CHARS = 3000


def _extract_pdf_text(pdf_file):
    """Return the text of every page in *pdf_file*, pages separated by newlines.

    ``extract_text()`` may yield ``None`` for a page without a text layer
    (e.g. a scanned image) depending on the pdfplumber version, so such pages
    contribute an empty string instead of raising ``TypeError``.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


# UI
st.title("📄 MinerU: Ask Questions from PDF")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
question = st.text_input("Enter your question:")

if uploaded_file and question:
    text = _extract_pdf_text(uploaded_file)
    if not text.strip():
        # Nothing to answer from: avoid running the model on an empty context.
        st.warning("No extractable text was found in this PDF.")
    else:
        # Prompt layout kept from the original script ("question: ... context: ...").
        # NOTE(review): confirm this is the format the checkpoint expects.
        input_text = f"question: {question} context: {text[:MAX_CONTEXT_CHARS]}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs, max_new_tokens=128)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        st.markdown(f"### Answer:\n{answer}")