ignaciaginting committed on
Commit
721bdd5
·
verified ·
1 Parent(s): 3f6444a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -0
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pdfplumber
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
+
5
+ # Load the model
6
@st.cache_resource
def load_model():
    """Load the MinerU tokenizer and seq2seq model.

    Wrapped in ``st.cache_resource`` so Streamlit can reuse the loaded
    objects instead of reloading them on each script rerun.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        ``OFA-Sys/mineru-base`` checkpoint.
    """
    checkpoint = "OFA-Sys/mineru-base"
    # Tokenizer is loaded first, then the model, matching the return order.
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )
11
+
12
# Obtain the tokenizer/model pair used to answer questions below.
tokenizer, model = load_model()

# UI
st.title("📄 MinerU: Ask Questions from PDF")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
question = st.text_input("Enter your question:")

# Only run once both a PDF and a non-empty question have been provided.
if uploaded_file and question:
    with pdfplumber.open(uploaded_file) as pdf:
        # extract_text() can yield None (or "") for pages without a text
        # layer, e.g. scanned images; coerce to "" so joining cannot raise.
        # Pages are newline-separated so words at page boundaries do not
        # get glued together.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    if not text.strip():
        # Nothing to ground an answer on; skip the model call entirely.
        st.warning("No extractable text was found in this PDF.")
    else:
        # Prepare input for MinerU (usually expects a prompt)
        # Only the first 3000 characters are used: MinerU has token limit
        input_text = f"question: {question} context: {text[:3000]}"

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs, max_new_tokens=128)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        st.markdown(f"### Answer:\n{answer}")