|
import streamlit as st |
|
import docx |
|
import PyPDF2 |
|
from transformers import pipeline |
|
import tempfile |
|
|
|
|
|
@st.cache_resource
def load_pipeline():
    """Build the Hugging Face question-answering pipeline.

    The function is wrapped in ``st.cache_resource`` so the pipeline object
    is reused instead of being rebuilt each time the script runs.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        "question-answering" task using the ``deepset/roberta-base-squad2``
        model.
    """
    model_name = "deepset/roberta-base-squad2"
    return pipeline("question-answering", model=model_name)
|
|
|
# Module-level handle to the question-answering pipeline; main() reads this
# global when the user asks a question.
qa_pipeline = load_pipeline()
|
|
|
def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source (the
            caller in this file passes the uploaded file object).

    Returns:
        A single string containing the extracted text of each page in
        order, with a newline appended after every page.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # ``or ""`` keeps a page that yields no text (e.g. an image-only page, if
    # the installed PyPDF2 returns None for it) from raising TypeError on the
    # string concatenation. join() avoids repeated ``+=`` on a growing string.
    return "".join(
        (page.extract_text() or "") + "\n" for page in pdf_reader.pages
    )
|
|
|
def read_word(file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file: The source handed straight to ``docx.Document``.

    Returns:
        The ``text`` of each entry in ``Document.paragraphs``, in order,
        each followed by a newline.
    """
    document = docx.Document(file)
    lines = [paragraph.text + "\n" for paragraph in document.paragraphs]
    return "".join(lines)
|
|
|
def extract_text(uploaded_file):
    """Extract text from an uploaded PDF or Word file.

    The parser is chosen from the lower-cased text after the last ``.`` in
    ``uploaded_file.name``.

    Args:
        uploaded_file: An object with a ``name`` attribute that can also be
            passed to ``read_pdf`` / ``read_word`` as the file source.

    Returns:
        The extracted text, or ``None`` (after showing a Streamlit error)
        when the extension is neither ``pdf`` nor ``docx``.
    """
    extension = uploaded_file.name.rsplit('.', 1)[-1].lower()

    if extension == 'pdf':
        return read_pdf(uploaded_file)
    if extension == 'docx':
        return read_word(uploaded_file)

    # Anything else is rejected with a visible message rather than an exception.
    st.error("Unsupported file type. Please upload a PDF or Word file.")
    return None
|
|
|
|
|
def main():
    """Render the Streamlit page: upload a file, preview it, answer questions.

    Flow:
        1. Show the title, instructions and a PDF/DOCX uploader.
        2. Extract the text of the uploaded file with ``extract_text``.
        3. Show up to the first 1000 characters as a preview.
        4. When "Get Answer" is pressed with a non-blank question, run the
           module-level ``qa_pipeline`` over the full text and show the
           answer; failures are reported with ``st.error``.

    Returns:
        None. All output goes to the Streamlit page.
    """
    preview_limit = 1000

    st.title("π File Reader & Hugging Face Q&A Application")
    st.write("Upload a PDF or Word file and ask questions based on its content.")

    uploaded_file = st.file_uploader("Choose a PDF or Word file", type=["pdf", "docx"])
    if uploaded_file is None:
        return

    # Hand the upload itself to extract_text(): it reads ``.name`` to choose a
    # parser, which the previously passed temp-file path (a plain str) does
    # not have. This also removes the delete=False temp file that was written
    # on every run and never cleaned up.
    try:
        # Rewind in case the buffer was already consumed on an earlier run.
        uploaded_file.seek(0)
        file_text = extract_text(uploaded_file)
    except Exception as exc:
        # UI boundary: a corrupt or unreadable upload should produce a
        # message on the page, not an unhandled traceback.
        st.error(f"Error reading file: {exc}")
        return

    if not file_text:
        return

    # Only claim truncation when the text was actually cut.
    preview = file_text[:preview_limit]
    if len(file_text) > preview_limit:
        preview += "... (truncated for display)"
    st.text_area("File Content", preview)

    question = st.text_input("Ask a question based on the file content:")

    if not st.button("Get Answer"):
        return

    if not question.strip():
        st.warning("Please enter a question.")
        return

    try:
        result = qa_pipeline(question=question, context=file_text)
        st.success(f"Answer: {result['answer']}")
    except Exception as e:
        st.error(f"Error generating answer: {str(e)}")
|
|
|
# Build the page only when this file is executed as the script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|