# urduhelper / app.py
# engrharis's picture
# Update app.py
# 7f63e62 verified
import os

import numpy as np
import pytesseract
import requests # For downloading PDFs (optional)
import streamlit as st
from PIL import Image
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
# Download PDF function (optional)
def download_pdf(url, dest="temp.pdf", timeout=30):
    """Download the file at *url* and write it to *dest*.

    Args:
        url: Address to fetch with an HTTP GET request.
        dest: Path of the file to write; defaults to ``"temp.pdf"``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the app indefinitely.

    Returns:
        True when the server answered with status 200 and the body was
        written to *dest*; False otherwise. Failures are reported to the
        user through ``st.error``.
    """
    try:
        # The context manager releases the streamed connection even when
        # the status check fails or writing the file raises.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                st.error("Error downloading PDF")
                return False
            with open(dest, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        return True
    except (requests.RequestException, OSError) as e:
        # Network problems (RequestException) and file-system problems
        # (OSError) are the expected failure modes of this function.
        st.error(f"Error: {e}")
        return False
# OCR function
# Default install location of Tesseract on Windows. Update path if needed.
_WINDOWS_TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


def _configure_tesseract():
    """Point pytesseract at the Windows install, but only if it exists.

    On hosts without that file the pytesseract default is left untouched
    instead of being overwritten with a path that cannot be executed.
    """
    if os.path.isfile(_WINDOWS_TESSERACT_PATH):
        pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_PATH


def _ocr(source):
    """Run OCR on *source* and return the text, or None on failure."""
    _configure_tesseract()
    try:
        with Image.open(source) as image:
            # NOTE(review): no ``lang`` is passed, so Tesseract uses its
            # default language; Urdu documents presumably need
            # ``lang="urd"`` plus the matching language data - confirm.
            return pytesseract.image_to_string(image)
    except (OSError, pytesseract.TesseractError) as exc:
        # Covers inputs Pillow cannot decode and Tesseract failures.
        st.error(f"Could not extract text from the document: {exc}")
        return None


def extract_text(file):
    """Extract text from an uploaded PDF or image via OCR.

    Args:
        file: Uploaded file object exposing a ``type`` MIME string, or a
            falsy value when nothing has been uploaded.

    Returns:
        The recognised text, or None when there is no file, the type is
        unsupported, the optional URL download did not succeed, or OCR
        failed (the failure is shown with ``st.error``).

    NOTE(review): the PDF branch hands the PDF straight to
    ``Image.open``. Pillow documents PDF as a write-only format, so this
    branch is expected to end in the error path until pages are
    rasterised first; that needs a dependency this file does not import.
    """
    if not file:
        return None

    if file.type == "application/pdf":
        # Download PDF if URL provided (optional)
        if st.checkbox("Document is a URL?"):
            pdf_url = st.text_input("Enter the PDF URL:")
            if not (pdf_url and download_pdf(pdf_url)):
                return None
            # Keep the handle open only for the duration of the OCR call.
            with open("temp.pdf", "rb") as downloaded:
                return _ocr(downloaded)
        return _ocr(file)

    if file.type in ("image/jpeg", "image/png"):
        return _ocr(file)

    st.error("Please upload a PDF or image file")
    return None
# Urdu content generation model (replace with your preferred model)
urdu_gen_model_name = "mwz/UrduGPT2" # Replace placeholder


@st.cache_resource
def _load_urdu_gen_pipe(model_name):
    """Build the text-generation pipeline for *model_name*.

    Streamlit re-executes the script on every user interaction; caching
    the pipeline as a resource avoids rebuilding it on each rerun.
    """
    # Load the model using TensorFlow weights
    return pipeline("text-generation", model=model_name, framework="tf")


urdu_gen_pipe = _load_urdu_gen_pipe(urdu_gen_model_name)
@st.cache_resource
def _load_qa_pipeline(model_name):
    """Build the question-answering pipeline for *model_name* once.

    Cached as a Streamlit resource so the model is not re-created for
    every question or script rerun.
    """
    return pipeline("question-answering", model=model_name)


def answer_questions(document, question, model_name="facebook/bart-large-cnn"):
    """Answer *question* using *document* as the context.

    Args:
        document: Text to search for the answer.
        question: The user's question.
        model_name: Checkpoint passed to the question-answering
            pipeline. The default is a placeholder until an Urdu
            question-answering model is chosen.
            NOTE(review): the default checkpoint is published as a
            summarization model - confirm it has a trained
            question-answering head or replace it.

    Returns:
        The pipeline's ``"answer"`` string, or None when either input is
        missing or contains only whitespace.
    """
    if not document or not question:
        return None
    # OCR output can be whitespace only; there is nothing to search then.
    if not document.strip() or not question.strip():
        return None

    qa_pipe = _load_qa_pipeline(model_name)
    result = qa_pipe(question=question, context=document)
    return result["answer"]
def main():
    """Render the page: upload a document, show its text, answer a question."""
    st.title("Urdu Question Answering App")
    st.write("This app uses OCR to extract text from your document and a generative model to answer your questions in Urdu.")

    upload = st.file_uploader("Upload Reference Document (PDF or Image)")
    extracted = extract_text(upload)
    if not extracted:
        # Nothing uploaded yet, or extraction produced no text.
        return

    st.success("Document Text Extracted!")
    st.write(extracted)

    user_question = st.text_input("Ask your question in Urdu:")
    if not user_question:
        return

    reply = answer_questions(extracted, user_question)
    if not reply:
        st.warning("Couldn't find an answer. Try rephrasing your question.")
        return

    st.success("Answer:")
    st.write(reply)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()