"""Streamlit app: OCR an uploaded document and answer questions about it.

The uploaded file (or, optionally, a PDF fetched from a URL) is passed through
Tesseract OCR, and the extracted text is used as the context for a
question-answering pipeline.
"""

import functools
import os

import numpy as np
import pytesseract
import requests  # For downloading PDFs (optional)
import streamlit as st
from PIL import Image
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Default Windows install location of the Tesseract binary. It is applied only
# when the file actually exists, so other platforms fall back to pytesseract's
# default lookup of ``tesseract`` on PATH.
TESSERACT_WINDOWS_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Where a PDF downloaded from a URL is stored before OCR.
TEMP_PDF_PATH = "temp.pdf"

# Upper bound (seconds) on connecting to / waiting for the PDF server, so a
# dead host cannot hang the app indefinitely.
DOWNLOAD_TIMEOUT_SECONDS = 30


# Download PDF function (optional)
def download_pdf(url):
    """Download ``url`` into ``temp.pdf`` in the current working directory.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        True if the server answered with HTTP 200 and the body was written to
        disk; False otherwise. Failures are reported to the user through
        ``st.error`` instead of being raised.
    """
    try:
        # The context manager releases the streamed connection even when the
        # status check or the write fails.
        with requests.get(
            url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS
        ) as response:
            if response.status_code != 200:
                st.error("Error downloading PDF")
                return False
            with open(TEMP_PDF_PATH, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.RequestException, OSError) as e:
        # RequestException: bad URL, connection problems, timeouts.
        # OSError: the temporary file could not be written.
        st.error(f"Error: {e}")
        return False
    return True


def _configure_tesseract():
    """Point pytesseract at the Windows install path when that file exists."""
    if os.path.isfile(TESSERACT_WINDOWS_PATH):
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_WINDOWS_PATH


def _run_ocr(source, lang=None):
    """Run Tesseract OCR on ``source`` and return the recognized text.

    Args:
        source: A path or binary file-like object accepted by ``Image.open``.
        lang: Tesseract language code(s); None uses Tesseract's default.

    Returns:
        The recognized text, or None if the document could not be opened or
        OCR failed (the reason is shown with ``st.error``).
    """
    _configure_tesseract()
    try:
        # ``with`` closes any file Pillow opened itself (e.g. TEMP_PDF_PATH).
        with Image.open(source) as image:
            return pytesseract.image_to_string(image, lang=lang)
    except pytesseract.TesseractNotFoundError:
        # Must precede the OSError handler: this exception derives from it.
        st.error(
            "Tesseract OCR is not installed or could not be found. "
            "Install it or add it to your PATH."
        )
    except pytesseract.TesseractError as e:
        st.error(f"OCR failed: {e}")
    except OSError as e:
        # Pillow raises an OSError subclass for data it cannot decode.
        st.error(f"Could not open the document as an image: {e}")
    return None


# OCR function
def extract_text(file, lang=None):
    """Extract text from an uploaded PDF or image using Tesseract OCR.

    Args:
        file: The uploaded file object (must expose a ``type`` MIME string),
            or a falsy value when nothing has been uploaded yet.
        lang: Optional Tesseract language code(s) passed to pytesseract, e.g.
            ``"urd"`` for Urdu (requires that language's trained data to be
            installed). None keeps Tesseract's default language.

    Returns:
        The extracted text, or None when there is no file, the type is
        unsupported, the optional download failed, or OCR failed.
    """
    if not file:
        return None

    if file.type == "application/pdf":
        source = file
        # Download PDF if URL provided (optional)
        if st.checkbox("Document is a URL?"):
            pdf_url = st.text_input("Enter the PDF URL:")
            if not (pdf_url and download_pdf(pdf_url)):
                return None
            # Hand Pillow the path instead of an open handle so the file is
            # closed again once OCR has finished.
            source = TEMP_PDF_PATH
        # NOTE(review): Pillow documents PDF as a write-only format, so
        # Image.open() is expected to reject PDF data; _run_ocr reports that
        # as an error instead of crashing. Real PDF support needs the pages
        # rasterized first, which requires a library this file does not
        # currently import.
        return _run_ocr(source, lang=lang)

    if file.type in ("image/jpeg", "image/png"):
        return _run_ocr(file, lang=lang)

    st.error("Please upload a PDF or image file")
    return None


# Urdu content generation model (replace with your preferred model)
urdu_gen_model_name = "mwz/UrduGPT2"  # Replace placeholder

# Load the model using TensorFlow weights.
# NOTE(review): this pipeline is not referenced by the code visible here, yet
# it is loaded every time the module is executed - confirm it is still needed.
urdu_gen_pipe = pipeline("text-generation", model=urdu_gen_model_name, framework="tf")


@functools.lru_cache(maxsize=1)
def _load_qa_pipeline(model_name):
    """Build the question-answering pipeline once and reuse it across calls."""
    return pipeline("question-answering", model=model_name)


def answer_questions(document, question):
    """Answer ``question`` using ``document`` as the context.

    Args:
        document: Text to search for the answer.
        question: The user's question.

    Returns:
        The pipeline's ``"answer"`` string, or None when either argument is
        empty.
    """
    if not document or not question:
        return None

    # Placeholder until an Urdu retrieval-augmented (RAG) model is available:
    # a non-Urdu model is used for demonstration purposes only.
    # NOTE(review): confirm this checkpoint is suitable for question
    # answering before relying on its output.
    rag_model_name = "facebook/bart-large-cnn"  # Replace with Urdu RAG (if available)
    rag_pipe = _load_qa_pipeline(rag_model_name)
    answer = rag_pipe({"question": question, "context": document})["answer"]
    return answer


def main():
    """Render the Streamlit UI: upload a document, run OCR, answer a question."""
    st.title("Urdu Question Answering App")
    st.write("This app uses OCR to extract text from your document and a generative model to answer your questions in Urdu.")

    uploaded_file = st.file_uploader("Upload Reference Document (PDF or Image)")
    document = extract_text(uploaded_file)
    if not document:
        return

    st.success("Document Text Extracted!")
    st.write(document)

    question = st.text_input("Ask your question in Urdu:")
    if not question:
        return

    answer = answer_questions(document, question)
    if answer:
        st.success("Answer:")
        st.write(answer)
    else:
        st.warning("Couldn't find an answer. Try rephrasing your question.")


if __name__ == "__main__":
    main()