import streamlit as st
from transformers import pipeline
import pytesseract
from PIL import Image
from pdf2image import convert_from_bytes  # Renders PDF pages to images for OCR (requires poppler)
import requests  # For downloading PDFs (optional)
# Download PDF function (optional)
def download_pdf(url):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open("temp.pdf", "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return True
        else:
            st.error("Error downloading PDF")
            return False
    except Exception as e:
        st.error(f"Error: {e}")
        return False
# OCR function
def extract_text(file):
    if not file:
        return None
    # On Windows, point pytesseract at the Tesseract binary; on Linux the
    # system-wide install is usually found automatically.
    # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if file.type == "application/pdf":
        # Optionally fetch the PDF from a URL instead of using the uploaded file
        if st.checkbox("Document is a URL?"):
            pdf_url = st.text_input("Enter the PDF URL:")
            if pdf_url and download_pdf(pdf_url):
                file = open("temp.pdf", "rb")
            else:
                return None
        # PIL cannot open PDFs directly, so render each page to an image first,
        # then OCR the pages. lang="urd" assumes the Urdu traineddata is installed.
        pages = convert_from_bytes(file.read())
        text = "\n".join(pytesseract.image_to_string(page, lang="urd") for page in pages)
        return text
    elif file.type in ("image/jpeg", "image/png"):
        # OCR the image directly
        text = pytesseract.image_to_string(Image.open(file), lang="urd")
        return text
    else:
        st.error("Please upload a PDF or image file")
        return None
# Urdu content generation model (replace with your preferred model).
# It is loaded lazily and cached; note it is not yet wired into the QA flow below.
urdu_gen_model_name = "mwz/UrduGPT2"  # Replace placeholder

@st.cache_resource  # Cache so the model loads once, not on every Streamlit rerun
def load_urdu_generator():
    return pipeline("text-generation", model=urdu_gen_model_name)
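# --- Optional: phrasing the extracted answer with the Urdu generator (sketch) ---
# The generator above is not used anywhere yet. One possible (untested) way to
# involve it is to expand a short extractive answer into a fuller Urdu sentence
# by prompting it with the question and the extracted span; treat this as an
# illustration, not a tuned prompt.
def rephrase_in_urdu(question, answer, max_new_tokens=60):
    generator = load_urdu_generator()
    prompt = f"{question}\n{answer}\n"
    outputs = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    return outputs[0]["generated_text"]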
@st.cache_resource  # Cache the QA pipeline so it is not rebuilt on every question
def load_qa_pipeline():
    # Placeholder until a dedicated Urdu RAG model is available; a custom RAG
    # setup built around Urdu-capable models could replace this.
    # Note: "facebook/bart-large-cnn" (used previously) is a summarization model
    # and is not compatible with the question-answering pipeline.
    rag_model_name = "deepset/xlm-roberta-large-squad2"  # Multilingual extractive QA
    return pipeline("question-answering", model=rag_model_name)

def answer_questions(document, question):
    if not document or not question:
        return None
    rag_pipe = load_qa_pipeline()
    answer = rag_pipe({"question": question, "context": document})["answer"]
    return answer
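# --- Optional: chunked answering for long documents (sketch, not wired into the UI) ---
# Extractive QA models truncate long contexts, so a lengthy OCR'd document may lose
# the passage that holds the answer. One simple workaround is to split the text into
# overlapping character chunks, query each chunk, and keep the highest-scoring answer.
# The chunk sizes below are illustrative, not tuned.
def answer_long_document(document, question, chunk_chars=1500, overlap=200):
    if not document or not question:
        return None
    qa = load_qa_pipeline()
    best = None
    start = 0
    while start < len(document):
        chunk = document[start:start + chunk_chars]
        result = qa({"question": question, "context": chunk})
        if best is None or result["score"] > best["score"]:
            best = result
        start += chunk_chars - overlap
    return best["answer"] if best else None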
def main():
    st.title("Urdu Question Answering App")
    st.write("This app uses OCR to extract text from your document and a generative model to answer your questions in Urdu.")
    uploaded_file = st.file_uploader("Upload Reference Document (PDF or Image)")
    document = extract_text(uploaded_file)
    if document:
        st.success("Document Text Extracted!")
        st.write(document)
        question = st.text_input("Ask your question in Urdu:")
        if question:
            answer = answer_questions(document, question)
            if answer:
                st.success("Answer:")
                st.write(answer)
            else:
                st.warning("Couldn't find an answer. Try rephrasing your question.")

if __name__ == "__main__":
    main()
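The app assumes the host provides the system binaries behind pytesseract and pdf2image (the Tesseract engine with Urdu traineddata, and poppler). A minimal sketch of the setup for a Hugging Face Space using the Streamlit SDK, where apt packages go in packages.txt and Python packages in requirements.txt:

packages.txt
    tesseract-ocr
    tesseract-ocr-urd
    poppler-utils

requirements.txt
    streamlit
    transformers
    torch
    pytesseract
    pillow
    pdf2image
    requests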