File size: 3,397 Bytes
3ec3a18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08640e1
7f63e62
 
 
3ec3a18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f63e62
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import streamlit as st
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import pytesseract
from PIL import Image
import requests  # For downloading PDFs (optional)
import numpy as np

# Download PDF function (optional)
def download_pdf(url):
    """Download the PDF at ``url`` and save it as ``temp.pdf``.

    Failures are shown to the user with ``st.error`` instead of being raised.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        True when the server answered with status 200 and the body was
        written to ``temp.pdf``; False otherwise.
    """
    try:
        # The timeout stops an unresponsive server from hanging the app, and
        # the context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                st.error("Error downloading PDF")
                return False
            with open("temp.pdf", "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return True
    except Exception as e:
        # UI boundary: report any network or file-system failure to the user.
        st.error(f"Error: {e}")
        return False

# OCR function
def _ocr_image(source):
    """Run Tesseract OCR on ``source`` and return the text, or None on failure.

    ``source`` is passed straight to ``Image.open`` (a path or a binary
    file-like object). Problems opening the image or launching Tesseract are
    reported with ``st.error``.
    """
    import os

    # Only override pytesseract's command when the Windows install path
    # actually exists; otherwise the override would point at a missing binary
    # on every other machine.
    windows_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Update path if needed
    if os.path.exists(windows_cmd):
        pytesseract.pytesseract.tesseract_cmd = windows_cmd

    try:
        return pytesseract.image_to_string(Image.open(source))
    except OSError as exc:
        # Covers Pillow failing to identify the data as an image and
        # pytesseract failing to find the Tesseract executable.
        st.error(f"Could not run OCR on the document: {exc}")
        return None


def extract_text(file):
    """Extract text from an uploaded PDF or image using OCR.

    Args:
        file: The uploaded file object (must expose a ``type`` MIME string),
            or a falsy value when nothing has been uploaded.

    Returns:
        The recognised text, or None when there is no file, the type is
        unsupported, the optional PDF download fails, or OCR fails.
    """
    if not file:
        return None

    if file.type == "application/pdf":
        # Download PDF if URL provided (optional)
        if st.checkbox("Document is a URL?"):
            pdf_url = st.text_input("Enter the PDF URL:")
            if not (pdf_url and download_pdf(pdf_url)):
                return None
            # Close the downloaded file as soon as OCR is done with it.
            with open("temp.pdf", "rb") as downloaded:
                return _ocr_image(downloaded)

        # NOTE(review): Pillow has no PDF reader, so a real PDF is expected to
        # end in the error path of _ocr_image; rasterising pages first would
        # need an additional PDF library.
        return _ocr_image(file)

    if file.type in ("image/jpeg", "image/png"):
        return _ocr_image(file)

    st.error("Please upload a PDF or image file")
    return None

# Hugging Face model id used for Urdu text generation; swap in another
# checkpoint here if preferred.
urdu_gen_model_name = "mwz/UrduGPT2"

# Build the text-generation pipeline once at import time, asking transformers
# for the TensorFlow implementation of the model.
urdu_gen_pipe = pipeline(
    task="text-generation",
    model=urdu_gen_model_name,
    framework="tf",
)


def _load_qa_pipeline(model_name):
    """Build the question-answering pipeline for ``model_name``."""
    return pipeline("question-answering", model=model_name)


def answer_questions(document, question):
    """Answer ``question`` using ``document`` as the context.

    Args:
        document: Text to search for the answer.
        question: The question to answer.

    Returns:
        The ``"answer"`` string produced by the question-answering pipeline,
        or None when either input is empty or contains only whitespace.
    """
    if not document or not question:
        return None
    # OCR of a blank page can yield whitespace-only text; treat it as empty
    # instead of sending it to the model.
    if not document.strip() or not question.strip():
        return None

    # Use a pre-trained RAG model for Urdu (needs to be implemented).
    # This section is a placeholder until an Urdu RAG model is available.
    # NOTE(review): this checkpoint appears to be published for summarisation
    # rather than extractive QA - confirm the answers are usable.
    rag_model_name = "facebook/bart-large-cnn"  # Replace with Urdu RAG (if available)

    # Loading the model is expensive and the app script is re-run on every
    # interaction, so reuse one pipeline through Streamlit's resource cache.
    # Fall back to an uncached load on Streamlit releases that lack it.
    cache_resource = getattr(st, "cache_resource", None)
    loader = cache_resource(_load_qa_pipeline) if cache_resource else _load_qa_pipeline
    rag_pipe = loader(rag_model_name)

    answer = rag_pipe({"question": question, "context": document})["answer"]
    return answer

def main():
    """Render the app page.

    Shows the title and description, lets the user upload a document, displays
    the text extracted from it, then takes a question and shows either the
    answer or a warning when no answer was produced.
    """
    st.title("Urdu Question Answering App")
    st.write("This app uses OCR to extract text from your document and a generative model to answer your questions in Urdu.")

    upload = st.file_uploader("Upload Reference Document (PDF or Image)")
    extracted = extract_text(upload)
    if not extracted:
        # Nothing to show until text has been extracted.
        return

    st.success("Document Text Extracted!")
    st.write(extracted)

    user_question = st.text_input("Ask your question in Urdu:")
    if not user_question:
        return

    reply = answer_questions(extracted, user_question)
    if not reply:
        st.warning("Couldn't find an answer. Try rephrasing your question.")
        return

    st.success("Answer:")
    st.write(reply)


if __name__ == "__main__":
    main()