Spaces:

engrharis
/

urduhelper

Sleeping

App Files Files Community

engrharis committed on Dec 26, 2024

Commit

3ec3a18

verified ·

1 Parent(s): 8316eeb

Create app.py

Browse files

Files changed (1) hide show

app.py +93 -0

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import streamlit as st
+from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
+import pytesseract
+from PIL import Image
+import requests  # For downloading PDFs (optional)
+import numpy as np
+# Download PDF function (optional)
+def download_pdf(url):
+    try:
+        response = requests.get(url, stream=True)
+        if response.status_code == 200:
+            with open("temp.pdf", "wb") as f:
+                for chunk in response.iter_content(1024):
+                    f.write(chunk)
+            return True
+        else:
+            st.error("Error downloading PDF")
+            return False
+    except Exception as e:
+        st.error(f"Error: {e}")
+        return False
+# OCR function
+def extract_text(file):
+    if not file:
+        return None
+    if file.type == "application/pdf":
+        # Download PDF if URL provided (optional)
+        if st.checkbox("Document is a URL?"):
+            pdf_url = st.text_input("Enter the PDF URL:")
+            if pdf_url and download_pdf(pdf_url):
+                file = open("temp.pdf", "rb")
+            else:
+                return None
+        # Use PyTesseract for PDF OCR
+        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Update path if needed
+        text = pytesseract.image_to_string(Image.open(file))
+        return text
+    elif file.type in ("image/jpeg", "image/png"):
+        # Use PyTesseract for image OCR
+        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Update path if needed
+        text = pytesseract.image_to_string(Image.open(file))
+        return text
+    else:
+        st.error("Please upload a PDF or image file")
+        return None
+# Urdu content generation model (replace with your preferred model)
+urdu_gen_model_name = "fawadmalik/urdu-gpt2-small"
+urdu_gen_pipe = pipeline("text-generation", model=urdu_gen_model_name)
+def answer_questions(document, question):
+    if not document or not question:
+        return None
+    # Use a pre-trained RAG model for Urdu (needs to be implemented)
+    # You can explore custom RAG models with Urdu models like Udify
+    # This section is a placeholder until an Urdu RAG model is available.
+    # Example using a non-Urdu RAG model (for demonstration purposes)
+    rag_model_name = "facebook/bart-large-cnn"  # Replace with Urdu RAG (if available)
+    rag_pipe = pipeline("question-answering", model=rag_model_name)
+    answer = rag_pipe({"question": question, "context": document})["answer"]
+    return answer
+def main():
+    st.title("Urdu Question Answering App")
+    st.write("This app uses OCR to extract text from your document and a generative model to answer your questions in Urdu.")
+    uploaded_file = st.file_uploader("Upload Reference Document (PDF or Image)")
+    document = extract_text(uploaded_file)
+    if document:
+        st.success("Document Text Extracted!")
+        st.write(document)
+        question = st.text_input("Ask your question in Urdu:")
+        if question:
+            answer = answer_questions(document, question)
+            if answer:
+                st.success("Answer:")
+                st.write(answer)
+            else:
+                st.warning("Couldn't find an answer. Try rephrasing your question.")
+if __name__ == "__main__":  # build the page only when run as a script, not when imported
+    main()