engrharis committed on
Commit
3ec3a18
·
verified ·
1 Parent(s): 8316eeb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
3
+ import pytesseract
4
+ from PIL import Image
5
+ import requests # For downloading PDFs (optional)
6
+ import numpy as np
7
+
8
+ # Download PDF function (optional)
9
def download_pdf(url, timeout=30):
    """Download the PDF at *url* into ``temp.pdf`` in the working directory.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely and freeze the UI.

    Returns:
        True when the file was written, False otherwise. Failures are
        reported to the user through ``st.error`` rather than raised.
    """
    try:
        # The context manager releases the streamed connection even when we
        # bail out early on a non-200 status or a write error.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                st.error("Error downloading PDF")
                return False

            with open("temp.pdf", "wb") as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
            return True
    except (requests.RequestException, OSError) as e:
        # RequestException: bad URL, connection failure, timeout, broken stream.
        # OSError: temp.pdf could not be created or written.
        st.error(f"Error: {e}")
        return False
23
+
24
+ # OCR function
25
# Default install location of Tesseract on Windows; only used when it exists.
_WINDOWS_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


def _configure_tesseract():
    """Point pytesseract at the Windows install if present, else keep its default."""
    import os

    # Overriding the command unconditionally breaks hosts where Tesseract is
    # simply on PATH, so only override when the Windows binary is really there.
    if os.path.isfile(_WINDOWS_TESSERACT_CMD):
        pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD


def _ocr_image(source):
    """Run OCR on an image file object and return its text, or None on failure."""
    _configure_tesseract()
    try:
        with Image.open(source) as image:
            # NOTE(review): no `lang` is passed, so Tesseract uses its default
            # language; Urdu documents probably need lang="urd" with that
            # traineddata installed -- confirm before changing.
            return pytesseract.image_to_string(image)
    except OSError as exc:
        # Presumably covers Pillow's unidentified-image error and pytesseract's
        # missing-binary error (both appear to derive from OSError) -- confirm.
        st.error(f"Could not read text from this file: {exc}")
        return None


def extract_text(file):
    """Extract text from an uploaded PDF or image via Tesseract OCR.

    Args:
        file: The uploaded file object (needs a ``type`` MIME attribute), or
            a falsy value when nothing has been uploaded yet.

    Returns:
        The recognised text, or None when there is no file, the type is
        unsupported, the optional URL download did not succeed, or OCR failed.
        User-facing problems are reported through ``st.error``.
    """
    if not file:
        return None

    if file.type == "application/pdf":
        # Optionally replace the upload with a PDF fetched from a URL.
        if st.checkbox("Document is a URL?"):
            pdf_url = st.text_input("Enter the PDF URL:")
            if not (pdf_url and download_pdf(pdf_url)):
                return None
            # Close the downloaded file as soon as OCR is done with it.
            with open("temp.pdf", "rb") as downloaded:
                return _ocr_image(downloaded)

        # NOTE(review): Pillow does not appear to be able to read PDFs, so this
        # is expected to end in the error path until pages are rasterised
        # first -- confirm and add a PDF-to-image step.
        return _ocr_image(file)

    if file.type in ("image/jpeg", "image/png"):
        return _ocr_image(file)

    st.error("Please upload a PDF or image file")
    return None
52
+
53
# Urdu content generation model (replace with your preferred model).
urdu_gen_model_name = "fawadmalik/urdu-gpt2-small"


@st.cache_resource
def _load_urdu_generator(model_name):
    """Build the Urdu text-generation pipeline, cached across script reruns.

    Streamlit re-executes the script on each widget interaction; caching the
    pipeline as a resource avoids reloading the model weights every time.
    """
    return pipeline("text-generation", model=model_name)


# NOTE(review): nothing in the visible code uses urdu_gen_pipe -- confirm
# whether it is still needed before paying the model load cost at startup.
urdu_gen_pipe = _load_urdu_generator(urdu_gen_model_name)
56
+
57
+
58
@st.cache_resource
def _load_qa_pipeline(model_name):
    """Build the question-answering pipeline once and reuse it across reruns."""
    return pipeline("question-answering", model=model_name)


def answer_questions(document, question):
    """Answer *question* using *document* as the context.

    Args:
        document: Text extracted from the reference document.
        question: The user's question.

    Returns:
        The answer string produced by the model, or None when either input
        is missing or contains only whitespace.
    """
    if not document or not question:
        return None
    # Whitespace-only text carries nothing the model can work with.
    if not document.strip() or not question.strip():
        return None

    # Placeholder until an Urdu retrieval/QA model is available: this uses a
    # non-Urdu checkpoint for demonstration purposes only.
    # NOTE(review): this checkpoint is published for summarisation, so its
    # question-answering head is presumably untrained -- confirm and replace.
    rag_model_name = "facebook/bart-large-cnn"  # Replace with Urdu RAG (if available)

    # Loading is cached so the weights are not re-read on every question.
    rag_pipe = _load_qa_pipeline(rag_model_name)
    answer = rag_pipe(question=question, context=document)["answer"]
    return answer
71
+
72
def main():
    """Render the app: upload a document, show its OCR text, answer a question."""
    st.title("Urdu Question Answering App")
    st.write("This app uses OCR to extract text from your document and a generative model to answer your questions in Urdu.")

    upload = st.file_uploader("Upload Reference Document (PDF or Image)")

    extracted = extract_text(upload)
    if not extracted:
        # Nothing uploaded yet, or extraction failed: stop before the Q&A part.
        return

    st.success("Document Text Extracted!")
    st.write(extracted)

    query = st.text_input("Ask your question in Urdu:")
    if not query:
        return

    reply = answer_questions(extracted, query)
    if not reply:
        st.warning("Couldn't find an answer. Try rephrasing your question.")
        return

    st.success("Answer:")
    st.write(reply)
91
+
92
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()