advanced read and textify added
- app.py +2 -1
- helper/utils.py +50 -0
app.py CHANGED

@@ -110,7 +110,8 @@ if uploaded_files is None:
 elif uploaded_files:
     with st.spinner("Wait for it... 🤔"):
         # Process the uploaded files to extract text and source information
-        textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
+        # textify_output = read_and_textify(uploaded_files, chunk_size=chunk_size_input)
+        textify_output = read_and_textify_advanced(uploaded_files, chunk_size=chunk_size_input)

         # Separate the output into documents (text) and their corresponding sources
         documents, sources = textify_output
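For reference, chunk_size_input is expected to come from a Streamlit control defined earlier in app.py, which this commit does not touch. A minimal sketch of how such a control could be wired up; the label, bounds, and variable name here are assumptions, not the app's actual widget:

import streamlit as st

# Hypothetical widget supplying chunk_size_input; the real control in app.py may differ.
chunk_size_input = st.number_input(
    "Sentences of context on each side of a chunk",
    min_value=1,
    max_value=10,
    value=2,  # matches the default in read_and_textify_advanced
    step=1,
)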
helper/utils.py CHANGED

@@ -62,6 +62,56 @@ def read_and_textify(
     return text_list, sources_list


+def read_and_textify_advanced(
+    files: List[str], chunk_size: int = 2  # Sentences of context on each side of the focus sentence
+) -> Tuple[List[str], List[str]]:
+    """
+    Reads PDF files and extracts text from each page, breaking the text into overlapping segments.
+
+    This function iterates over a list of uploaded PDF files, extracts text from each page,
+    and compiles a list of text segments and corresponding source information. Each segment is a
+    window of sentences extending 'chunk_size' sentences to either side of a focus sentence.
+
+    Args:
+      files (List[st.uploaded_file_manager.UploadedFile]): A list of uploaded PDF files.
+      chunk_size (int): The number of sentences of context on each side of the focus sentence. Default is 2.
+
+    Returns:
+      Tuple[List[str], List[str]]: A tuple containing two lists:
+        1. A list of strings, where each string is a segment of text extracted from a PDF page.
+        2. A list of strings indicating the source of each text segment (file name, page number, and segment number).
+    """
+
+    text_list = []  # List to store extracted text segments
+    sources_list = []  # List to store source information
+
+    # Iterate over each file
+    for file in files:
+        pdfReader = PyPDF2.PdfReader(file)  # Create a PDF reader object
+        # Iterate over each page in the PDF
+        for i in range(len(pdfReader.pages)):
+            pageObj = pdfReader.pages[i]  # Get the page object
+            text = pageObj.extract_text()  # Extract text from the page
+            if text:
+                # Split the page into sentences and build an overlapping window around each one
+                words = text.split(". ")
+                for j in range(len(words)):
+                    # Take the sentences from j - chunk_size to j + chunk_size
+                    start = max(0, j - chunk_size)
+                    end = min(len(words), j + chunk_size + 1)
+                    chunk = ". ".join(words[start:end]) + '.'
+                    text_list.append(chunk)
+                    # Create a source identifier for each chunk and add it to the list
+                    sources_list.append(f"{file.name}_page_{i}_chunk_{j}")
+            else:
+                # If no text extracted, still add a placeholder
+                text_list.append("")
+                sources_list.append(f"{file.name}_page_{i}_chunk_0")
+            pageObj.clear()  # Clear the page object (optional, for memory management)
+
+    return text_list, sources_list
+
+
 openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
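The new helper emits one chunk per sentence: each chunk is a window of up to chunk_size sentences on either side of that sentence, so neighbouring chunks overlap heavily. A standalone sketch of just the windowing step, using a toy string rather than anything from the repository:

chunk_size = 2
text = "One. Two. Three. Four. Five. Six"
sentences = text.split(". ")

chunks = []
for j in range(len(sentences)):
    start = max(0, j - chunk_size)
    end = min(len(sentences), j + chunk_size + 1)
    chunks.append(". ".join(sentences[start:end]) + ".")

# chunks[0] -> "One. Two. Three."
# chunks[3] -> "Two. Three. Four. Five. Six."
# len(chunks) == 6: one overlapping chunk per sentence

Because every sentence anchors its own window, a page with N sentences produces N chunks; that trades a larger index for more context around each retrieved sentence.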