Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -52,12 +52,12 @@ def count_tokens(text, tokenizer):
|
|
| 52 |
|
| 53 |
# دالة لاستخراج النص من ملفات PDF
|
| 54 |
def extract_pdf_text(file_path):
|
|
|
|
| 55 |
with open(file_path, "rb") as pdf_file:
|
| 56 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 57 |
-
text = ""
|
| 58 |
for page_num in range(len(pdf_reader.pages)):
|
| 59 |
page = pdf_reader.pages[page_num]
|
| 60 |
-
text += page.extract_text()
|
| 61 |
return text
|
| 62 |
|
| 63 |
# دالة لاستخراج النص من ملفات DOCX
|
|
@@ -81,15 +81,15 @@ def read_text_file(file_path):
|
|
| 81 |
|
| 82 |
# دالة لاستخراج المشاهد من النص
|
| 83 |
def extract_scenes(text):
|
| 84 |
-
scenes = re.split(r'
|
| 85 |
scenes = [scene.strip() for scene in scenes if scene.strip()]
|
| 86 |
return scenes
|
| 87 |
|
| 88 |
# دالة لاستخراج تفاصيل المشهد (المكان والوقت)
|
| 89 |
def extract_scene_details(scene):
|
| 90 |
details = {}
|
| 91 |
-
location_match = re.search(r'(
|
| 92 |
-
time_match = re.search(r'(
|
| 93 |
|
| 94 |
if location_match:
|
| 95 |
details['location'] = location_match.group()
|
|
@@ -176,29 +176,30 @@ def analyze_files(input_files, output_directory, tokenizer, max_length):
|
|
| 176 |
results.append(result)
|
| 177 |
|
| 178 |
# حفظ النتائج
|
| 179 |
-
|
| 180 |
-
|
|
|
|
| 181 |
|
| 182 |
-
with open(os.path.join(output_directory, f"{
|
| 183 |
-
file.write("\n".join(quotes))
|
| 184 |
|
| 185 |
-
with open(os.path.join(output_directory, f"{
|
| 186 |
-
file.write(str(token_count))
|
| 187 |
|
| 188 |
-
with open(os.path.join(output_directory, f"{
|
| 189 |
-
file.write("\n".join(scenes))
|
| 190 |
|
| 191 |
-
with open(os.path.join(output_directory, f"{
|
| 192 |
-
file.write(str(scene_details))
|
| 193 |
|
| 194 |
-
with open(os.path.join(output_directory, f"{
|
| 195 |
-
file.write(str(ages))
|
| 196 |
|
| 197 |
-
with open(os.path.join(output_directory, f"{
|
| 198 |
-
file.write(str(character_descriptions))
|
| 199 |
|
| 200 |
-
with open(os.path.join(output_directory, f"{
|
| 201 |
-
file.write(str(dialogues))
|
| 202 |
|
| 203 |
return results
|
| 204 |
|
|
|
|
| 52 |
|
| 53 |
# دالة لاستخراج النص من ملفات PDF
|
| 54 |
def extract_pdf_text(file_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order with no separator.
        A page whose ``extract_text()`` result is falsy (``None`` or an
        empty string) contributes nothing to the result.
    """
    # The reader is constructed from this handle, so all extraction is done
    # before the ``with`` block closes the file.
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Join once instead of growing a string with ``+=`` per page.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
| 62 |
|
| 63 |
# دالة لاستخراج النص من ملفات DOCX
|
|
|
|
| 81 |
|
| 82 |
# دالة لاستخراج المشاهد من النص
|
| 83 |
def extract_scenes(text):
    """Split a script into scenes at interior/exterior markers.

    The text is split wherever the keyword ``داخلي`` or ``خارجي`` occurs.
    A keyword written between literal ellipses (``... داخلي ...``) is
    removed together with its ellipses.

    Args:
        text: The full script text.

    Returns:
        A list of the non-empty pieces between markers, each stripped of
        leading and trailing whitespace. Empty input yields an empty list.
    """
    # The dots are escaped on purpose: an unescaped ``...`` is three regex
    # wildcards and would swallow three characters of real scene text on
    # each side of the keyword. The decorated forms come first so a literal
    # ellipsis is consumed with its keyword rather than left in the scenes.
    marker = r'\.\.\. داخلي \.\.\.|\.\.\. خارجي \.\.\.|داخلي|خارجي'
    stripped_parts = (part.strip() for part in re.split(marker, text))
    return [part for part in stripped_parts if part]
|
| 87 |
|
| 88 |
# دالة لاستخراج تفاصيل المشهد (المكان والوقت)
|
| 89 |
def extract_scene_details(scene):
|
| 90 |
details = {}
|
| 91 |
+
location_match = re.search(r'(داخلي|خارجي|... داخلي ...|... خارجي ...)', scene)
|
| 92 |
+
time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب|... ليل ...|... نهار ...)', scene)
|
| 93 |
|
| 94 |
if location_match:
|
| 95 |
details['location'] = location_match.group()
|
|
|
|
| 176 |
results.append(result)
|
| 177 |
|
| 178 |
# حفظ النتائج
|
| 179 |
+
base_filename = os.path.basename(file_path)
|
| 180 |
+
with open(os.path.join(output_directory, f"{base_filename}_sentences.txt"), "a", encoding="utf-8") as file:
|
| 181 |
+
file.write("\n".join(sentences) + "\n")
|
| 182 |
|
| 183 |
+
with open(os.path.join(output_directory, f"{base_filename}_quotes.txt"), "a", encoding="utf-8") as file:
|
| 184 |
+
file.write("\n".join(quotes) + "\n")
|
| 185 |
|
| 186 |
+
with open(os.path.join(output_directory, f"{base_filename}_token_count.txt"), "a", encoding="utf-8") as file:
|
| 187 |
+
file.write(str(token_count) + "\n")
|
| 188 |
|
| 189 |
+
with open(os.path.join(output_directory, f"{base_filename}_scenes.txt"), "a", encoding="utf-8") as file:
|
| 190 |
+
file.write("\n".join(scenes) + "\n")
|
| 191 |
|
| 192 |
+
with open(os.path.join(output_directory, f"{base_filename}_scene_details.txt"), "a", encoding="utf-8") as file:
|
| 193 |
+
file.write(str(scene_details) + "\n")
|
| 194 |
|
| 195 |
+
with open(os.path.join(output_directory, f"{base_filename}_ages.txt"), "a", encoding="utf-8") as file:
|
| 196 |
+
file.write(str(ages) + "\n")
|
| 197 |
|
| 198 |
+
with open(os.path.join(output_directory, f"{base_filename}_character_descriptions.txt"), "a", encoding="utf-8") as file:
|
| 199 |
+
file.write(str(character_descriptions) + "\n")
|
| 200 |
|
| 201 |
+
with open(os.path.join(output_directory, f"{base_filename}_dialogues.txt"), "a", encoding="utf-8") as file:
|
| 202 |
+
file.write(str(dialogues) + "\n")
|
| 203 |
|
| 204 |
return results
|
| 205 |
|