reab5555 committed on
Commit
1db0375
·
verified ·
1 Parent(s): 9a04f34

Create processing.py

Browse files
Files changed (1) hide show
  1. processing.py +109 -0
processing.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import shutil
4
+ from langdetect import detect
5
+ from transformers import AutoTokenizer
6
+ from langchain.chains import RetrievalQA
7
+ from diarization import process_video as diarize_video
8
+
9
+ # Include the necessary imports and lazy loading classes here
10
+
11
def detect_language(text):
    """Return the language code detected for ``text``, defaulting to English.

    Args:
        text: The text whose language should be detected.

    Returns:
        The value returned by ``detect(text)``, or ``"en"`` when the input is
        empty, whitespace-only, not a string, or detection raises an error.
    """
    # Nothing to analyse: return the default without invoking the detector.
    if not isinstance(text, str) or not text.strip():
        return "en"
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate normally.
        return "en"  # default to English if detection fails
16
+
17
# Tokenizer instances keyed by model name, so each tokenizer is loaded once
# per process instead of on every call.
_TOKENIZER_CACHE = {}


def count_words_and_tokens(text, model_name="mistralai/Mistral-7B-Instruct-v0.3"):
    """Count whitespace-separated words and tokenizer tokens in ``text``.

    Args:
        text: The string to measure.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
            Defaults to the model the original implementation hard-coded.

    Returns:
        A ``(words, tokens)`` tuple of integers.
    """
    words = len(text.split())

    # Loading a tokenizer is expensive; reuse the instance across calls.
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer

    tokens = len(tokenizer.tokenize(text))
    return words, tokens
21
+
22
def process_text(input_file):
    """Read a UTF-8 text file and summarise its size.

    Args:
        input_file: An object whose ``name`` attribute is the path of the
            text file to read.

    Returns:
        A ``(content, input_info)`` tuple: the full file text and a
        human-readable string with its word and token counts.
    """
    source_path = input_file.name
    with open(source_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()

    word_count, token_count = count_words_and_tokens(text)
    summary = f"Text file processed. Words: {word_count}, Tokens: {token_count}"
    return text, summary
28
+
29
def process_pdf(input_file):
    """Extract the text of a PDF and summarise its size.

    Args:
        input_file: An object whose ``name`` attribute is the path of the
            PDF file to load.

    Returns:
        A ``(content, input_info)`` tuple: the text of all loaded pieces
        joined with newlines, and a human-readable string with the word and
        token counts.
    """
    # NOTE(review): PyPDFLoader is not imported in the visible part of this
    # file -- confirm it is brought into scope elsewhere.
    documents = PyPDFLoader(input_file.name).load_and_split()
    text = '\n'.join(document.page_content for document in documents)

    word_count, token_count = count_words_and_tokens(text)
    summary = f"PDF file processed. Words: {word_count}, Tokens: {token_count}"
    return text, summary
36
+
37
def process_video(input_file, progress=None):
    """Transcribe a video through the diarization pipeline and summarise it.

    The upload is copied to ``temp_video<ext>`` in the current working
    directory, handed to ``diarize_video``, and the resulting
    ``temp_video_combined.srt`` file is read back.

    Args:
        input_file: An object whose ``name`` attribute is the path of the
            video file.
        progress: Optional callback invoked as ``progress(fraction, desc=...)``
            to report status; skipped when falsy.

    Returns:
        A ``(content, input_info)`` tuple: the SRT text and a human-readable
        string with its word and token counts.
    """
    file_extension = os.path.splitext(input_file.name)[1].lower()
    temp_video_path = "temp_video" + file_extension
    shutil.copy2(input_file.name, temp_video_path)

    if progress:
        progress(0.2, desc="Transcribing video...")

    language = "en"  # Default to English for video files
    diarize_video(temp_video_path, os.environ.get('hf_secret'), language)

    # Derive the SRT path from the base name instead of str.replace():
    # replacing an empty extension would splice the suffix between every
    # character of the path.
    # NOTE(review): assumes diarize_video writes "<base>_combined.srt" next
    # to the input video -- confirm against the diarization module.
    srt_path = os.path.splitext(temp_video_path)[0] + "_combined.srt"
    with open(srt_path, 'r', encoding='utf-8') as file:
        content = file.read()

    words, tokens = count_words_and_tokens(content)
    input_info = f"Input Words: {words} / Input Tokens: {tokens}"

    return content, input_info
55
+
56
# Marker that precedes the final answer in a chain's raw 'result' text.
_ANSWER_DELIMITER = "-----------\n\nAnswer:"


def _run_chain(chain, content):
    """Run one chain on ``content`` and return the text after the last answer marker."""
    result = chain({"query": content})
    return result['result'].split(_ANSWER_DELIMITER)[-1].strip()


def analyze_content(content, progress=None):
    """Run the attachments, Big Five and personalities chains on ``content``.

    Args:
        content: The text to analyse; passed to each chain as ``"query"``.
        progress: Optional callback invoked as ``progress(fraction, desc=...)``
            before each chain runs; skipped when falsy.

    Returns:
        A tuple ``(attachments_answer, bigfive_answer, personalities_answer)``
        of stripped answer strings.
    """
    # NOTE(review): lazy_chains is not defined in the visible part of this
    # file -- confirm it is provided elsewhere.
    attachments_chain, bigfive_chain, personalities_chain = lazy_chains.get_chains()

    # (progress fraction, progress description, chain), in execution order.
    stages = (
        (0.6, "Analyzing attachments...", attachments_chain),
        (0.7, "Analyzing Big Five traits...", bigfive_chain),
        (0.8, "Analyzing personalities...", personalities_chain),
    )

    answers = []
    for fraction, description, chain in stages:
        if progress:
            progress(fraction, desc=description)
        answers.append(_run_chain(chain, content))

    return tuple(answers)
75
+
76
def process_input(input_file, progress=None):
    """Process an uploaded file end to end and return the analysis results.

    Dispatches on the lower-cased file extension to the text, PDF or video
    handler, detects the language of the extracted content, runs the
    analysis chains, and times the whole operation.

    Args:
        input_file: An object whose ``name`` attribute is the path of the
            uploaded file.
        progress: Optional callback invoked as ``progress(fraction, desc=...)``
            to report status; skipped when falsy.

    Returns:
        A 7-tuple ``(status, execution_info, detected_language, input_info,
        attachments_answer, bigfive_answer, personalities_answer)``. For an
        unsupported extension the status is an error message and the other
        six items are ``None``.
    """
    started_at = time.time()

    def report(fraction, message):
        # Progress reporting is optional; do nothing without a callback.
        if progress:
            progress(fraction, desc=message)

    report(0, "Processing file...")

    suffix = os.path.splitext(input_file.name)[1].lower()
    if suffix in ('.mp4', '.avi', '.mov'):
        content, input_info = process_video(input_file, progress)
    elif suffix == '.pdf':
        content, input_info = process_pdf(input_file)
    elif suffix == '.txt':
        content, input_info = process_text(input_file)
    else:
        unsupported = "Unsupported file format. Please upload a TXT, PDF, or video file."
        return unsupported, None, None, None, None, None, None

    detected_language = detect_language(content)

    report(0.4, "Analyzing content...")
    attachments_answer, bigfive_answer, personalities_answer = analyze_content(content, progress)

    elapsed = time.time() - started_at
    execution_info = f"{elapsed:.2f} seconds"

    report(1.0, "Analysis complete!")

    return (
        "Analysis complete!",
        execution_info,
        detected_language,
        input_info,
        attachments_answer,
        bigfive_answer,
        personalities_answer,
    )