Create analyzer.py
analyzer.py +135 -0
analyzer.py
ADDED
@@ -0,0 +1,135 @@
# analyzer.py (Key Point Extraction)
from transformers import pipeline
import re
from datetime import datetime, timedelta
import config

# Load NLP model
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum")

class MeetingAnalyzer:
    def __init__(self):
        self.transcript_chunks = []
        self.speakers = {}
        self.current_speaker = "Unknown"
        self.action_items = []
        self.decisions = []

    def process_chunk(self, text_chunk):
        self.transcript_chunks.append(text_chunk)

        # Simple speaker detection
        if ":" in text_chunk:
            speaker, content = text_chunk.split(":", 1)
            self.current_speaker = speaker.strip()
            if self.current_speaker not in self.speakers:
                self.speakers[self.current_speaker] = []
            self.speakers[self.current_speaker].append(content.strip())

    def generate_summary(self):
        full_text = " ".join(self.transcript_chunks)

        # If text is too short, skip summarization
        if len(full_text.split()) < 50:
            return "Not enough content for summary"

        # Generate summary in chunks for long meetings
        max_chunk_size = 1000
        chunks = [full_text[i:i+max_chunk_size] for i in range(0, len(full_text), max_chunk_size)]

        summaries = []
        for chunk in chunks:
            summary = summarizer(
                chunk,
                max_length=config.SUMMARY_MAX_LENGTH,
                min_length=config.SUMMARY_MIN_LENGTH,
                do_sample=False
            )[0]['summary_text']
            summaries.append(summary)

        return " ".join(summaries)

    def extract_action_items(self):
        full_text = " ".join(self.transcript_chunks)
        action_items = []

        # Pattern matching for action items
        patterns = [
            r"(\bwill\b.*?\bby\b\s+\w+\s+\d{1,2})",
            r"(\baction\b:\s*(.*?)(?:\bdeadline\b|\bby\b|\bfor\b)\s*(\w+\s+\d{1,2}))",
            r"(\btodo\b:\s*(.*?)(?:\bdue\b|\bby\b)\s*(\w+\s+\d{1,2}))",
            r"(\bassign(?:ed)? to\b\s+(\w+):\s*(.*?)(?:\bdeadline\b|\bby\b)\s*(\w+\s+\d{1,2}))"
        ]

        for pattern in patterns:
            for match in re.finditer(pattern, full_text, re.IGNORECASE):
                groups = match.groups()
                if groups:
                    # Group layouts differ per pattern:
                    #   1 group  -> (task)
                    #   3 groups -> (full match, task, deadline)
                    #   4 groups -> (full match, owner, task, deadline)
                    if len(groups) == 1:
                        task = groups[0]
                        owner = "Unassigned"
                        deadline = "ASAP"
                    elif len(groups) == 3:
                        task = groups[1]
                        owner = "Unassigned"
                        deadline = groups[2]
                    else:  # 4 groups: the "assigned to" pattern
                        owner = groups[1]
                        task = groups[2]
                        deadline = groups[3]

                    action_items.append({
                        "task": task.strip(),
                        "owner": owner.strip(),
                        "deadline": self.normalize_deadline(deadline.strip())
                    })

        # Persist so detect_urgent_action_items() can see the results
        self.action_items = action_items
        return action_items

    def detect_urgent_action_items(self):
        urgent_items = []
        for item in self.action_items:
            if "urgent" in item['task'].lower() or "asap" in item['deadline'].lower():
                urgent_items.append(item)
        return urgent_items

    def extract_decisions(self):
        full_text = " ".join(self.transcript_chunks)
        decisions = []

        # Pattern matching for decisions
        patterns = [
            r"\bdecided to\b (.*?)[\.\n]",
            r"\bagreed that\b (.*?)[\.\n]",
            r"\bconsensus is\b (.*?)[\.\n]",
            r"\bresolution\b: (.*?)[\.\n]"
        ]

        for pattern in patterns:
            for match in re.finditer(pattern, full_text, re.IGNORECASE):
                decision = match.group(1).strip()
                decisions.append(decision)

        self.decisions = decisions  # keep a copy on the instance, mirroring action items
        return decisions

    def normalize_deadline(self, deadline_str):
        today = datetime.now()
        lower_str = deadline_str.lower()

        if "today" in lower_str:
            return today.strftime("%Y-%m-%d")
        elif "tomorrow" in lower_str:
            return (today + timedelta(days=1)).strftime("%Y-%m-%d")
        elif "next week" in lower_str:
            return (today + timedelta(weeks=1)).strftime("%Y-%m-%d")
        elif "eod" in lower_str:
            return today.strftime("%Y-%m-%d")
        elif "eow" in lower_str:
            # Find next Friday
            days_ahead = 4 - today.weekday()  # 0 = Monday, 4 = Friday
            if days_ahead <= 0:  # If today is Friday or weekend
                days_ahead += 7
            return (today + timedelta(days=days_ahead)).strftime("%Y-%m-%d")

        return deadline_str
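
The module imports a `config` that is not part of this commit; judging from the attributes referenced in `generate_summary`, it presumably defines two summary-length constants. A minimal sketch, assuming only those two names (the values here are illustrative placeholders, not from the source):

# config.py (assumed contents -- not included in this commit)
SUMMARY_MAX_LENGTH = 150  # max tokens per summary chunk (placeholder value)
SUMMARY_MIN_LENGTH = 30   # min tokens per summary chunk (placeholder value)

And a quick usage sketch of the class as committed; the transcript lines are invented examples chosen to trigger the action-item and decision patterns:

# demo.py (hypothetical driver, not part of this commit)
from analyzer import MeetingAnalyzer

analyzer = MeetingAnalyzer()
for line in [
    "Alice: We decided to ship the beta next week.",
    "Bob: Action: update the changelog by June 5",
    "Carol: Assigned to Dana: fix the urgent login bug by June 3",
]:
    analyzer.process_chunk(line)

print(analyzer.extract_action_items())       # also populates self.action_items
print(analyzer.detect_urgent_action_items()) # matches "urgent" in the third task
print(analyzer.extract_decisions())          # ["ship the beta next week"]
print(analyzer.generate_summary())           # fallback string: transcript is under 50 words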