# File size: 6,811 Bytes
# 0bdb125
import os
import convokit
import re
import matplotlib.pyplot as plt
import json
import random
class DialogProcessor:
    """Load a Convokit movie-dialog corpus, clean its utterances, and group
    them into numbered conversations for downstream dialog-model training.

    Typical usage: ``load_corpus`` -> ``group_conversations`` ->
    ``save_grouped_conversations``.
    """

    # Contraction -> expansion table used by preprocess_text.
    _CONTRACTIONS = {
        "i'm": "i am", "he's": "he is", "she's": "she is", "that's": "that is",
        "what's": "what is", "where's": "where is", "who's": "who is", "how's": "how is",
        "it's": "it is", "let's": "let us", "they're": "they are", "we're": "we are",
        "you're": "you are", "i've": "i have", "you've": "you have", "we've": "we have",
        "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would",
        "she'd": "she would", "we'd": "we would", "they'd": "they would", "i'll": "i will",
        "you'll": "you will", "he'll": "he will", "she'll": "she will", "we'll": "we will",
        "they'll": "they will", "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "won't": "will not", "wouldn't": "would not", "can't": "cannot", "couldn't": "could not",
        "shouldn't": "should not", "mightn't": "might not", "mustn't": "must not",
        "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
        "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
    }
    # One combined, precompiled pattern so each text is scanned once instead of
    # once per contraction (and the ~45 regexes are not recompiled per call).
    # A single left-to-right pass is equivalent to the old sequential passes
    # because no expansion text contains another contraction.
    _CONTRACTION_RE = re.compile(
        r"\b(" + "|".join(re.escape(c) for c in _CONTRACTIONS) + r")\b"
    )

    def __init__(self):
        # Convokit Corpus instance; populated by load_corpus().
        self.corpus = None

    def load_corpus(self, corpus_path):
        """Load the corpus from ``corpus_path`` if it exists locally;
        otherwise download the Convokit 'movie-corpus' dataset."""
        if os.path.exists(corpus_path):
            self.corpus = convokit.Corpus(filename=corpus_path)
        else:
            print("Corpus not found locally. Downloading from Convokit...")
            self.corpus = convokit.Corpus(filename=convokit.download('movie-corpus'))

    @staticmethod
    def preprocess_text(text):
        """Clean text by removing tags, quotes, extra spaces, and expanding contractions.

        Returns "" for None/empty input. Output is lowercase and contains only
        letters, digits, apostrophes, and single spaces.
        """
        if not text:
            return ""  # Handle missing data gracefully
        # Remove XML-like tags (more general than just <u>...</u>).
        cleaned = re.sub(r'<[^>]+>', '', text)
        # Remove double quotes.
        cleaned = cleaned.replace('"', '')
        # Normalize whitespace runs to single spaces.
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        txt = cleaned.lower()
        # Expand contractions in a single pass with the precompiled pattern.
        txt = DialogProcessor._CONTRACTION_RE.sub(
            lambda m: DialogProcessor._CONTRACTIONS[m.group(1)], txt
        )
        # Keep letters, digits, apostrophes, and spaces; everything else -> space.
        txt = re.sub(r"[^a-zA-Z0-9' ]", " ", txt)
        return re.sub(r"\s+", " ", txt).strip()

    def group_conversations(self):
        """Group cleaned utterances into numbered conversations.

        Returns a dict mapping conversation number (as a string, starting at
        "1") to a list of cleaned utterance strings. Conversations with fewer
        than 4 usable utterances are collected under key "0" as a list of
        lists. The last utterance of every conversation is dropped — it has
        no following response to pair with downstream.

        Raises:
            ValueError: if no corpus has been loaded.
        """
        if self.corpus is None:
            raise ValueError("Corpus is not loaded.")
        grouped_dialogues = {}   # {conversation_number: [utterance, ...]}
        misc_dialogues = []      # conversations shorter than 4 exchanges
        current_dialog_id = 1    # numbering starts at 1; "0" is reserved for misc
        for conversation_id in self.corpus.get_conversation_ids():
            conversation = self.corpus.get_conversation(conversation_id)
            utterance_ids = conversation.get_utterance_ids()
            current_dialog = []
            # Skip the final utterance: it has no response to pair with.
            for utt_id in utterance_ids[:-1]:
                utterance = self.corpus.get_utterance(utt_id)
                if not utterance.text:  # skip empty/missing text
                    continue
                current_dialog.append(self.preprocess_text(utterance.text))
            if len(current_dialog) >= 4:  # keep only convos with >= 4 exchanges
                grouped_dialogues[str(current_dialog_id)] = current_dialog
                current_dialog_id += 1
            else:
                misc_dialogues.append(current_dialog)
        grouped_dialogues["0"] = list(misc_dialogues)
        print(f"Processed {len(grouped_dialogues)} grouped conversations, including {len(misc_dialogues)} miscellaneous.")
        return grouped_dialogues

    def plot_token_statistics(self, conversation_averages, conversation_top):
        """Plot per-sentence token counts against the running top token count.

        Both series are expected to have the same length so they share an
        x-axis (one point per sentence written out).
        """
        plt.figure(figsize=(10, 5))
        plt.plot(range(1, len(conversation_averages) + 1), conversation_averages,
                 marker='o', linestyle='-', label="Average Tokens per Input")
        plt.plot(range(1, len(conversation_top) + 1), conversation_top,
                 marker='s', linestyle='--', label="Top Token Count")
        plt.xlabel("Total Number of Conversations")
        plt.ylabel("Token Count")
        plt.title("Token Statistics Over Conversations")
        plt.legend()
        plt.grid(True)
        plt.show()

    def save_grouped_conversations(self, grouped_dialogues):
        """Write grouped dialogues to 'preprocessed_dialogs.py' as overlapping
        (input, response) pairs, then print and plot token statistics.

        The output file defines a ``dialog_data`` dict literal whose values
        are sliding pairs (s0, s1), (s1, s2), ... of each conversation.
        """
        token_counts = []  # tokens per written sentence, in write order
        with open('preprocessed_dialogs.py', 'w', encoding="utf-8") as f:
            f.write("dialog_data = {\n")
            for key, sentences in grouped_dialogues.items():
                # NOTE(review): entries under key "0" are lists, so str()
                # yields their repr and these token counts are approximate —
                # kept for parity with the original statistics.
                for sentence in sentences:
                    token_counts.append(len(str(sentence).split(" ")))
                # Overlapping sliding pairs: (s0, s1), (s1, s2), ...
                organized = [(sentences[i], sentences[i + 1]) for i in range(len(sentences) - 1)]
                f.write(f'    "{key}": {organized},\n')
            f.write("}")
        if not token_counts:
            # Nothing to summarize; avoids max()/division-by-zero on empty input.
            print("No sentences found; skipping statistics.")
            return
        # Running maximum per sentence (same length as token_counts) so both
        # series can be plotted against a common x-axis.
        running_top = []
        peak = 0
        for count in token_counts:
            peak = max(peak, count)
            running_top.append(peak)
        print(list(token_counts))
        data = sum(token_counts) / len(token_counts)
        print(f"Top Tokens: {peak}")
        print(f"Average: {data} ")
        self.plot_token_statistics(token_counts, running_top)
if __name__ == "__main__":
    processor = DialogProcessor()
    # Point this at a local copy of the corpus; when the path is missing,
    # load_corpus falls back to downloading 'movie-corpus' via Convokit.
    local_corpus_path = "D:\\movie-corpus"
    processor.load_corpus(local_corpus_path)
    dialogues = processor.group_conversations()
    # Persist the grouped dialogues to preprocessed_dialogs.py and show stats.
    processor.save_grouped_conversations(dialogues)