File size: 6,811 Bytes
0bdb125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
import convokit
import re
import matplotlib.pyplot as plt
import json
import random


class DialogProcessor:
    """Load a Convokit movie corpus, clean its utterances, group them by
    conversation, and persist the result with simple token statistics."""

    # Contraction -> expansion table used by preprocess_text().
    CONTRACTIONS = {
        "i'm": "i am", "he's": "he is", "she's": "she is", "that's": "that is",
        "what's": "what is", "where's": "where is", "who's": "who is", "how's": "how is",
        "it's": "it is", "let's": "let us", "they're": "they are", "we're": "we are",
        "you're": "you are", "i've": "i have", "you've": "you have", "we've": "we have",
        "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would",
        "she'd": "she would", "we'd": "we would", "they'd": "they would", "i'll": "i will",
        "you'll": "you will", "he'll": "he will", "she'll": "she will", "we'll": "we will",
        "they'll": "they will", "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "won't": "will not", "wouldn't": "would not", "can't": "cannot", "couldn't": "could not",
        "shouldn't": "should not", "mightn't": "might not", "mustn't": "must not",
        "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
        "haven't": "have not", "hasn't": "has not", "hadn't": "had not"
    }

    def __init__(self):
        # Convokit corpus object; populated by load_corpus().
        self.corpus = None

    def load_corpus(self, corpus_path):
        """Load the corpus at *corpus_path*, or download the Convokit
        'movie-corpus' if that path does not exist."""
        if os.path.exists(corpus_path):
            self.corpus = convokit.Corpus(filename=corpus_path)
        else:
            print("Corpus not found locally. Downloading from Convokit...")
            self.corpus = convokit.Corpus(filename=convokit.download('movie-corpus'))

    @staticmethod
    def preprocess_text(text):
        """Clean *text*: strip XML-like tags and double quotes, normalize
        whitespace, lowercase, expand contractions, and remove every
        character that is not a letter, digit, apostrophe, or space.

        Returns "" for falsy input (None or empty string).
        """
        if not text:
            return ""  # handle missing data gracefully

        cleaned = re.sub(r'<[^>]+>', '', text)  # remove XML-like tags, e.g. <u>...</u>
        cleaned = cleaned.replace('"', '')      # drop double quotes
        cleaned = re.sub(r"\s+", " ", cleaned).strip().lower()

        # Expand contractions as whole words only (\b keeps "can't" from
        # matching inside a longer token).
        for contraction, expansion in DialogProcessor.CONTRACTIONS.items():
            cleaned = re.sub(r"\b" + re.escape(contraction) + r"\b", expansion, cleaned)

        # Keep letters, digits, apostrophes, and spaces; everything else
        # becomes a space, then runs of spaces are collapsed.
        cleaned = re.sub(r"[^a-zA-Z0-9' ]", " ", cleaned)
        return re.sub(r"\s+", " ", cleaned).strip()

    def group_conversations(self):
        """Group cleaned utterances by conversation.

        Returns a dict mapping string ids ("1", "2", ...) to lists of
        cleaned utterance strings. Conversations shorter than 4 utterances
        are pooled under key "0", each as its own list of strings.

        Raises ValueError if no corpus has been loaded.
        """
        if self.corpus is None:
            raise ValueError("Corpus is not loaded.")

        grouped_dialogues = {}   # {conversation_id: [cleaned utterance, ...]}
        misc_dialogues = []      # conversations with fewer than 4 utterances
        next_dialog_id = 1       # numbering starts at 1; "0" is reserved for misc

        for conversation_id in self.corpus.get_conversation_ids():
            conversation = self.corpus.get_conversation(conversation_id)

            # Clean every utterance that has usable text. (Fix: the previous
            # range(len(...) - 1) loop silently dropped the last utterance
            # of every conversation, losing one exchange each time.)
            current_dialog = []
            for utterance_id in conversation.get_utterance_ids():
                utterance = self.corpus.get_utterance(utterance_id)
                if not utterance.text:
                    continue
                current_dialog.append(self.preprocess_text(utterance.text))

            # Keep only conversations with at least 4 utterances as numbered
            # entries; shorter ones go to the miscellaneous pool.
            if len(current_dialog) >= 4:
                grouped_dialogues[str(next_dialog_id)] = current_dialog
                next_dialog_id += 1
            else:
                misc_dialogues.append(current_dialog)

        # NOTE(review): entry "0" holds lists of strings (one list per short
        # conversation), unlike the numbered entries which hold flat lists
        # of strings — downstream code must tolerate both shapes.
        grouped_dialogues["0"] = list(misc_dialogues)

        print(f"Processed {len(grouped_dialogues)} grouped conversations, including {len(misc_dialogues)} miscellaneous.")
        return grouped_dialogues

    def plot_token_statistics(self, conversation_averages, conversation_top):
        """Plot per-sentence token counts alongside the top-count series.

        Both arguments are sequences of ints plotted against their 1-based
        index on a shared x-axis.
        """
        plt.figure(figsize=(10, 5))
        plt.plot(range(1, len(conversation_averages) + 1), conversation_averages,
                 marker='o', linestyle='-', label="Average Tokens per Input")
        plt.plot(range(1, len(conversation_top) + 1), conversation_top,
                 marker='s', linestyle='--', label="Top Token Count")
        plt.xlabel("Total Number of Conversations")
        plt.ylabel("Token Count")
        plt.title("Token Statistics Over Conversations")
        plt.legend()
        plt.grid(True)
        plt.show()

    def save_grouped_conversations(self, grouped_dialogues):
        """Write *grouped_dialogues* to preprocessed_dialogs.py as adjacent
        (input, response) pairs, then print and plot token statistics."""
        token_lists = []  # one whitespace-token list per sentence, all groups
        with open('preprocessed_dialogs.py', 'w', encoding="utf-8") as f:
            f.write("dialog_data = {\n")
            for key, sentences in grouped_dialogues.items():
                # str() guards the misc entries under key "0", whose elements
                # are lists rather than strings.
                for sentence in sentences:
                    token_lists.append(str(sentence).split(" "))
                # Pair each sentence with its successor: (input, response).
                pairs = [(sentences[i], sentences[i + 1]) for i in range(len(sentences) - 1)]
                f.write(f'  "{key}": {pairs},\n')
            f.write("}")

        # Guard against an empty corpus: avoids ZeroDivisionError below.
        if not token_lists:
            print("No sentences to analyze.")
            return

        token_counts = [len(tokens) for tokens in token_lists]

        # Running maximum, so the "top" series is monotone and shares the
        # x-axis with the per-sentence counts. (Fix: the previous loop padded
        # the series with zeros and recomputed max() on every iteration.)
        running_top = []
        best = 0
        for count in token_counts:
            best = max(best, count)
            running_top.append(best)

        print(token_counts)
        print(f"Top Tokens: {best}")
        print(f"Average:  {sum(token_counts) / len(token_counts)}  ")
        self.plot_token_statistics(token_counts, running_top)


if __name__ == "__main__":
    processor = DialogProcessor()

    # Local corpus location; load_corpus() falls back to downloading the
    # Convokit 'movie-corpus' when this path does not exist.
    local_corpus_path = "D:\\movie-corpus"  # Change to your actual local path
    processor.load_corpus(local_corpus_path)

    # Group the cleaned utterances, then write them out as a Python module
    # (preprocessed_dialogs.py) and report token statistics.
    conversations = processor.group_conversations()
    processor.save_grouped_conversations(conversations)