Roshanik committed on
Commit ff0af81 · verified · 1 Parent(s): 329f9b2

Upload data_preprocess.py

Files changed (1)
  1. data_preprocess.py +319 -0
data_preprocess.py ADDED
@@ -0,0 +1,319 @@
# In[1]:

from transformers import pipeline
from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM, AutoTokenizer

# In[2]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
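
# Note: the lemmatization step further below relies on NLTK's WordNet corpus; if it has
# not been downloaded yet, a one-time setup such as the following is needed:
# nltk.download('wordnet')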

# In[47]:

data3 = pd.read_csv('final2.csv')

# In[5]:

data3.info()

# In[6]:

data3.head()

# In[9]:

data3['topic'] = data3.topic.astype("string")
data3['discription'] = data3.discription.astype("string")
data3['keyword'] = data3.keyword.astype("string")
data3['level'] = data3.level.astype("string")
data3.info()

# # Data Cleaning Process

# In[10]:

data3['tag'] = data3['discription'] + " " + data3['keyword'] + " " + data3['level']

# In[11]:

def remove_symbols(text):
    # Create a regular expression pattern to match unwanted symbols
    pattern = r'[^\w\s]'  # Matches characters that are not alphanumeric or whitespace
    # Substitute matched symbols with an empty string
    return re.sub(pattern, '', text.lower())

# In[12]:

data3['tag'] = data3['tag'].fillna('')
data3['tag'] = data3['tag'].apply(remove_symbols)
data3['level'] = data3['level'].apply(lambda x: x.replace(" ", ""))
data3['keyword'] = data3['keyword'].fillna('')
data3.head()

# In[13]:

data3['tag'][0]

# # Convert the tag column into vectors

# In[14]:

cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(data3['tag']).toarray()

# In[15]:

vector[0]

# In[16]:

cv.get_feature_names_out()

# # Stemming and Lemmatization Process

# In[18]:

ps = PorterStemmer()

# In[30]:

def preprocess_query(query):
    # Lowercase the query
    cleaned_query = query.lower()

    # Remove punctuation (adjust as needed)
    import string
    punctuation = string.punctuation
    cleaned_query = ''.join([char for char in cleaned_query if char not in punctuation])

    # Remove stop words (optional, replace with your own stop-word list)
    stop_words = ["the", "a", "is", "in", "of"]
    cleaned_query = ' '.join([word for word in cleaned_query.split() if word not in stop_words])

    # Stemming
    ps = PorterStemmer()
    cleaned_query = ' '.join([ps.stem(word) for word in cleaned_query.split()])

    # Lemmatization
    wnl = WordNetLemmatizer()
    cleaned_query = ' '.join([wnl.lemmatize(word) for word in cleaned_query.split()])

    return cleaned_query

# In[32]:

preprocess_query('talked')

# In[31]:

preprocess_query('java james gosling website wikipedia document united states beginnertoadvance')

# In[23]:

data3['tag'] = data3['tag'].apply(preprocess_query)  # stem and lemmatize the tag column

# # Find similarity scores for the most related topics in the dataset

# In[24]:

similar = cosine_similarity(vector)

# In[27]:

sorted(list(enumerate(similar[1])), reverse=True, key=lambda x: x[1])[0:5]
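
# A minimal sketch of how to read the scores above: map each (index, score) pair back to
# its topic name via data3, skipping the item itself (assumes `similar` and `data3` as
# defined in the earlier cells).
top_matches = sorted(list(enumerate(similar[1])), reverse=True, key=lambda x: x[1])[1:6]
for idx, score in top_matches:
    print(data3['topic'].iloc[idx], round(float(score), 3))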

# In[29]:

summarizer = pipeline("summarization", model="facebook/bart-base")
text_generator = pipeline("text-generation", model="gpt2")

# In[34]:

documents = []
for index, row in data3.iterrows():
    topic_description = preprocess_query(row["topic"])
    keywords = preprocess_query(row["keyword"])
    combined_text = f"{topic_description} {keywords}"  # Combine for TF-IDF
    documents.append(combined_text)

# In[35]:

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the documents
document_vectors = vectorizer.fit_transform(documents)

def recommend_from_dataset(query):
    cleaned_query = preprocess_query(query)
    query_vector = vectorizer.transform([cleaned_query])

    # Calculate cosine similarity between the query and the documents
    cosine_similarities = cosine_similarity(query_vector, document_vectors)
    similarity_scores = cosine_similarities.flatten()

    # Sort documents based on similarity scores
    sorted_results = sorted(zip(similarity_scores, data3.index, range(len(documents))), reverse=True)

    # Return the top N recommendations with scores, topic names, and links (if available)
    top_n_results = sorted_results[:5]
    recommendations = []
    for result in top_n_results:
        score = result[0]
        document_id = result[1]
        topic_name = data3.loc[document_id, "topic"]
        link = data3.loc[document_id, "Links"] if "Links" in data3.columns else "No link available"
        if score >= 0.3:
            recommendations.append({"topic_name": topic_name, "link": link, "score": score})
    return recommendations

# In[36]:

def fine_tune_model(model_name, train_dataset, validation_dataset, epochs=3):
    # Load model and tokenizer
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Define training arguments (adjust parameters as needed)
    training_args = TrainingArguments(
        output_dir="./results",  # Adjust output directory
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        save_steps=10_000,
    )

    # Create a Trainer instance for fine-tuning
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    return model

# In[39]:

# train_dataset = ...       # Prepare your training dataset
# validation_dataset = ...  # Prepare your validation dataset

# Fine-tune the model (replace the model name if needed)
# fine_tuned_model = fine_tune_model("facebook/bart-base", train_dataset, validation_dataset)

# Update the summarization pipeline with the fine-tuned model and its tokenizer
# summarizer1 = pipeline("summarization", model=fine_tuned_model, tokenizer="facebook/bart-base")

# In[45]:

def summarize_and_generate(user_query, recommendations):
    # Summarize the user query
    query_summary = summarizer(user_query, max_length=100, truncation=True)[0]["summary_text"]

    # Generate creative text related to the query
    generated_text = text_generator(f"Exploring the concept of {user_query}", max_length=100, num_return_sequences=1)[0]["generated_text"]

    # Extract related links with scores
    related_links = []
    for recommendation in recommendations:
        related_links.append({"topic": recommendation["topic_name"], "link": recommendation["link"], "score": recommendation["score"]})

    return {
        "query_summary": query_summary.strip(),
        "generated_text": generated_text.strip(),
        "related_links": related_links
    }

# In[46]:

user_query = "java by james goslin"
recommendations = recommend_from_dataset(user_query)

# Get the summary, generated text, and related links
results = summarize_and_generate(user_query, recommendations)

print(f"Query Summary: {results['query_summary']}")
print(f"Creative Text: {results['generated_text']}")
print("Some related links for your query:")
for link in results["related_links"]:
    print(f"- {link['topic']}:\n  {link['link']}\n  Score: {link['score']}")

# In[ ]: