|
|
|
|
|
|
|
|
|
|
|
from transformers import pipeline
from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM, AutoTokenizer
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer

# The WordNetLemmatizer used below needs the WordNet corpus downloaded once
nltk.download('wordnet', quiet=True)
|
|
|
|
|
|
|
|
|
|
|
# Load the prepared topic dataset
data3 = pd.read_csv('final2.csv')
|
|
|
|
|
|
|
|
|
|
|
data3.info() |
|
|
|
|
|
|
|
|
|
|
|
data3.head() |
|
|
|
|
|
|
|
|
|
|
|
# Cast the text columns to pandas string dtype
# (note: the source CSV spells the description column 'discription')
data3['topic'] = data3.topic.astype("string")
data3['discription'] = data3.discription.astype("string")
data3['keyword'] = data3.keyword.astype("string")
data3['level'] = data3.level.astype("string")
data3.info()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build a single free-text 'tag' field from description, keywords and level
data3['tag'] = data3['discription'] + " " + data3['keyword'] + " " + data3['level']
|
|
|
|
|
|
|
|
|
|
|
def remove_symbols(text):
    # Lowercase and strip everything that is not a word character or whitespace
    pattern = r'[^\w\s]'
    return re.sub(pattern, '', text.lower())
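# Quick check on a made-up string (illustrative only, not from the dataset):
remove_symbols("Java: Beginner-to-Advance!")   # -> 'java beginnertoadvance'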
|
|
|
|
|
|
|
|
|
|
|
data3['tag'] = data3['tag'].fillna('')
data3['tag'] = data3['tag'].apply(remove_symbols)
data3['level'] = data3['level'].fillna('').apply(lambda x: x.replace(" ", ""))
data3['keyword'] = data3['keyword'].fillna('')
data3.head()
|
|
|
|
|
|
|
|
|
|
|
data3['tag'][0] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bag-of-words representation of the 'tag' text (top 5000 terms, English stop words removed)
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(data3['tag']).toarray()
|
|
|
|
|
|
|
|
|
|
|
vector[0] |
|
|
|
|
|
|
|
|
|
|
|
cv.get_feature_names_out() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ps = PorterStemmer()

# Helper applied to the 'tag' column below
def stem(text):
    return " ".join(ps.stem(word) for word in text.split())
|
|
|
|
|
|
|
|
|
|
|
def preprocess_query(query):
    # Lowercase
    cleaned_query = query.lower()

    # Remove punctuation
    import string
    punctuation = string.punctuation
    cleaned_query = ''.join([char for char in cleaned_query if char not in punctuation])

    # Drop a small set of common stop words
    stop_words = ["the", "a", "is", "in", "of"]
    cleaned_query = ' '.join([word for word in cleaned_query.split() if word not in stop_words])

    # Stem
    ps = PorterStemmer()
    cleaned_query = ' '.join([ps.stem(word) for word in cleaned_query.split()])

    # Lemmatize
    wnl = WordNetLemmatizer()
    cleaned_query = ' '.join([wnl.lemmatize(word) for word in cleaned_query.split()])

    return cleaned_query
|
|
|
|
|
|
|
|
|
|
|
preprocess_query('talked') |
|
|
|
|
|
|
|
|
|
|
|
preprocess_query('java james gosling website wikipedia document united states beginnertoadvance') |
|
|
|
|
|
|
|
|
|
|
|
data3['tag'] = data3['tag'].apply(stem)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Pairwise cosine similarity between every pair of rows in the bag-of-words matrix
similar = cosine_similarity(vector)
|
|
|
|
|
|
|
|
|
|
|
sorted(list(enumerate(similar[1])),reverse = True, key = lambda x: x[1])[0:5] |
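# Illustrative: map the top-5 matches for item 1 (computed above) back to topic names.
# Positions in `similar` follow the row order of data3.
for idx, score in sorted(list(enumerate(similar[1])), reverse=True, key=lambda x: x[1])[0:5]:
    print(data3['topic'].iloc[idx], round(float(score), 3))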
|
|
|
|
|
|
|
|
|
|
|
summarizer = pipeline("summarization", model="facebook/bart-base") |
|
text_generator = pipeline("text-generation", model="gpt2") |
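# Quick smoke test on a made-up sentence (illustrative only; facebook/bart-base is a
# base checkpoint, not fine-tuned for summarization, so the output will be rough)
summarizer("Java is an object oriented programming language created by James Gosling at Sun Microsystems.",
           max_length=25, truncation=True)[0]["summary_text"]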
|
|
|
|
|
|
|
|
|
|
|
# Build one preprocessed document per row (topic text + keywords) for TF-IDF
documents = []
for index, row in data3.iterrows():
    topic_description = preprocess_query(row["topic"])
    keywords = preprocess_query(row["keyword"])
    combined_text = f"{topic_description} {keywords}"
    documents.append(combined_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
vectorizer = TfidfVectorizer()

# Fit TF-IDF on the preprocessed documents
document_vectors = vectorizer.fit_transform(documents)


def recommend_from_dataset(query):
    # Preprocess the query the same way as the documents, then vectorize it
    cleaned_query = preprocess_query(query)
    query_vector = vectorizer.transform([cleaned_query])

    # Cosine similarity between the query and every document
    cosine_similarities = cosine_similarity(query_vector, document_vectors)
    similarity_scores = cosine_similarities.flatten()

    # Rank documents by similarity (highest first)
    sorted_results = sorted(zip(similarity_scores, data3.index, range(len(documents))), reverse=True)

    # Keep the top 5 results whose similarity clears a 0.3 threshold
    top_n_results = sorted_results[:5]
    recommendations = []
    for result in top_n_results:
        score = result[0]
        document_id = result[1]
        topic_name = data3.loc[document_id, "topic"]
        link = data3.loc[document_id, "Links"] if "Links" in data3.columns else "No link available"
        if score >= 0.3:
            recommendations.append({"topic_name": topic_name, "link": link, "score": score})
    return recommendations
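# Illustrative quick check with a made-up query (the full pipeline runs further below)
recommend_from_dataset("java basics for beginners")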
|
|
|
|
|
|
|
|
|
|
|
def fine_tune_model(model_name, train_dataset, validation_dataset, epochs=3):
    # Load a pretrained seq2seq model and its tokenizer
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        save_steps=10_000,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    return model
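# Illustrative only (not part of the original flow): fine_tune_model expects already
# tokenized Hugging Face datasets. A minimal sketch, assuming a hypothetical CSV
# 'finetune_pairs.csv' with 'query' and 'target' columns; left commented out because
# no such file ships with this notebook.
#
# from datasets import load_dataset
#
# raw = load_dataset("csv", data_files="finetune_pairs.csv", split="train").train_test_split(test_size=0.1)
# tok = AutoTokenizer.from_pretrained("facebook/bart-base")
#
# def tokenize(batch):
#     enc = tok(batch["query"], truncation=True, padding="max_length", max_length=64)
#     enc["labels"] = tok(batch["target"], truncation=True, padding="max_length", max_length=64)["input_ids"]
#     return enc
#
# train_ds = raw["train"].map(tokenize, batched=True)
# val_ds = raw["test"].map(tokenize, batched=True)
# fine_tuned_model = fine_tune_model("facebook/bart-base", train_ds, val_ds, epochs=1)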
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def summarize_and_generate(user_query, recommendations):
    # Summarize the query text
    query_summary = summarizer(user_query, max_length=100, truncation=True)[0]["summary_text"]

    # Generate a short piece of text expanding on the query
    generated_text = text_generator(f"Exploring the concept of {user_query}", max_length=100, num_return_sequences=1)[0]["generated_text"]

    # Collect the recommended topics, links and scores
    related_links = []
    for recommendation in recommendations:
        related_links.append({"topic": recommendation["topic_name"], "link": recommendation["link"], "score": recommendation["score"]})

    return {
        "query_summary": query_summary.strip(),
        "generated_text": generated_text.strip(),
        "related_links": related_links
    }
|
|
|
|
|
|
|
|
|
|
|
user_query = "java by james goslin"
recommendations = recommend_from_dataset(user_query)

results = summarize_and_generate(user_query, recommendations)

print(f"Query Summary: {results['query_summary']}")
print(f"Creative Text: {results['generated_text']}")
print("Some Related Links for your query:")
for link in results["related_links"]:
    print(f"- {link['topic']}:\n {link['link']} : \n Score: {link['score']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|