File size: 30,826 Bytes

de83feb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ecbb6eac",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import (\n",
    "    AutoModelForSeq2SeqLM,\n",
    "    AutoTokenizer,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    "    pipeline,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a4bac354",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import nltk\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "import re\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from fuzzywuzzy import fuzz\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "bfe7183c",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Load the topic catalogue from the current working directory; later cells read its\n",
    "# topic, discription, keyword, Links and level columns.\n",
    "data3 = pd.read_csv('final2.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "22f9643b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3720 entries, 0 to 3719\n",
      "Data columns (total 6 columns):\n",
      " #   Column       Non-Null Count  Dtype \n",
      "---  ------       --------------  ----- \n",
      " 0   Unnamed: 0   3720 non-null   int64 \n",
      " 1   topic        3720 non-null   object\n",
      " 2   discription  1748 non-null   object\n",
      " 3   keyword      3204 non-null   object\n",
      " 4   Links        3720 non-null   object\n",
      " 5   level        3720 non-null   object\n",
      "dtypes: int64(1), object(5)\n",
      "memory usage: 174.5+ KB\n"
     ]
    }
   ],
   "source": [
    "data3.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6ef84197",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>topic</th>\n",
       "      <th>discription</th>\n",
       "      <th>keyword</th>\n",
       "      <th>Links</th>\n",
       "      <th>level</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Java</td>\n",
       "      <td>Java is a general-purpose computer programming...</td>\n",
       "      <td>Java, James Gosling, website, wikipedia, docum...</td>\n",
       "      <td>website: https://oracle.com/java/, documentati...</td>\n",
       "      <td>beginner to advance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>JavaScript</td>\n",
       "      <td>JavaScript (), often abbreviated as JS, is a h...</td>\n",
       "      <td>JavaScript, Brendan Eich, reference, wikipedia...</td>\n",
       "      <td>reference: https://www.w3schools.com/js/js_res...</td>\n",
       "      <td>beginner to advance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>C</td>\n",
       "      <td>C (, as in the letter c) is a general-purpose,...</td>\n",
       "      <td>C, Dennis Ritchie, reference, wikipedia, docum...</td>\n",
       "      <td>reference: http://www.c4learn.com/c-programmin...</td>\n",
       "      <td>beginner to advance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Python</td>\n",
       "      <td>Python is a widely used high-level programming...</td>\n",
       "      <td>Python, Guido van Rossum, website, reference, ...</td>\n",
       "      <td>website: https://www.python.org/, reference: h...</td>\n",
       "      <td>beginner to advance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>SQL</td>\n",
       "      <td>SQL ( ( listen) ESS-kew-EL or  ( listen) SEE-k...</td>\n",
       "      <td>SQL, Donald D. Chamberlin and Raymond F. Boyce...</td>\n",
       "      <td>documentation: https://docs.data.world/documen...</td>\n",
       "      <td>beginner to advance</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0       topic                                        discription  \\\n",
       "0           0        Java  Java is a general-purpose computer programming...   \n",
       "1           1  JavaScript  JavaScript (), often abbreviated as JS, is a h...   \n",
       "2           2           C  C (, as in the letter c) is a general-purpose,...   \n",
       "3           3      Python  Python is a widely used high-level programming...   \n",
       "4           4         SQL  SQL ( ( listen) ESS-kew-EL or  ( listen) SEE-k...   \n",
       "\n",
       "                                             keyword  \\\n",
       "0  Java, James Gosling, website, wikipedia, docum...   \n",
       "1  JavaScript, Brendan Eich, reference, wikipedia...   \n",
       "2  C, Dennis Ritchie, reference, wikipedia, docum...   \n",
       "3  Python, Guido van Rossum, website, reference, ...   \n",
       "4  SQL, Donald D. Chamberlin and Raymond F. Boyce...   \n",
       "\n",
       "                                               Links                level  \n",
       "0  website: https://oracle.com/java/, documentati...  beginner to advance  \n",
       "1  reference: https://www.w3schools.com/js/js_res...  beginner to advance  \n",
       "2  reference: http://www.c4learn.com/c-programmin...  beginner to advance  \n",
       "3  website: https://www.python.org/, reference: h...  beginner to advance  \n",
       "4  documentation: https://docs.data.world/documen...  beginner to advance  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "acf74e04",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3720 entries, 0 to 3719\n",
      "Data columns (total 6 columns):\n",
      " #   Column       Non-Null Count  Dtype \n",
      "---  ------       --------------  ----- \n",
      " 0   Unnamed: 0   3720 non-null   int64 \n",
      " 1   topic        3720 non-null   string\n",
      " 2   discription  1748 non-null   string\n",
      " 3   keyword      3720 non-null   string\n",
      " 4   Links        3720 non-null   object\n",
      " 5   level        3720 non-null   string\n",
      "dtypes: int64(1), object(1), string(4)\n",
      "memory usage: 174.5+ KB\n"
     ]
    }
   ],
   "source": [
    "# Switch the free-text columns to pandas' nullable string dtype, one column at a time.\n",
    "for text_column in (\"topic\", \"discription\", \"keyword\", \"level\"):\n",
    "    data3[text_column] = data3[text_column].astype(\"string\")\n",
    "data3.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64f90df1",
   "metadata": {},
   "source": [
    "# Data Cleaning Process\n",
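    "\n",
    "Combine the `discription`, `keyword` and `level` columns into a single `tag` text column, then lower-case it and strip punctuation.\n"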
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b16989a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# With the nullable string dtype, <NA> + text is <NA>, so a row that lacks a description\n",
    "# (or keyword) would lose its whole tag. Treat missing parts as empty text instead.\n",
    "# The level is collapsed to a single token (\"beginner to advance\" -> \"beginnertoadvance\")\n",
    "# here so the tag does not depend on whether the level column was already normalised.\n",
    "data3['tag'] = (\n",
    "    data3['discription'].fillna('')\n",
    "    + \" \" + data3['keyword'].fillna('')\n",
    "    + \" \" + data3['level'].fillna('').str.replace(\" \", \"\", regex=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "caa02729",
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_symbols(text):\n",
    "  # Create a regular expression pattern to match unwanted symbols\n",
    "    pattern = r'[^\\w\\s]'  # Matches characters that are not alphanumeric or whitespace\n",
    "  # Substitute matched symbols with an empty string\n",
    "    return re.sub(pattern, '', text.lower()) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a97fa574",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>topic</th>\n",
       "      <th>discription</th>\n",
       "      <th>keyword</th>\n",
       "      <th>Links</th>\n",
       "      <th>level</th>\n",
       "      <th>tag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Java</td>\n",
       "      <td>Java is a general-purpose computer programming...</td>\n",
       "      <td>Java, James Gosling, website, wikipedia, docum...</td>\n",
       "      <td>website: https://oracle.com/java/, documentati...</td>\n",
       "      <td>beginnertoadvance</td>\n",
       "      <td>java is a generalpurpose computer programming ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>JavaScript</td>\n",
       "      <td>JavaScript (), often abbreviated as JS, is a h...</td>\n",
       "      <td>JavaScript, Brendan Eich, reference, wikipedia...</td>\n",
       "      <td>reference: https://www.w3schools.com/js/js_res...</td>\n",
       "      <td>beginnertoadvance</td>\n",
       "      <td>javascript  often abbreviated as js is a highl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>C</td>\n",
       "      <td>C (, as in the letter c) is a general-purpose,...</td>\n",
       "      <td>C, Dennis Ritchie, reference, wikipedia, docum...</td>\n",
       "      <td>reference: http://www.c4learn.com/c-programmin...</td>\n",
       "      <td>beginnertoadvance</td>\n",
       "      <td>c  as in the letter c is a generalpurpose impe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Python</td>\n",
       "      <td>Python is a widely used high-level programming...</td>\n",
       "      <td>Python, Guido van Rossum, website, reference, ...</td>\n",
       "      <td>website: https://www.python.org/, reference: h...</td>\n",
       "      <td>beginnertoadvance</td>\n",
       "      <td>python is a widely used highlevel programming ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>SQL</td>\n",
       "      <td>SQL ( ( listen) ESS-kew-EL or  ( listen) SEE-k...</td>\n",
       "      <td>SQL, Donald D. Chamberlin and Raymond F. Boyce...</td>\n",
       "      <td>documentation: https://docs.data.world/documen...</td>\n",
       "      <td>beginnertoadvance</td>\n",
       "      <td>sql   listen esskewel or   listen seekwəl or  ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0       topic                                        discription  \\\n",
       "0           0        Java  Java is a general-purpose computer programming...   \n",
       "1           1  JavaScript  JavaScript (), often abbreviated as JS, is a h...   \n",
       "2           2           C  C (, as in the letter c) is a general-purpose,...   \n",
       "3           3      Python  Python is a widely used high-level programming...   \n",
       "4           4         SQL  SQL ( ( listen) ESS-kew-EL or  ( listen) SEE-k...   \n",
       "\n",
       "                                             keyword  \\\n",
       "0  Java, James Gosling, website, wikipedia, docum...   \n",
       "1  JavaScript, Brendan Eich, reference, wikipedia...   \n",
       "2  C, Dennis Ritchie, reference, wikipedia, docum...   \n",
       "3  Python, Guido van Rossum, website, reference, ...   \n",
       "4  SQL, Donald D. Chamberlin and Raymond F. Boyce...   \n",
       "\n",
       "                                               Links              level  \\\n",
       "0  website: https://oracle.com/java/, documentati...  beginnertoadvance   \n",
       "1  reference: https://www.w3schools.com/js/js_res...  beginnertoadvance   \n",
       "2  reference: http://www.c4learn.com/c-programmin...  beginnertoadvance   \n",
       "3  website: https://www.python.org/, reference: h...  beginnertoadvance   \n",
       "4  documentation: https://docs.data.world/documen...  beginnertoadvance   \n",
       "\n",
       "                                                 tag  \n",
       "0  java is a generalpurpose computer programming ...  \n",
       "1  javascript  often abbreviated as js is a highl...  \n",
       "2  c  as in the letter c is a generalpurpose impe...  \n",
       "3  python is a widely used highlevel programming ...  \n",
       "4  sql   listen esskewel or   listen seekwəl or  ...  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Missing keywords become empty strings.\n",
    "data3['keyword'] = data3['keyword'].fillna('')\n",
    "# Collapse the level into a single token by removing its spaces.\n",
    "data3['level'] = data3['level'].apply(lambda level: level.replace(\" \", \"\"))\n",
    "# Blank out missing tags, then lower-case them and strip punctuation.\n",
    "data3['tag'] = data3['tag'].fillna('').apply(remove_symbols)\n",
    "data3.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a5a4f1ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'java is a generalpurpose computer programming language that is concurrent classbased objectoriented and specifically designed to have as few implementation dependencies as possible it is intended to let application developers write once run anywhere wora meaning that compiled java code can run on all platforms that support java without the need for recompilation java applications are typically compiled to bytecode that can run on any java virtual machine jvm regardless of computer architecture as of 2016 java is one of the most popular programming languages in use particularly for clientserver web applications with a reported 9 million developers java was originally developed by james gosling at sun microsystems which has since been acquired by oracle corporation and released in 1995 as a core component of sun microsystems java platform the language derives much of its syntax from c and c but it has fewer lowlevel facilities than either of them the original and reference implementation java compilers virtual machines and class libraries were originally released by sun under proprietary licenses as of may 2007 in compliance with the specifications of the java community process sun relicensed most of its java technologies under the gnu general public license others have also developed alternative implementations of these sun technologies such as the gnu compiler for java bytecode compiler gnu classpath standard libraries and icedteaweb browser plugin for applets the latest version is java 9 released on september 21 2017 and is one of the two versions currently supported for free by oracle versions earlier than java 8 are supported by companies on a commercial basis eg by oracle back to java 6 as of october 2017 while they still highly recommend that you uninstall prejava 8 from at least windows computers java james gosling website wikipedia document united states beginnertoadvance'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data3['tag'][0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efb5aaba",
   "metadata": {},
   "source": [
    "# Convert the tag column into vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "86f2a927",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag-of-words counts over the tags: English stop words removed, at most 5000 features kept.\n",
    "cv = CountVectorizer(stop_words='english', max_features=5000)\n",
    "tag_term_counts = cv.fit_transform(data3['tag'])\n",
    "vector = tag_term_counts.toarray()  # dense array, one row per row of data3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b99539f9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, ..., 0, 0, 0], dtype=int64)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vector[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "6be0d7ec",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['10', '100', '1000', ..., 'λprolog', 'λx', 'μc'], dtype=object)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv.get_feature_names_out()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "019ce68a",
   "metadata": {},
   "source": [
    "# Stemming and Lemmatization Process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "be45a6b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Module-level Porter stemmer; note that preprocess_query below creates its own instance.\n",
    "ps = PorterStemmer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "3635f58c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_query(query):\n",
    "    \"\"\"Normalise a free-text query into space-separated, stemmed and lemmatised tokens.\n",
    "\n",
    "    Steps, in order: lower-case the text, drop ASCII punctuation, remove a short\n",
    "    fixed stop-word list, Porter-stem every remaining word, then run each stem\n",
    "    through the WordNet lemmatizer.\n",
    "    \"\"\"\n",
    "    import string\n",
    "\n",
    "    # Delete every ASCII punctuation character in a single pass.\n",
    "    punctuation_free = query.lower().translate(str.maketrans('', '', string.punctuation))\n",
    "\n",
    "    # Small hand-picked stop-word list (replace with a fuller one if needed).\n",
    "    stop_words = (\"the\", \"a\", \"is\", \"in\", \"of\")\n",
    "    tokens = [word for word in punctuation_free.split() if word not in stop_words]\n",
    "\n",
    "    stemmer = PorterStemmer()\n",
    "    tokens = [stemmer.stem(word) for word in tokens]\n",
    "\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    tokens = [lemmatizer.lemmatize(word) for word in tokens]\n",
    "\n",
    "    return ' '.join(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "2787d4d3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'talk'"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocess_query('talked')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "6b8326d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'java jame gosl websit wikipedia document unit state beginnertoadv'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocess_query('java james gosling website wikipedia document united states beginnertoadvance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "02ff3f52",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       java is a generalpurpos comput program languag...\n",
       "1       javascript often abbrevi as js is a highlevel ...\n",
       "2       c as in the letter c is a generalpurpos imper ...\n",
       "3       python is a wide use highlevel program languag...\n",
       "4       sql listen esskewel or listen seekwəl or skwee...\n",
       "                              ...                        \n",
       "3715    understandingtheprofessionaldataengineercertif...\n",
       "3716    atourofgooglecloudhandsonlab machinelearningen...\n",
       "3717    introductiontoaiandmachinelearningongoogleclou...\n",
       "3718    introductiontoaiandmachinelearningongoogleclou...\n",
       "3719    aifound machinelearningengineerlearningpathweb...\n",
       "Name: tag, Length: 3720, dtype: object"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def stem(text):\n",
    "    \"\"\"Return `text` with every whitespace-separated word reduced to its Porter stem.\"\"\"\n",
    "    return \" \".join(ps.stem(word) for word in text.split())\n",
    "\n",
    "\n",
    "# Preview only: the stemmed tags are displayed, not written back to data3.\n",
    "data3['tag'].apply(stem)   # apply on tag columns "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "66adf3fd",
   "metadata": {},
   "source": [
    "# Compute similarity scores to find the most related topics in the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "33126518",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise cosine similarity between all rows of `vector`: similar[i][j] scores row i against row j.\n",
    "similar = cosine_similarity(vector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "e1f7379a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(1, 0.9999999999999998),\n",
       " (40, 0.4543441112511213),\n",
       " (350, 0.445852828483904),\n",
       " (134, 0.4049985302736412),\n",
       " (1485, 0.3754717312648463)]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Five highest-scoring (row index, similarity) pairs for row 1.\n",
    "sorted(enumerate(similar[1]), key=lambda pair: pair[1], reverse=True)[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "084d898b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hugging Face pipelines used by summarize_and_generate further down.\n",
    "# NOTE(review): facebook/bart-base looks like the base checkpoint rather than a summarization-tuned\n",
    "# one; the recorded query summary mostly echoes the input - confirm this is the intended model.\n",
    "summarizer = pipeline(\"summarization\", model=\"facebook/bart-base\")\n",
    "text_generator = pipeline(\"text-generation\", model=\"gpt2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "0197db1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One text per row of data3: the preprocessed topic followed by the preprocessed keywords (TF-IDF input).\n",
    "documents = [\n",
    "    f\"{preprocess_query(topic)} {preprocess_query(keyword)}\"\n",
    "    for topic, keyword in zip(data3[\"topic\"], data3[\"keyword\"])\n",
    "]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "d80d5e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the TF-IDF vectorizer with default settings; recommend_from_dataset reuses it to encode queries\n",
    "vectorizer = TfidfVectorizer()\n",
    "\n",
    "# Learn the vocabulary from `documents` and encode them; row i corresponds to documents[i]\n",
    "document_vectors = vectorizer.fit_transform(documents)\n",
    "\n",
    "def recommend_from_dataset(query, top_n=5, min_score=0.3):\n",
    "    \"\"\"Return the catalogue topics most similar to a free-text query.\n",
    "\n",
    "    The query is cleaned with preprocess_query, encoded with the fitted TF-IDF\n",
    "    `vectorizer`, and compared with `document_vectors` by cosine similarity.\n",
    "\n",
    "    Args:\n",
    "        query: User query text.\n",
    "        top_n: Maximum number of recommendations to return (default 5).\n",
    "        min_score: Minimum cosine similarity a topic needs to be returned (default 0.3).\n",
    "\n",
    "    Returns:\n",
    "        A list of dicts with keys \"topic_name\", \"link\" and \"score\", ordered from\n",
    "        highest to lowest score. The list is empty when nothing reaches min_score.\n",
    "    \"\"\"\n",
    "    cleaned_query = preprocess_query(query)\n",
    "    query_vector = vectorizer.transform([cleaned_query])\n",
    "\n",
    "    # One similarity score per document, in the same order as the rows of data3\n",
    "    similarity_scores = cosine_similarity(query_vector, document_vectors).flatten()\n",
    "\n",
    "    # Highest score first; equal scores are ordered by the larger index label (tuple comparison)\n",
    "    ranked = sorted(zip(similarity_scores, data3.index), reverse=True)\n",
    "\n",
    "    has_links = \"Links\" in data3.columns  # loop-invariant, so checked once\n",
    "    recommendations = []\n",
    "    for score, document_id in ranked[:top_n]:\n",
    "        if score < min_score:\n",
    "            break  # scores only decrease from here on\n",
    "        link = data3.loc[document_id, \"Links\"] if has_links else \"No link available\"\n",
    "        topic_name = data3.loc[document_id, \"topic\"]\n",
    "        recommendations.append({\"topic_name\": topic_name, \"link\": link, \"score\": score})\n",
    "    return recommendations\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "e56ccfc2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fine_tune_model(model_name, train_dataset, validation_dataset, epochs=3, output_dir=\"./results\"):\n",
    "    \"\"\"Fine-tune a pretrained sequence-to-sequence model with the Hugging Face Trainer.\n",
    "\n",
    "    Args:\n",
    "        model_name: Name or path of the checkpoint passed to from_pretrained.\n",
    "        train_dataset: Dataset handed to Trainer as train_dataset.\n",
    "        validation_dataset: Dataset handed to Trainer as eval_dataset.\n",
    "        epochs: Number of training epochs (default 3).\n",
    "        output_dir: Directory given to TrainingArguments for its outputs (default \"./results\").\n",
    "\n",
    "    Returns:\n",
    "        The fine-tuned model. The tokenizer is not returned; reload it with\n",
    "        AutoTokenizer.from_pretrained(model_name) when one is needed.\n",
    "    \"\"\"\n",
    "    # NOTE(review): the datasets are presumably already tokenized for the model - confirm with the caller.\n",
    "    # Load model and tokenizer (AutoTokenizer must be imported with the other transformers names)\n",
    "    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "    # Define training arguments (adjust parameters as needed)\n",
    "    training_args = TrainingArguments(\n",
    "        output_dir=output_dir,\n",
    "        per_device_train_batch_size=8,\n",
    "        per_device_eval_batch_size=8,\n",
    "        num_train_epochs=epochs,\n",
    "        save_steps=10_000,\n",
    "    )\n",
    "\n",
    "    # Create a Trainer instance for fine-tuning\n",
    "    trainer = Trainer(\n",
    "        model=model,\n",
    "        args=training_args,\n",
    "        train_dataset=train_dataset,\n",
    "        eval_dataset=validation_dataset,\n",
    "        tokenizer=tokenizer,\n",
    "    )\n",
    "\n",
    "    # Train the model\n",
    "    trainer.train()\n",
    "\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "9c1c02c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fine-tuning is optional: it only runs once both datasets below have been prepared.\n",
    "train_dataset = None  # TODO: prepare your training dataset\n",
    "validation_dataset = None  # TODO: prepare your validation dataset\n",
    "\n",
    "if train_dataset is not None and validation_dataset is not None:\n",
    "    # Fine-tune the model (replace model name if needed)\n",
    "    base_model_name = \"facebook/bart-base\"\n",
    "    fine_tuned_model = fine_tune_model(base_model_name, train_dataset, validation_dataset)\n",
    "\n",
    "    # fine_tune_model returns only the model, so the matching tokenizer is loaded separately.\n",
    "    # BART is a sequence-to-sequence model, hence the \"summarization\" task.\n",
    "    summarizer1 = pipeline(\n",
    "        \"summarization\",\n",
    "        model=fine_tuned_model,\n",
    "        tokenizer=AutoTokenizer.from_pretrained(base_model_name),\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "49baeaf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarize_and_generate(user_query, recommendations):\n",
    "    \"\"\"Build the response for a query: a summary, generated text and the recommended links.\n",
    "\n",
    "    Uses the notebook-level `summarizer` and `text_generator` pipelines. Returns a dict\n",
    "    with keys \"query_summary\", \"generated_text\" and \"related_links\"; every related link\n",
    "    carries the \"topic\", \"link\" and \"score\" taken from the matching recommendation.\n",
    "    \"\"\"\n",
    "    # Summary of the query text itself\n",
    "    summary_output = summarizer(user_query, max_length=100, truncation=True)\n",
    "    query_summary = summary_output[0][\"summary_text\"]\n",
    "\n",
    "    # Free-form continuation of a fixed prompt built around the query\n",
    "    prompt = f\"Exploring the concept of {user_query}\"\n",
    "    generation_output = text_generator(prompt, max_length=100, num_return_sequences=1)\n",
    "    generated_text = generation_output[0][\"generated_text\"]\n",
    "\n",
    "    # Re-key each recommendation for presentation\n",
    "    related_links = [\n",
    "        {\"topic\": item[\"topic_name\"], \"link\": item[\"link\"], \"score\": item[\"score\"]}\n",
    "        for item in recommendations\n",
    "    ]\n",
    "\n",
    "    return {\n",
    "        \"query_summary\": query_summary.strip(),\n",
    "        \"generated_text\": generated_text.strip(),\n",
    "        \"related_links\": related_links,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "fb9e58cc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Your max_length is set to 100, but you input_length is only 9. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)\n",
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query Summary: java by james goslinjames groslin\n",
      "Creative Text: Exploring the concept of java by james goslin is an impressive effort at the best of times and I'm very impressed by how well this was done. The code looks quite simple for simple purposes — there are only two basic methods, call() and destroy(). These two methods are used by most of the java libraries, so any Java that relies on call() or destroy() should use a proper method of your choice as well. Also, the code uses a single method, so that\n",
      "Some Related Links for your query:\n",
      "- Java:\n",
      " website: https://oracle.com/java/, documentation: https://docs.oracle.com/en/java/, wikipedia: https://en.wikipedia.org/wiki/Java_(programming_language) : \n",
      " Score: 0.625462748622542\n",
      "- Java Properties:\n",
      " wikipedia: https://en.wikipedia.org/wiki/.properties : \n",
      " Score: 0.3952596829701199\n",
      "- Java Bytecode:\n",
      " documentation: https://docs.oracle.com/javase/specs/jvms/se7/html/, wikipedia: https://en.wikipedia.org/wiki/Java_bytecode : \n",
      " Score: 0.38255306128391625\n",
      "- Query by Example:\n",
      " reference: https://semanticscholar.org/paper/f320e453ae65ddf0a3789f4383fa164481c7a8b3, wikipedia: https://en.wikipedia.org/wiki/Query_by_Example : \n",
      " Score: 0.3726562653850712\n",
      "- Join Java:\n",
      " wikipedia: https://en.wikipedia.org/wiki/Join_Java : \n",
      " Score: 0.3143513411797295\n"
     ]
    }
   ],
   "source": [
    "user_query = \"java by james goslin\"\n",
    "recommendations = recommend_from_dataset(user_query)\n",
    "\n",
    "# Summary, generated text and related links for the query\n",
    "results = summarize_and_generate(user_query, recommendations)\n",
    "\n",
    "print(f\"Query Summary: {results['query_summary']}\")\n",
    "print(f\"Creative Text: {results['generated_text']}\")\n",
    "print(\"Some Related Links for your query:\")\n",
    "for entry in results[\"related_links\"]:\n",
    "    topic, url, score = entry[\"topic\"], entry[\"link\"], entry[\"score\"]\n",
    "    print(f\"- {topic}:\\n {url} : \\n Score: {score}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46535752",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}