In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the per-movie table (later cells read its 'movie_id', 'plot_synopsis'
# and 'plot_summary' columns). lines=True makes pandas parse the file as
# newline-delimited JSON, i.e. one record per line.
movie_details = pd.read_json('/kaggle/input/movie-details/IMDB_movie_details.json', lines=True)

In [None]:
# Load the review table (later cells read its 'movie_id' and 'review_text'
# columns). No lines=True here, so pandas parses the file as one JSON document
# rather than JSON Lines -- presumably how this file was saved; TODO confirm.
reviews = pd.read_json('/kaggle/input/bad-words-flag/better_reviews.json')

In [None]:
# Preview the first rows of both raw tables, movie details first.
for frame in (movie_details, reviews):
    print(frame.head())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Keep only the rows that carry the text fields used downstream, then attach
# each review to its movie's details through the shared 'movie_id' key.
movie_details = movie_details.dropna(subset=['plot_synopsis', 'plot_summary'])
reviews = reviews.dropna(subset=['review_text'])
data = reviews.merge(movie_details, on='movie_id')

# Optional: uncomment to work on a smaller sample while iterating.
# data = data.head(10000)
data.head()

In [None]:
def split_synopsis(text):
    """Split a synopsis into three consecutive chunks of whitespace-separated words.

    Args:
        text: The synopsis as a single string.

    Returns:
        A tuple ``(start, middle, end)`` of word lists. ``start`` and ``middle``
        each hold ``len(words) // 3`` words; ``end`` holds everything that is
        left, so it also absorbs the remainder when the word count is not a
        multiple of three (and the whole text when it has fewer than 3 words).
    """
    # Tokenise once and slice the same list instead of re-splitting per chunk.
    words = text.split()
    parts = len(words) // 3
    return words[:parts], words[parts:2 * parts], words[2 * parts:]

def calculate_proximity(review, synopsis):
    """Return the TF-IDF cosine similarity between a review and the synopsis ending.

    The "ending" is the last of the three chunks produced by ``split_synopsis``.
    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the score
    depends only on this review/synopsis pair.
    """
    ending_words = split_synopsis(synopsis)[2]
    ending_text = ' '.join(ending_words)
    tfidf = TfidfVectorizer().fit_transform([review, ending_text])
    # Pairwise 2x2 similarity matrix; entry [0, 1] is review vs. ending.
    similarity_matrix = cosine_similarity(tfidf)
    return similarity_matrix[0, 1]

In [None]:
from nltk.tokenize import sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
import nltk

# Data needed by sent_tokenize and SentimentIntensityAnalyzer.
# Recent NLTK releases load the sentence tokenizer from 'punkt_tab' instead of
# the pickled 'punkt' package, so request both to keep this cell working on a
# fresh kernel across NLTK versions. With the default raise_on_error=False a
# resource the installed NLTK does not know is reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

# Number of rows scored so far; calculate_sentiment_proximity increments it to
# print a progress line every 5000 rows.
counter = 0

# Lazily created VADER analyzer shared by every call. Building a new analyzer
# reloads the lexicon, which is wasteful when done once per scored row.
_sentiment_analyzer = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer


def calculate_sentiment_proximity(review, synopsis):
    """Score a review against a synopsis using TF-IDF similarity and sentiment contrast.

    Both texts are split into sentences. A TF-IDF vocabulary is fitted on the
    synopsis sentences only. For every (review sentence, synopsis sentence)
    pair the cosine similarity is multiplied by the absolute difference of the
    two VADER compound scores, so pairs with identical sentiment contribute 0.
    Each review sentence keeps its best weighted match across ALL synopsis
    sentences (sentence position is not taken into account), and the result is
    the mean of those per-sentence maxima.

    Side effect: increments the module-level ``counter`` and prints a progress
    line every 5000 calls.

    Args:
        review: Review text.
        synopsis: Plot synopsis text.

    Returns:
        The mean weighted similarity as a float. ``0.0`` is returned when either
        text has no sentences or when the synopsis yields no usable tokens.
    """
    global counter
    counter += 1
    if counter % 5000 == 0:
        print(counter, "Records Ended!")

    review_sentences = sent_tokenize(review)
    synopsis_sentences = sent_tokenize(synopsis)

    # Nothing to compare. Guarding the review side too avoids np.mean([]),
    # which would return NaN and emit a RuntimeWarning.
    if len(review_sentences) == 0 or len(synopsis_sentences) == 0:
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        synopsis_vectors = vectorizer.fit_transform(synopsis_sentences)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no synopsis
        # sentence yields a token, e.g. punctuation-only text. Treat it like an
        # empty synopsis instead of aborting the whole DataFrame.apply.
        return 0.0

    sentiment_analyzer = _get_sentiment_analyzer()
    synopsis_sentiments = np.array(
        [sentiment_analyzer.polarity_scores(sentence)['compound'] for sentence in synopsis_sentences]
    )
    review_sentiments = np.array(
        [sentiment_analyzer.polarity_scores(sentence)['compound'] for sentence in review_sentences]
    )

    # Rows are review sentences, columns are synopsis sentences.
    similarities = cosine_similarity(vectorizer.transform(review_sentences), synopsis_vectors)

    # Weight every similarity by how far apart the two sentences' sentiments are.
    sentiment_weights = np.abs(review_sentiments[:, np.newaxis] - synopsis_sentiments[np.newaxis, :])
    weighted_similarities = similarities * sentiment_weights

    # Best weighted match per review sentence, averaged over the review.
    return float(weighted_similarities.max(axis=1).mean())

# Score every merged row: each review is compared against its movie's synopsis
# and the result is stored in the 'end_proximity' column.
data['end_proximity'] = data[['review_text', 'plot_synopsis']].apply(
    lambda row: calculate_sentiment_proximity(row['review_text'], row['plot_synopsis']),
    axis=1,
)

In [None]:
# Rescale 'end_proximity' with a MinMaxScaler (default feature range) fitted on
# the column itself; the fitted scaler stays available as `scaler`.
scaler = MinMaxScaler().fit(data[['end_proximity']])
data['end_proximity'] = scaler.transform(data[['end_proximity']])

In [None]:
# Persist the enriched dataset as JSON Lines (one record per line).
# /kaggle/input is read-only, so the file is written to /kaggle/working/,
# which is the directory preserved as notebook output.
data.to_json('/kaggle/working/final_dataset2.json', orient='records', lines=True)