# Setup - Run only once per Kernel App
# Install a JDK via conda; Spark needs Java available on the driver JVM.
%conda install openjdk -y
# install PySpark
%pip install pyspark==3.4.0
# install spark-nlp
%pip install spark-nlp==5.1.3
# restart kernel
# NOTE(review): injects JS that restarts the classic Jupyter notebook kernel so
# the freshly installed packages are picked up; has no effect in JupyterLab.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
Collecting package metadata (current_repodata.json): done Solving environment: done ==> WARNING: A newer version of conda exists. <== current version: 23.3.1 latest version: 23.10.0 Please update conda by running $ conda update -n base -c defaults conda Or to minimize the number of packages updated during conda update use conda install conda=23.10.0 ## Package Plan ## environment location: /opt/conda added / updated specs: - openjdk The following packages will be downloaded: package | build ---------------------------|----------------- ca-certificates-2023.08.22 | h06a4308_0 123 KB certifi-2023.11.17 | py310h06a4308_0 158 KB openjdk-11.0.13 | h87a67e3_0 341.0 MB ------------------------------------------------------------ Total: 341.3 MB The following NEW packages will be INSTALLED: openjdk pkgs/main/linux-64::openjdk-11.0.13-h87a67e3_0 The following packages will be UPDATED: ca-certificates conda-forge::ca-certificates-2023.7.2~ --> pkgs/main::ca-certificates-2023.08.22-h06a4308_0 certifi conda-forge/noarch::certifi-2023.7.22~ --> pkgs/main/linux-64::certifi-2023.11.17-py310h06a4308_0 Downloading and Extracting Packages openjdk-11.0.13 | 341.0 MB | | 0% ca-certificates-2023 | 123 KB | | 0% certifi-2023.11.17 | 158 KB | | 0% ca-certificates-2023 | 123 KB | ##################################### | 100% Preparing transaction: done Verifying transaction: done Executing transaction: done Note: you may need to restart the kernel to use updated packages. Collecting pyspark==3.4.0 Using cached pyspark-3.4.0-py2.py3-none-any.whl Collecting py4j==0.10.9.7 (from pyspark==3.4.0) Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB) Installing collected packages: py4j, pyspark Successfully installed py4j-0.10.9.7 pyspark-3.4.0 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv [notice] A new release of pip is available: 23.2.1 -> 23.3.1 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages. Collecting spark-nlp==5.1.3 Obtaining dependency information for spark-nlp==5.1.3 from https://files.pythonhosted.org/packages/cd/7d/bc0eca4c9ec4c9c1d9b28c42c2f07942af70980a7d912d0aceebf8db32dd/spark_nlp-5.1.3-py2.py3-none-any.whl.metadata Using cached spark_nlp-5.1.3-py2.py3-none-any.whl.metadata (53 kB) Using cached spark_nlp-5.1.3-py2.py3-none-any.whl (537 kB) Installing collected packages: spark-nlp Successfully installed spark-nlp-5.1.3 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv [notice] A new release of pip is available: 23.2.1 -> 23.3.1 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
# Fetch the WordNet and Open Multilingual WordNet (omw-1.4) corpora needed by
# WordNetLemmatizer. Safe to re-run: nltk skips data that is already present.
nltk.download('wordnet')
nltk.download('omw-1.4')
[nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package omw-1.4 to /root/nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
True
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
import pyspark.sql.functions as F
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Build a local Spark session with the Spark NLP and hadoop-aws jars on the
# classpath so s3a:// paths can be read/written.
#
# FIX: Hadoop/S3A options set through SparkSession.builder.config() must be
# prefixed with "spark.hadoop." — without the prefix Spark discards the key
# (the run log showed "Warning: Ignoring non-Spark config property:
# fs.s3a.aws.credentials.provider"), and the S3A connector silently falls
# back to its default credential chain.
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory", "16G")\
    .config("spark.executor.memory", "12g")\
    .config("spark.executor.cores", "3")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")\
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider"
    )\
    .getOrCreate()
print(spark.version)
Warning: Ignoring non-Spark config property: fs.s3a.aws.credentials.provider
:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2/cache The jars for the packages stored in: /root/.ivy2/jars com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency org.apache.hadoop#hadoop-aws added as a dependency :: resolving dependencies :: org.apache.spark#spark-submit-parent-4fcb9748-541d-4d17-a6d5-5e71873aa972;1.0 confs: [default] found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central found com.typesafe#config;1.4.2 in central found org.rocksdb#rocksdbjni;6.29.5 in central found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central found com.github.universal-automata#liblevenshtein;3.0.0 in central found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central found com.google.code.gson#gson;2.3 in central found it.unimi.dsi#fastutil;7.0.12 in central found org.projectlombok#lombok;1.16.8 in central found com.google.cloud#google-cloud-storage;2.20.1 in central found com.google.guava#guava;31.1-jre in central found com.google.guava#failureaccess;1.0.1 in central found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central found com.google.errorprone#error_prone_annotations;2.18.0 in central found com.google.j2objc#j2objc-annotations;1.3 in central found com.google.http-client#google-http-client;1.43.0 in central found io.opencensus#opencensus-contrib-http-util;0.31.1 in central found com.google.http-client#google-http-client-jackson2;1.43.0 in central found com.google.http-client#google-http-client-gson;1.43.0 in central found com.google.api-client#google-api-client;2.2.0 in central found commons-codec#commons-codec;1.15 in central found com.google.oauth-client#google-oauth-client;1.34.1 in central found com.google.http-client#google-http-client-apache-v2;1.43.0 in central found com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central found com.google.code.gson#gson;2.10.1 in central found com.google.cloud#google-cloud-core;2.12.0 in central 
found io.grpc#grpc-context;1.53.0 in central found com.google.auto.value#auto-value-annotations;1.10.1 in central found com.google.auto.value#auto-value;1.10.1 in central found javax.annotation#javax.annotation-api;1.3.2 in central found commons-logging#commons-logging;1.2 in central found com.google.cloud#google-cloud-core-http;2.12.0 in central found com.google.http-client#google-http-client-appengine;1.43.0 in central found com.google.api#gax-httpjson;0.108.2 in central found com.google.cloud#google-cloud-core-grpc;2.12.0 in central found io.grpc#grpc-alts;1.53.0 in central found io.grpc#grpc-grpclb;1.53.0 in central found org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central found io.grpc#grpc-auth;1.53.0 in central found io.grpc#grpc-protobuf;1.53.0 in central found io.grpc#grpc-protobuf-lite;1.53.0 in central found io.grpc#grpc-core;1.53.0 in central found com.google.api#gax;2.23.2 in central found com.google.api#gax-grpc;2.23.2 in central found com.google.auth#google-auth-library-credentials;1.16.0 in central found com.google.auth#google-auth-library-oauth2-http;1.16.0 in central found com.google.api#api-common;2.6.2 in central found io.opencensus#opencensus-api;0.31.1 in central found com.google.api.grpc#proto-google-iam-v1;1.9.2 in central found com.google.protobuf#protobuf-java;3.21.12 in central found com.google.protobuf#protobuf-java-util;3.21.12 in central found com.google.api.grpc#proto-google-common-protos;2.14.2 in central found org.threeten#threetenbp;1.6.5 in central found com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central found com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central found com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central found com.fasterxml.jackson.core#jackson-core;2.14.2 in central found com.google.code.findbugs#jsr305;3.0.2 in central found io.grpc#grpc-api;1.53.0 in central found io.grpc#grpc-stub;1.53.0 in central found org.checkerframework#checker-qual;3.31.0 
in central found io.perfmark#perfmark-api;0.26.0 in central found com.google.android#annotations;4.1.1.4 in central found org.codehaus.mojo#animal-sniffer-annotations;1.22 in central found io.opencensus#opencensus-proto;0.2.0 in central found io.grpc#grpc-services;1.53.0 in central found com.google.re2j#re2j;1.6 in central found io.grpc#grpc-netty-shaded;1.53.0 in central found io.grpc#grpc-googleapis;1.53.0 in central found io.grpc#grpc-xds;1.53.0 in central found com.navigamez#greex;1.0 in central found dk.brics.automaton#automaton;1.11-8 in central found com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central found com.microsoft.onnxruntime#onnxruntime;1.15.0 in central found org.apache.hadoop#hadoop-aws;3.2.2 in central :: resolution report :: resolve 3713ms :: artifacts dl 391ms :: modules in use: com.amazonaws#aws-java-sdk-bundle;1.11.828 from central in [default] com.fasterxml.jackson.core#jackson-core;2.14.2 from central in [default] com.github.universal-automata#liblevenshtein;3.0.0 from central in [default] com.google.android#annotations;4.1.1.4 from central in [default] com.google.api#api-common;2.6.2 from central in [default] com.google.api#gax;2.23.2 from central in [default] com.google.api#gax-grpc;2.23.2 from central in [default] com.google.api#gax-httpjson;0.108.2 from central in [default] com.google.api-client#google-api-client;2.2.0 from central in [default] com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default] com.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default] com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default] com.google.auth#google-auth-library-credentials;1.16.0 from central in [default] 
com.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default] com.google.auto.value#auto-value;1.10.1 from central in [default] com.google.auto.value#auto-value-annotations;1.10.1 from central in [default] com.google.cloud#google-cloud-core;2.12.0 from central in [default] com.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default] com.google.cloud#google-cloud-core-http;2.12.0 from central in [default] com.google.cloud#google-cloud-storage;2.20.1 from central in [default] com.google.code.findbugs#jsr305;3.0.2 from central in [default] com.google.code.gson#gson;2.10.1 from central in [default] com.google.errorprone#error_prone_annotations;2.18.0 from central in [default] com.google.guava#failureaccess;1.0.1 from central in [default] com.google.guava#guava;31.1-jre from central in [default] com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default] com.google.http-client#google-http-client;1.43.0 from central in [default] com.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default] com.google.http-client#google-http-client-appengine;1.43.0 from central in [default] com.google.http-client#google-http-client-gson;1.43.0 from central in [default] com.google.http-client#google-http-client-jackson2;1.43.0 from central in [default] com.google.j2objc#j2objc-annotations;1.3 from central in [default] com.google.oauth-client#google-oauth-client;1.34.1 from central in [default] com.google.protobuf#protobuf-java;3.21.12 from central in [default] com.google.protobuf#protobuf-java-util;3.21.12 from central in [default] com.google.re2j#re2j;1.6 from central in [default] com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 from central in [default] com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default] com.microsoft.onnxruntime#onnxruntime;1.15.0 from central in [default] com.navigamez#greex;1.0 from central in [default] com.typesafe#config;1.4.2 from central in [default] 
commons-codec#commons-codec;1.15 from central in [default] commons-logging#commons-logging;1.2 from central in [default] dk.brics.automaton#automaton;1.11-8 from central in [default] io.grpc#grpc-alts;1.53.0 from central in [default] io.grpc#grpc-api;1.53.0 from central in [default] io.grpc#grpc-auth;1.53.0 from central in [default] io.grpc#grpc-context;1.53.0 from central in [default] io.grpc#grpc-core;1.53.0 from central in [default] io.grpc#grpc-googleapis;1.53.0 from central in [default] io.grpc#grpc-grpclb;1.53.0 from central in [default] io.grpc#grpc-netty-shaded;1.53.0 from central in [default] io.grpc#grpc-protobuf;1.53.0 from central in [default] io.grpc#grpc-protobuf-lite;1.53.0 from central in [default] io.grpc#grpc-services;1.53.0 from central in [default] io.grpc#grpc-stub;1.53.0 from central in [default] io.grpc#grpc-xds;1.53.0 from central in [default] io.opencensus#opencensus-api;0.31.1 from central in [default] io.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default] io.opencensus#opencensus-proto;0.2.0 from central in [default] io.perfmark#perfmark-api;0.26.0 from central in [default] it.unimi.dsi#fastutil;7.0.12 from central in [default] javax.annotation#javax.annotation-api;1.3.2 from central in [default] org.apache.hadoop#hadoop-aws;3.2.2 from central in [default] org.checkerframework#checker-qual;3.31.0 from central in [default] org.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default] org.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default] org.projectlombok#lombok;1.16.8 from central in [default] org.rocksdb#rocksdbjni;6.29.5 from central in [default] org.threeten#threetenbp;1.6.5 from central in [default] :: evicted modules: com.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default] com.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] in [default] com.google.code.gson#gson;2.3 by 
[com.google.code.gson#gson;2.10.1] in [default] com.amazonaws#aws-java-sdk-bundle;1.11.563 by [com.amazonaws#aws-java-sdk-bundle;1.11.828] in [default] --------------------------------------------------------------------- | | modules || artifacts | | conf | number| search|dwnlded|evicted|| number|dwnlded| --------------------------------------------------------------------- | default | 77 | 0 | 0 | 4 || 73 | 0 | --------------------------------------------------------------------- :: retrieving :: org.apache.spark#spark-submit-parent-4fcb9748-541d-4d17-a6d5-5e71873aa972 confs: [default] 0 artifacts copied, 73 already retrieved (0kB/54ms) 23/11/23 19:21:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
3.4.0
# Load only the columns we actually use; `usecols` avoids materialising the
# full (wide) CSVs in memory before the projection below.
reviews_df = pd.read_csv("../../data/csv/rotten_tomatoes_movie_reviews.csv",
                         usecols=['id', 'reviewText'])
movie_df = pd.read_csv("../../data/csv/rotten_tomatoes_movies.csv",
                       usecols=['id', 'title'])
# Re-select to pin the column order regardless of the order in the files.
movie_df = movie_df[['id', 'title']]
reviews_df = reviews_df[['id', 'reviewText']]
# left outer join on id: keep every movie, attach its reviews (NaN if none)
merged_df = pd.merge(movie_df, reviews_df, on='id', how='left')
# Displayed null counts per column (notebook cell output); then drop rows
# with missing values — i.e. movies that have no review text.
merged_df.isnull().sum()
merged_df = merged_df.dropna()
def clean_text(text):
    """Normalise a review string for NLP: lowercase it, then strip
    punctuation and digits (whitespace is left untouched)."""
    lowered = text.lower()
    # Drop every character that is neither word-like nor whitespace.
    without_punct = re.sub(r'[^\w\s]', '', lowered)
    # Drop digit runs (underscores and letters survive, matching \w).
    return re.sub(r'\d+', '', without_punct)
# Clean the 'reviewText' column
merged_df['cleanedText'] = merged_df['reviewText'].apply(clean_text)
df = spark.createDataFrame(merged_df)
df = df.cache()
df.rdd.getNumPartitions()
4
# Repartition to 40 to increase parallelism for the NLP pipeline stages.
df = df.repartition(40)
# Confirm the new partition count (cell output: 40).
df.rdd.getNumPartitions()
23/11/23 19:23:42 WARN TaskSetManager: Stage 0 contains a task of very large size (94106 KiB). The maximum recommended task size is 1000 KiB. [Stage 0:> (0 + 4) / 4]
40
# Preview the first 20 rows of the merged + cleaned data.
df.show()
23/11/23 19:23:56 WARN TaskSetManager: Stage 1 contains a task of very large size (94106 KiB). The maximum recommended task size is 1000 KiB. [Stage 1:> (0 + 4) / 4]
+--------------------+--------------------+--------------------+--------------------+ | id| title| reviewText| cleanedText| +--------------------+--------------------+--------------------+--------------------+ | the_escape_2018| The Escape|Sturdy melodrama,...|sturdy melodrama ...| | the_old_guard| The Old Guard|Despite an appeal...|despite an appeal...| | anniversary_party|The Anniversary P...|Everyone gets the...|everyone gets the...| |the_secret_life_o...|The Secret Life o...|The Secret Life o...|the secret life o...| | the_mummy_2017| The Mummy|Some movies feel ...|some movies feel ...| | live_by_night| Live by Night|It's really easy ...|its really easy t...| | neruda_2016| Neruda|One of the most s...|one of the most s...| |the_snowtown_murders|The Snowtown Murders|It's the escalati...|its the escalatin...| | gods_country_2022| God's Country|Higgins and Newto...|higgins and newto...| | mogul_mowgli| Mogul Mowgli|Tariq's embrace o...|tariqs embrace of...| | cats_of_mirikitani|The Cats of Mirik...|It should not hav...|it should not hav...| | sharkwater| Sharkwater|How strange to ga...|how strange to ga...| | he_said_she_said| He Said, She Said|Two likable leads...|two likable leads...| | monsters_ball| Monster's Ball|Please read revie...|please read revie...| | summer_in_february| Summer in February|Blah Britpic offe...|blah britpic offe...| |the_best_exotic_m...|The Best Exotic M...|[Madden directs] ...|madden directs a ...| | millions| Millions|Rich, rewarding, ...|rich rewarding an...| | tully_2018| Tully|An emotionally de...|an emotionally de...| | cop_car| Cop Car|Despite the real,...|despite the real ...| |the_incredible_je...|The Incredible Je...|A comedy centered...|a comedy centered...| +--------------------+--------------------+--------------------+--------------------+ only showing top 20 rows
# Spark NLP entry point: wraps the cleaned text column into 'document'
# annotations that downstream annotators consume.
documentAssembler = DocumentAssembler()\
.setInputCol("cleanedText")\
.setOutputCol("document")
# Regex Tokenizer to break words
tokenizer = Tokenizer() \
.setInputCols(['document']) \
.setOutputCol('token')
# Normalizing and setting case insensitive to be true
normalizer = Normalizer() \
.setInputCols(['token']) \
.setOutputCol('normalized') \
.setLowercase(True)
# Lemmatizing
# LemmatizerModel.pretrained() downloads the default English model
# (run log: lemma_antbnc, ~907.6 KB) on first use.
lemmatizer = LemmatizerModel.pretrained() \
.setInputCols(['normalized']) \
.setOutputCol('lemma')
# finisher converts tokens to human-readable output
# setCleanAnnotations(False) keeps the intermediate annotation columns
# (document/token/normalized/lemma) alongside the finished output.
finisher = Finisher() \
.setInputCols(['lemma']) \
.setCleanAnnotations(False)
lemma_antbnc download started this may take some time. Approximate size to download 907.6 KB [ | ]lemma_antbnc download started this may take some time. Approximate size to download 907.6 KB Download done! Loading the resource. [OK!]
# Chain the stages and run them over the DataFrame. fit() trains/loads the
# pipeline; transform() adds the annotation columns plus 'finished_lemma'.
# NOTE(review): Pipeline is not imported explicitly in this notebook — it
# presumably arrives via one of the wildcard sparknlp imports; an explicit
# `from pyspark.ml import Pipeline` would be clearer. Confirm.
pipeline = Pipeline() \
.setStages([
documentAssembler,
tokenizer,
normalizer,
lemmatizer,
finisher
])
pipelineModel = pipeline.fit(df)
result = pipelineModel.transform(df)
WARNING: An illegal reflective access operation has occurred WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/spark-core_2.12-3.4.0.jar) to field java.util.regex.Pattern.pattern WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$ WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations WARNING: All illegal access operations will be denied in a future release
# Join the array of finished lemmas back into one space-separated string.
result = result.withColumn("final_text", F.concat_ws(" ", "finished_lemma"))
# Preview the transformed DataFrame (first 20 rows).
result.show()
23/11/23 19:25:12 WARN TaskSetManager: Stage 6 contains a task of very large size (94106 KiB). The maximum recommended task size is 1000 KiB. [Stage 6:> (0 + 4) / 4]
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | id| title| reviewText| cleanedText| document| token| normalized| lemma| finished_lemma| final_text| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | the_escape_2018| The Escape|Sturdy melodrama,...|sturdy melodrama ...|[{document, 0, 10...|[{token, 0, 5, st...|[{token, 0, 5, st...|[{token, 0, 5, st...|[sturdy, melodram...|sturdy melodrama ...| | the_old_guard| The Old Guard|Despite an appeal...|despite an appeal...|[{document, 0, 18...|[{token, 0, 6, de...|[{token, 0, 6, de...|[{token, 0, 6, de...|[despite, an, app...|despite an appeal...| | anniversary_party|The Anniversary P...|Everyone gets the...|everyone gets the...|[{document, 0, 13...|[{token, 0, 7, ev...|[{token, 0, 7, ev...|[{token, 0, 7, ev...|[everyone, get, t...|everyone get they...| |the_secret_life_o...|The Secret Life o...|The Secret Life o...|the secret life o...|[{document, 0, 18...|[{token, 0, 2, th...|[{token, 0, 2, th...|[{token, 0, 2, th...|[the, secret, lif...|the secret life o...| | the_mummy_2017| The Mummy|Some movies feel ...|some movies feel ...|[{document, 0, 15...|[{token, 0, 3, so...|[{token, 0, 3, so...|[{token, 0, 3, so...|[some, movie, fee...|some movie feel l...| | live_by_night| Live by Night|It's really easy ...|its really easy t...|[{document, 0, 11...|[{token, 0, 2, it...|[{token, 0, 2, it...|[{token, 0, 2, it...|[it, really, easy...|it really easy to...| | neruda_2016| Neruda|One of the most s...|one of the most s...|[{document, 0, 93...|[{token, 0, 2, on...|[{token, 0, 2, on...|[{token, 0, 2, on...|[one, of, the, mo...|one of the most s...| |the_snowtown_murders|The Snowtown Murders|It's the 
escalati...|its the escalatin...|[{document, 0, 21...|[{token, 0, 2, it...|[{token, 0, 2, it...|[{token, 0, 2, it...|[it, the, escalat...|it the escalate s...| | gods_country_2022| God's Country|Higgins and Newto...|higgins and newto...|[{document, 0, 80...|[{token, 0, 6, hi...|[{token, 0, 6, hi...|[{token, 0, 6, hi...|[higgins, and, ne...|higgins and newto...| | mogul_mowgli| Mogul Mowgli|Tariq's embrace o...|tariqs embrace of...|[{document, 0, 16...|[{token, 0, 5, ta...|[{token, 0, 5, ta...|[{token, 0, 5, ta...|[tariqs, embrace,...|tariqs embrace of...| | cats_of_mirikitani|The Cats of Mirik...|It should not hav...|it should not hav...|[{document, 0, 88...|[{token, 0, 1, it...|[{token, 0, 1, it...|[{token, 0, 1, it...|[it, should, not,...|it should not hav...| | sharkwater| Sharkwater|How strange to ga...|how strange to ga...|[{document, 0, 13...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[how, strange, to...|how strange to ga...| | he_said_she_said| He Said, She Said|Two likable leads...|two likable leads...|[{document, 0, 53...|[{token, 0, 2, tw...|[{token, 0, 2, tw...|[{token, 0, 2, tw...|[two, likable, le...|two likable lead ...| | monsters_ball| Monster's Ball|Please read revie...|please read revie...|[{document, 0, 50...|[{token, 0, 5, pl...|[{token, 0, 5, pl...|[{token, 0, 5, pl...|[please, read, re...|please read revie...| | summer_in_february| Summer in February|Blah Britpic offe...|blah britpic offe...|[{document, 0, 42...|[{token, 0, 3, bl...|[{token, 0, 3, bl...|[{token, 0, 3, bl...|[blah, britpic, o...|blah britpic offe...| |the_best_exotic_m...|The Best Exotic M...|[Madden directs] ...|madden directs a ...|[{document, 0, 17...|[{token, 0, 5, ma...|[{token, 0, 5, ma...|[{token, 0, 5, ma...|[madden, direct, ...|madden direct a t...| | millions| Millions|Rich, rewarding, ...|rich rewarding an...|[{document, 0, 42...|[{token, 0, 3, ri...|[{token, 0, 3, ri...|[{token, 0, 3, ri...|[rich, reward, an...|rich reward and d...| | 
tully_2018| Tully|An emotionally de...|an emotionally de...|[{document, 0, 16...|[{token, 0, 1, an...|[{token, 0, 1, an...|[{token, 0, 1, an...|[an, emotionally,...|an emotionally de...| | cop_car| Cop Car|Despite the real,...|despite the real ...|[{document, 0, 14...|[{token, 0, 6, de...|[{token, 0, 6, de...|[{token, 0, 6, de...|[despite, the, re...|despite the real ...| |the_incredible_je...|The Incredible Je...|A comedy centered...|a comedy centered...|[{document, 0, 23...|[{token, 0, 0, a,...|[{token, 0, 0, a,...|[{token, 0, 0, a,...|[a, comedy, cente...|a comedy center a...| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ only showing top 20 rows
# Persist the processed reviews to S3 as parquet via the s3a connector.
result.write.parquet("s3a://project-group34/project/sentiment_analysis/cleaned_reviews/")
23/11/23 19:26:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties 23/11/23 19:26:08 WARN TaskSetManager: Stage 9 contains a task of very large size (94106 KiB). The maximum recommended task size is 1000 KiB.
# restart kernel
# NOTE(review): presumably restarts to release the previous Spark driver JVM
# before a new session is created below — confirm this is intentional.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
import pyspark.sql.functions as F
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Re-create the local Spark session after the kernel restart (same settings
# as the first session above).
#
# FIX: Hadoop/S3A options set through SparkSession.builder.config() must be
# prefixed with "spark.hadoop." — without the prefix Spark discards the key
# (the run log showed "Warning: Ignoring non-Spark config property:
# fs.s3a.aws.credentials.provider"), and the S3A connector silently falls
# back to its default credential chain.
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory", "16G")\
    .config("spark.executor.memory", "12g")\
    .config("spark.executor.cores", "3")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")\
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider"
    )\
    .getOrCreate()
print(spark.version)
Warning: Ignoring non-Spark config property: fs.s3a.aws.credentials.provider
:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2/cache The jars for the packages stored in: /root/.ivy2/jars com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency org.apache.hadoop#hadoop-aws added as a dependency :: resolving dependencies :: org.apache.spark#spark-submit-parent-e8291c0b-a674-4016-9c62-cb05c896f65d;1.0 confs: [default] found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central found com.typesafe#config;1.4.2 in central found org.rocksdb#rocksdbjni;6.29.5 in central found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central found com.github.universal-automata#liblevenshtein;3.0.0 in central found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central found com.google.code.gson#gson;2.3 in central found it.unimi.dsi#fastutil;7.0.12 in central found org.projectlombok#lombok;1.16.8 in central found com.google.cloud#google-cloud-storage;2.20.1 in central found com.google.guava#guava;31.1-jre in central found com.google.guava#failureaccess;1.0.1 in central found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central found com.google.errorprone#error_prone_annotations;2.18.0 in central found com.google.j2objc#j2objc-annotations;1.3 in central found com.google.http-client#google-http-client;1.43.0 in central found io.opencensus#opencensus-contrib-http-util;0.31.1 in central found com.google.http-client#google-http-client-jackson2;1.43.0 in central found com.google.http-client#google-http-client-gson;1.43.0 in central found com.google.api-client#google-api-client;2.2.0 in central found commons-codec#commons-codec;1.15 in central found com.google.oauth-client#google-oauth-client;1.34.1 in central found com.google.http-client#google-http-client-apache-v2;1.43.0 in central found com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central found com.google.code.gson#gson;2.10.1 in central found com.google.cloud#google-cloud-core;2.12.0 in central 
found io.grpc#grpc-context;1.53.0 in central found com.google.auto.value#auto-value-annotations;1.10.1 in central found com.google.auto.value#auto-value;1.10.1 in central found javax.annotation#javax.annotation-api;1.3.2 in central found commons-logging#commons-logging;1.2 in central found com.google.cloud#google-cloud-core-http;2.12.0 in central found com.google.http-client#google-http-client-appengine;1.43.0 in central found com.google.api#gax-httpjson;0.108.2 in central found com.google.cloud#google-cloud-core-grpc;2.12.0 in central found io.grpc#grpc-alts;1.53.0 in central found io.grpc#grpc-grpclb;1.53.0 in central found org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central found io.grpc#grpc-auth;1.53.0 in central found io.grpc#grpc-protobuf;1.53.0 in central found io.grpc#grpc-protobuf-lite;1.53.0 in central found io.grpc#grpc-core;1.53.0 in central found com.google.api#gax;2.23.2 in central found com.google.api#gax-grpc;2.23.2 in central found com.google.auth#google-auth-library-credentials;1.16.0 in central found com.google.auth#google-auth-library-oauth2-http;1.16.0 in central found com.google.api#api-common;2.6.2 in central found io.opencensus#opencensus-api;0.31.1 in central found com.google.api.grpc#proto-google-iam-v1;1.9.2 in central found com.google.protobuf#protobuf-java;3.21.12 in central found com.google.protobuf#protobuf-java-util;3.21.12 in central found com.google.api.grpc#proto-google-common-protos;2.14.2 in central found org.threeten#threetenbp;1.6.5 in central found com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central found com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central found com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central found com.fasterxml.jackson.core#jackson-core;2.14.2 in central found com.google.code.findbugs#jsr305;3.0.2 in central found io.grpc#grpc-api;1.53.0 in central found io.grpc#grpc-stub;1.53.0 in central found org.checkerframework#checker-qual;3.31.0 
in central found io.perfmark#perfmark-api;0.26.0 in central found com.google.android#annotations;4.1.1.4 in central found org.codehaus.mojo#animal-sniffer-annotations;1.22 in central found io.opencensus#opencensus-proto;0.2.0 in central found io.grpc#grpc-services;1.53.0 in central found com.google.re2j#re2j;1.6 in central found io.grpc#grpc-netty-shaded;1.53.0 in central found io.grpc#grpc-googleapis;1.53.0 in central found io.grpc#grpc-xds;1.53.0 in central found com.navigamez#greex;1.0 in central found dk.brics.automaton#automaton;1.11-8 in central found com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central found com.microsoft.onnxruntime#onnxruntime;1.15.0 in central found org.apache.hadoop#hadoop-aws;3.2.2 in central :: resolution report :: resolve 4147ms :: artifacts dl 517ms :: modules in use: com.amazonaws#aws-java-sdk-bundle;1.11.828 from central in [default] com.fasterxml.jackson.core#jackson-core;2.14.2 from central in [default] com.github.universal-automata#liblevenshtein;3.0.0 from central in [default] com.google.android#annotations;4.1.1.4 from central in [default] com.google.api#api-common;2.6.2 from central in [default] com.google.api#gax;2.23.2 from central in [default] com.google.api#gax-grpc;2.23.2 from central in [default] com.google.api#gax-httpjson;0.108.2 from central in [default] com.google.api-client#google-api-client;2.2.0 from central in [default] com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default] com.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default] com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default] com.google.auth#google-auth-library-credentials;1.16.0 from central in [default] 
com.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default] com.google.auto.value#auto-value;1.10.1 from central in [default] com.google.auto.value#auto-value-annotations;1.10.1 from central in [default] com.google.cloud#google-cloud-core;2.12.0 from central in [default] com.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default] com.google.cloud#google-cloud-core-http;2.12.0 from central in [default] com.google.cloud#google-cloud-storage;2.20.1 from central in [default] com.google.code.findbugs#jsr305;3.0.2 from central in [default] com.google.code.gson#gson;2.10.1 from central in [default] com.google.errorprone#error_prone_annotations;2.18.0 from central in [default] com.google.guava#failureaccess;1.0.1 from central in [default] com.google.guava#guava;31.1-jre from central in [default] com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default] com.google.http-client#google-http-client;1.43.0 from central in [default] com.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default] com.google.http-client#google-http-client-appengine;1.43.0 from central in [default] com.google.http-client#google-http-client-gson;1.43.0 from central in [default] com.google.http-client#google-http-client-jackson2;1.43.0 from central in [default] com.google.j2objc#j2objc-annotations;1.3 from central in [default] com.google.oauth-client#google-oauth-client;1.34.1 from central in [default] com.google.protobuf#protobuf-java;3.21.12 from central in [default] com.google.protobuf#protobuf-java-util;3.21.12 from central in [default] com.google.re2j#re2j;1.6 from central in [default] com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 from central in [default] com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default] com.microsoft.onnxruntime#onnxruntime;1.15.0 from central in [default] com.navigamez#greex;1.0 from central in [default] com.typesafe#config;1.4.2 from central in [default] 
commons-codec#commons-codec;1.15 from central in [default] commons-logging#commons-logging;1.2 from central in [default] dk.brics.automaton#automaton;1.11-8 from central in [default] io.grpc#grpc-alts;1.53.0 from central in [default] io.grpc#grpc-api;1.53.0 from central in [default] io.grpc#grpc-auth;1.53.0 from central in [default] io.grpc#grpc-context;1.53.0 from central in [default] io.grpc#grpc-core;1.53.0 from central in [default] io.grpc#grpc-googleapis;1.53.0 from central in [default] io.grpc#grpc-grpclb;1.53.0 from central in [default] io.grpc#grpc-netty-shaded;1.53.0 from central in [default] io.grpc#grpc-protobuf;1.53.0 from central in [default] io.grpc#grpc-protobuf-lite;1.53.0 from central in [default] io.grpc#grpc-services;1.53.0 from central in [default] io.grpc#grpc-stub;1.53.0 from central in [default] io.grpc#grpc-xds;1.53.0 from central in [default] io.opencensus#opencensus-api;0.31.1 from central in [default] io.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default] io.opencensus#opencensus-proto;0.2.0 from central in [default] io.perfmark#perfmark-api;0.26.0 from central in [default] it.unimi.dsi#fastutil;7.0.12 from central in [default] javax.annotation#javax.annotation-api;1.3.2 from central in [default] org.apache.hadoop#hadoop-aws;3.2.2 from central in [default] org.checkerframework#checker-qual;3.31.0 from central in [default] org.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default] org.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default] org.projectlombok#lombok;1.16.8 from central in [default] org.rocksdb#rocksdbjni;6.29.5 from central in [default] org.threeten#threetenbp;1.6.5 from central in [default] :: evicted modules: com.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default] com.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] in [default] com.google.code.gson#gson;2.3 by 
[com.google.code.gson#gson;2.10.1] in [default] com.amazonaws#aws-java-sdk-bundle;1.11.563 by [com.amazonaws#aws-java-sdk-bundle;1.11.828] in [default] --------------------------------------------------------------------- | | modules || artifacts | | conf | number| search|dwnlded|evicted|| number|dwnlded| --------------------------------------------------------------------- | default | 77 | 0 | 0 | 4 || 73 | 0 | --------------------------------------------------------------------- :: retrieving :: org.apache.spark#spark-submit-parent-e8291c0b-a674-4016-9c62-cb05c896f65d confs: [default] 0 artifacts copied, 73 already retrieved (0kB/169ms) 23/11/23 22:09:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
3.4.0
# Load the cleaned review data produced by the preprocessing stage.
cleaned_reviews_path = "s3a://project-group34/project/sentiment_analysis/cleaned_reviews/"
result = spark.read.parquet(cleaned_reviews_path)
23/11/23 22:09:36 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
# Sanity-check: preview the first five cleaned reviews.
result.show(n=5)
[Stage 1:> (0 + 1) / 1]
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | id| title| reviewText| cleanedText| document| token| normalized| lemma| finished_lemma| final_text| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | i_do_until_i_dont|I Do... Until I D...|If you enjoy watc...|if you enjoy watc...|[{document, 0, 59...|[{token, 0, 1, if...|[{token, 0, 1, if...|[{token, 0, 1, if...|[if, you, enjoy, ...|if you enjoy watc...| |1009716-hobsons_c...| Hobson's Choice|So much of what m...|so much of what m...|[{document, 0, 14...|[{token, 0, 1, so...|[{token, 0, 1, so...|[{token, 0, 1, so...|[so, much, of, wh...|so much of what m...| | the_bubble_2022| The Bubble|The fact that The...|the fact that the...|[{document, 0, 19...|[{token, 0, 2, th...|[{token, 0, 2, th...|[{token, 0, 2, th...|[the, fact, that,...|the fact that the...| | bright_young_things| Bright Young Things|If there was ever...|if there was ever...|[{document, 0, 12...|[{token, 0, 1, if...|[{token, 0, 1, if...|[{token, 0, 1, if...|[if, there, be, e...|if there be ever ...| | la_cage_aux_folles| Birds of a Feather|How The Birdcage ...|how the birdcage ...|[{document, 0, 51...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[how, the, birdca...|how the birdcage ...| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ only showing top 5 rows
# Wrap the lemmatized text into Spark NLP `document` annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("final_text")
    .setOutputCol("document")
)

# Local paths to pre-downloaded models (avoids pulling them from the hub).
tfhub_use_path = "../../../cache_pretrained/tfhub_use_en_2.4.0_2.4_1587136330099/"
sentimentdl_use_twitter_path = "../../../cache_pretrained/sentimentdl_use_twitter_en_2.7.1_2.4_1610983524713/"

# Universal Sentence Encoder: document -> sentence embeddings.
use = (
    UniversalSentenceEncoder.load(tfhub_use_path)
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# SentimentDL model (trained on Twitter data): embeddings -> sentiment label.
sentimentdl = (
    SentimentDLModel.load(sentimentdl_use_twitter_path)
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Assemble the three annotators into a single pipeline.
pipeline_stages = [documentAssembler, use, sentimentdl]
nlpPipeline = Pipeline(stages=pipeline_stages)
2023-11-23 22:10:09.324275: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-11-23 22:10:13.686569: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. 2023-11-23 22:10:13.733440: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. 2023-11-23 22:10:13.780734: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. 2023-11-23 22:10:13.825948: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. 2023-11-23 22:10:13.875391: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. WARNING: An illegal reflective access operation has occurred WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/spark-core_2.12-3.4.0.jar) to field java.lang.ref.Reference.referent WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$ WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations WARNING: All illegal access operations will be denied in a future release
# use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
# .setInputCols(["document"])\
# .setOutputCol("sentence_embeddings")
# sentimentdl = SentimentDLModel.pretrained(name="sentimentdl_use_twitter", lang="en")\
# .setInputCols(["sentence_embeddings"])\
# .setOutputCol("sentiment")
# # sentimentdl1 = ClassifierDLModel.pretrained(name="classifierdl_use_emotion")\
# # .setInputCols(["sentence_embeddings"])\
# # .setOutputCol("sentiment_emotion")
# Fit the pipeline (the stages here are pre-trained, so fitting mostly wires
# the annotators together) and annotate every review with a sentiment label.
sentiment_model = nlpPipeline.fit(result)
result = sentiment_model.transform(result)
result.show(n=5)
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | id| title| reviewText| cleanedText| document| token| normalized| lemma| finished_lemma| final_text| sentence_embeddings| sentiment| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | i_do_until_i_dont|I Do... Until I D...|If you enjoy watc...|if you enjoy watc...|[{document, 0, 52...|[{token, 0, 1, if...|[{token, 0, 1, if...|[{token, 0, 1, if...|[if, you, enjoy, ...|if you enjoy watc...|[{sentence_embedd...|[{category, 0, 52...| |1009716-hobsons_c...| Hobson's Choice|So much of what m...|so much of what m...|[{document, 0, 13...|[{token, 0, 1, so...|[{token, 0, 1, so...|[{token, 0, 1, so...|[so, much, of, wh...|so much of what m...|[{sentence_embedd...|[{category, 0, 13...| | the_bubble_2022| The Bubble|The fact that The...|the fact that the...|[{document, 0, 19...|[{token, 0, 2, th...|[{token, 0, 2, th...|[{token, 0, 2, th...|[the, fact, that,...|the fact that the...|[{sentence_embedd...|[{category, 0, 19...| | bright_young_things| Bright Young Things|If there was ever...|if there was ever...|[{document, 0, 11...|[{token, 0, 1, if...|[{token, 0, 1, if...|[{token, 0, 1, if...|[if, there, be, e...|if there be ever ...|[{sentence_embedd...|[{category, 0, 11...| | la_cage_aux_folles| Birds of a Feather|How The Birdcage ...|how the birdcage ...|[{document, 0, 46...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[how, the, birdca...|how the birdcage ...|[{sentence_embedd...|[{category, 0, 46...| 
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ only showing top 5 rows
# Persist the annotated reviews back to S3, replacing any previous run.
result.write.mode("overwrite").parquet(
    "s3a://project-group34/project/sentiment_analysis/sentiment_extracted/"
)
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
import pyspark.sql.functions as F
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Local-mode session sized for the notebook instance, with the Spark NLP and
# S3A connector jars on the classpath; AWS credentials come from the
# container role (SageMaker).
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.executor.memory", "12g")
    .config("spark.executor.cores", "3")
    .config("spark.jars.packages",
            "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")
    .config("fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.ContainerCredentialsProvider")
    .getOrCreate()
)
print(spark.version)
3.4.0
# Reload the sentiment-annotated reviews written by the previous session.
sentiment_extracted_path = "s3a://project-group34/project/sentiment_analysis/sentiment_extracted/"
result = spark.read.parquet(sentiment_extracted_path)
23/11/23 22:55:13 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
# Flatten the Spark NLP `sentiment` annotations into plain columns.
#
# `sentiment.result` holds the predicted label and `sentiment.metadata` the
# per-class confidence scores; `arrays_zip` keeps each label aligned with its
# scores when the array is exploded (one output row per annotation).
#
# Cleaned up from the original: one chained select instead of two passes,
# `F.col` instead of the non-idiomatic `F.expr` for bare column references,
# and no redundant re-import of `col` (already imported at the top of the
# session).
result = result.select(
    F.explode(F.arrays_zip("sentiment.result", "sentiment.metadata")).alias("cols"),
    F.col("title").alias("title"),
    F.col("cleanedText").alias("text"),
).select(
    F.col("cols.result").alias("sentiment"),
    F.col("cols.metadata")["positive"].alias("positive_score"),
    F.col("cols.metadata")["negative"].alias("negative_score"),
    "title",
    "text",
)

# Show the first few rows of the flattened DataFrame.
result.show(5)
[Stage 1:> (0 + 1) / 1]
+---------+--------------+--------------+--------------------+--------------------+ |sentiment|positive_score|negative_score| title| text| +---------+--------------+--------------+--------------------+--------------------+ | positive| 1.0| 0.0|I Do... Until I D...|if you enjoy watc...| | positive| 0.8947771| 0.10522288| Hobson's Choice|so much of what m...| | positive| 0.77628094| 0.22371912| The Bubble|the fact that the...| | positive| 1.0| 0.0| Bright Young Things|if there was ever...| | positive| 1.0| 0.0| Birds of a Feather|how the birdcage ...| +---------+--------------+--------------+--------------------+--------------------+ only showing top 5 rows
# Save the flattened sentiment table for downstream aggregation.
result.write.mode("overwrite").parquet(
    "s3a://project-group34/project/sentiment_analysis/sentiment_processed/"
)
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
import pyspark.sql.functions as F
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Session for the aggregation stage (same settings as the earlier sessions):
# local mode, Spark NLP + hadoop-aws jars, container-provided AWS credentials.
_builder = SparkSession.builder.appName("Spark NLP").master("local[*]")
for _key, _value in {
    "spark.driver.memory": "16G",
    "spark.executor.memory": "12g",
    "spark.executor.cores": "3",
    "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2",
    "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.ContainerCredentialsProvider",
}.items():
    _builder = _builder.config(_key, _value)
spark = _builder.getOrCreate()
print(spark.version)
3.4.0
# Reload the per-review sentiment scores.
result = spark.read.parquet("s3a://project-group34/project/sentiment_analysis/sentiment_processed/")

# Collapse to one row per movie: mean positive/negative confidence plus the
# number of reviews contributing to each mean.
result = result.groupBy("title").agg(
    F.avg("positive_score").alias("average_positive_score"),
    F.avg("negative_score").alias("average_negative_score"),
    F.count("positive_score").alias("num_reviews"),
)

result.show(n=5)
[Stage 29:===========================================> (3 + 1) / 4]
+--------------------+----------------------+----------------------+-----------+ | title|average_positive_score|average_negative_score|num_reviews| +--------------------+----------------------+----------------------+-----------+ | Kin| 0.7215052546283167| 0.2784947445187336| 98| |Taking Sides: Le ...| 0.8364778709963958| 0.1635221320418551| 48| | How to Deal| 0.6003653085125116| 0.3996346922415671| 90| | On My Way| 0.8729917092682926| 0.12700829512315298| 41| | Black Nativity| 0.8128262822475695| 0.1871737217950096| 90| +--------------------+----------------------+----------------------+-----------+ only showing top 5 rows
# Persist the per-movie averages.
result.write.mode("overwrite").parquet(
    "s3a://project-group34/project/sentiment_analysis/average_scores_per_movie/"
)

# NOTE(review): this shadows the builtins `min`/`max` for the rest of the
# session — fine here since later cells don't use them.
from pyspark.sql.functions import mean, median, min, max

# Summary statistics over the per-movie table. `median` is a built-in
# aggregate from Spark 3.4 onward, which is what this notebook runs.
stats = result.agg(
    mean(col("average_positive_score")).alias("mean_positive"),
    mean(col("average_negative_score")).alias("mean_negative"),
    median(col("num_reviews")).alias("median_num_reviews"),
    min(col("num_reviews")).alias("min_num_reviews"),
    max(col("num_reviews")).alias("max_num_reviews"),
)
stats.show()
[Stage 68:=============================> (2 + 2) / 4]
+------------------+-------------------+------------------+---------------+---------------+ | mean_positive| mean_negative|median_num_reviews|min_num_reviews|max_num_reviews| +------------------+-------------------+------------------+---------------+---------------+ |0.7525733910082344|0.24742646776123378| 4.0| 1| 1921| +------------------+-------------------+------------------+---------------+---------------+
# Count how many movies clear the review-volume threshold.
#
# Bug fix: the original variable name and printed message claimed "fewer than
# 4 reviews", but the filter actually selects movies with MORE than 10
# reviews. The name, message, and threshold constant now agree with the
# computation.
min_reviews = 10
num_movies_above_threshold = result.filter(col("num_reviews") > min_reviews).count()
print(f"Number of movies with more than {min_reviews} reviews: {num_movies_above_threshold}")
[Stage 62:=============================> (2 + 2) / 4]
Number of movies with fewer than 4 reviews: 19794
# Weight each movie's average scores by its review count, so titles with more
# reviews carry proportionally larger totals.
review_count = col("num_reviews")
result = (
    result
    .withColumn("normalized_positive_score", col("average_positive_score") * review_count)
    .withColumn("normalized_negative_score", col("average_negative_score") * review_count)
)
result.show(n=5)
[Stage 35:===========================================> (3 + 1) / 4]
+--------------------+----------------------+----------------------+-----------+-------------------------+-------------------------+ | title|average_positive_score|average_negative_score|num_reviews|normalized_positive_score|normalized_negative_score| +--------------------+----------------------+----------------------+-----------+-------------------------+-------------------------+ | Kin| 0.7215052546283167| 0.2784947445187336| 98| 70.70751495357504| 27.29248496283589| |Taking Sides: Le ...| 0.8364778709963958| 0.1635221320418551| 48| 40.150937807827| 7.849062338009045| | How to Deal| 0.6003653085125116| 0.3996346922415671| 90| 54.03287776612604| 35.96712230174104| | On My Way| 0.8729917092682926| 0.12700829512315298| 41| 35.79266008| 5.207340100049272| | Black Nativity| 0.8128262822475695| 0.1871737217950096| 90| 73.15436540228126| 16.845634961550864| +--------------------+----------------------+----------------------+-----------+-------------------------+-------------------------+ only showing top 5 rows
from pyspark.sql.functions import col

# Bayesian smoothing of per-movie averages (IMDB-style weighted rating):
#     WR = v/(v+m) * R  +  m/(v+m) * C
# where v = num_reviews, R = the movie's own average score, C = the global
# mean score (prior), and m = the prior strength (median review count).
C_positive = 0.7525733910082344  # Mean of average_positive_score (from stats above)
C_negative = 0.24742646776123378  # Mean of average_negative_score (from stats above)
m = 4  # Median number of reviews, can adjust as needed


def _weighted_rating(score_col, prior_mean):
    """Return the Bayesian weighted-rating Column for `score_col`.

    Blends the movie's own average with the global prior `prior_mean`,
    trusting the movie's average more as `num_reviews` grows.
    """
    v = col("num_reviews")
    return (v / (v + m)) * col(score_col) + (m / (v + m)) * prior_mean


# Same formula applied to both polarities (deduplicates the original
# copy-pasted expressions).
positive_weighted_rating_df = result.withColumn(
    "weighted_rating", _weighted_rating("average_positive_score", C_positive)
)
negative_weighted_rating_df = result.withColumn(
    "weighted_rating", _weighted_rating("average_negative_score", C_negative)
)
# Twenty most positively reviewed movies by smoothed weighted rating.
top_20_positive = positive_weighted_rating_df.orderBy(col("weighted_rating").desc()).limit(20)
top_20_positive.select("title", "weighted_rating").show()
[Stage 77:=============================> (2 + 2) / 4]
+--------------------+------------------+ | title| weighted_rating| +--------------------+------------------+ |Jim Allison: Brea...|0.9809671839237104| |Hava Nagila (The ...|0.9789505983669412| | Trifling Women|0.9786921542506863| |Blinky Bill the M...|0.9776142600840194| |Molly's Theory of...|0.9775049055462031| |Blind Willow, Sle...|0.9725081545564704| |California Typewr...|0.9717155569098395| | Half Magic| 0.969509225369288| | Things Never Said|0.9682975731008235| | I'm Leaving Now|0.9682241038760293| | The Farthest|0.9680655310978367| |Bill Cunningham N...|0.9677857000523757| |Richard Linklater...|0.9676412249723696| | Breaking Fast|0.9675419343236431| | Moonage Daydream|0.9661923959624714| | Suzi Q|0.9661492989421452| |Salvatore: Shoema...|0.9657396017786927| |Is the Man Who Is...|0.9652498774769747| |The Eyes of Orson...| 0.964369877480659| | Gentleman Jim| 0.963343549038257| +--------------------+------------------+
# Twenty most negatively reviewed movies by smoothed weighted rating.
top_20_negative = negative_weighted_rating_df.orderBy(col("weighted_rating").desc()).limit(20)
top_20_negative.select("title", "weighted_rating").show()
[Stage 89:=============================> (2 + 2) / 4]
+--------------------+------------------+ | title| weighted_rating| +--------------------+------------------+ |Dinner With the P...|0.8108478919403085| | Fangs|0.7490668759204112| | Elephant Tales|0.7477467559204113| | Arisaka| 0.745482372587078| | The Phantom Planet|0.7426795559204112| | 12 in a Box|0.6973159521044934| | Audrie & Daisy|0.6911893001508967| |A Tree of Life: T...|0.6902592416188412| | Tears of Gaza| 0.689034361316461| | Moscow Zero|0.6797639392537446| | Of Fish and Foe|0.6789700581044935| | 37|0.6702393639265335| | Gut Renovation|0.6672347836460667| | Bloody Birthday| 0.666751160094994| | Cries From Syria|0.6652684999080672| | American Relapse|0.6638483609204112| |The Bad News Bear...| 0.660546783256329| | May Fools|0.6593032419403084| | Private Violence|0.6576097419403084| | Radioland Murders|0.6575381655495395| +--------------------+------------------+
# Collect both leaderboards to the driver as pandas DataFrames (safe here:
# each is capped at 20 rows by the limit() above).
top_20_positive_pd = top_20_positive.toPandas()
top_20_negative_pd = top_20_negative.toPandas()
# Export to CSV for the report/visualization step.
top_20_positive_pd.to_csv("../../data/csv/20_positive_movieReviews.csv", index = False)
top_20_negative_pd.to_csv("../../data/csv/20_negative_movieReviews.csv", index = False)