# Setup - Run only once per Kernel App
# Install a JDK via conda; Spark needs Java available on the driver JVM.
%conda install openjdk -y
# install PySpark
%pip install pyspark==3.4.0
# install spark-nlp
%pip install spark-nlp==5.1.3
# restart kernel
# NOTE(review): injects JS that restarts the classic Jupyter notebook kernel so
# the freshly installed packages are picked up; has no effect in JupyterLab.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
Collecting package metadata (current_repodata.json): done Solving environment: done ==> WARNING: A newer version of conda exists. <== current version: 23.3.1 latest version: 23.10.0 Please update conda by running $ conda update -n base -c defaults conda Or to minimize the number of packages updated during conda update use conda install conda=23.10.0 ## Package Plan ## environment location: /opt/conda added / updated specs: - openjdk The following packages will be downloaded: package | build ---------------------------|----------------- ca-certificates-2023.08.22 | h06a4308_0 123 KB certifi-2023.11.17 | py310h06a4308_0 158 KB openjdk-11.0.13 | h87a67e3_0 341.0 MB ------------------------------------------------------------ Total: 341.3 MB The following NEW packages will be INSTALLED: openjdk pkgs/main/linux-64::openjdk-11.0.13-h87a67e3_0 The following packages will be UPDATED: ca-certificates conda-forge::ca-certificates-2023.7.2~ --> pkgs/main::ca-certificates-2023.08.22-h06a4308_0 certifi conda-forge/noarch::certifi-2023.7.22~ --> pkgs/main/linux-64::certifi-2023.11.17-py310h06a4308_0 Downloading and Extracting Packages openjdk-11.0.13 | 341.0 MB | | 0% ca-certificates-2023 | 123 KB | | 0% certifi-2023.11.17 | 158 KB | | 0% ca-certificates-2023 | 123 KB | ##################################### | 100% Preparing transaction: done Verifying transaction: done Executing transaction: done Note: you may need to restart the kernel to use updated packages. Collecting pyspark==3.4.0 Using cached pyspark-3.4.0-py2.py3-none-any.whl Collecting py4j==0.10.9.7 (from pyspark==3.4.0) Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB) Installing collected packages: py4j, pyspark Successfully installed py4j-0.10.9.7 pyspark-3.4.0 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv [notice] A new release of pip is available: 23.2.1 -> 23.3.1 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages. Collecting spark-nlp==5.1.3 Obtaining dependency information for spark-nlp==5.1.3 from https://files.pythonhosted.org/packages/cd/7d/bc0eca4c9ec4c9c1d9b28c42c2f07942af70980a7d912d0aceebf8db32dd/spark_nlp-5.1.3-py2.py3-none-any.whl.metadata Using cached spark_nlp-5.1.3-py2.py3-none-any.whl.metadata (53 kB) Using cached spark_nlp-5.1.3-py2.py3-none-any.whl (537 kB) Installing collected packages: spark-nlp Successfully installed spark-nlp-5.1.3 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv [notice] A new release of pip is available: 23.2.1 -> 23.3.1 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
# Fetch the WordNet and Open Multilingual WordNet (omw-1.4) corpora needed by
# WordNetLemmatizer. Safe to re-run: nltk skips data that is already present.
nltk.download('wordnet')
nltk.download('omw-1.4')
[nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package omw-1.4 to /root/nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
True
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
import pyspark.sql.functions as F
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Build a local Spark session with the Spark NLP and hadoop-aws jars on the
# classpath so s3a:// paths can be read/written.
#
# FIX: Hadoop/S3A options set through SparkSession.builder.config() must be
# prefixed with "spark.hadoop." — without the prefix Spark discards the key
# (the run log showed "Warning: Ignoring non-Spark config property:
# fs.s3a.aws.credentials.provider"), and the S3A connector silently falls
# back to its default credential chain.
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory", "16G")\
    .config("spark.executor.memory", "12g")\
    .config("spark.executor.cores", "3")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")\
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider"
    )\
    .getOrCreate()
print(spark.version)
Warning: Ignoring non-Spark config property: fs.s3a.aws.credentials.provider
:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2/cache The jars for the packages stored in: /root/.ivy2/jars com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency org.apache.hadoop#hadoop-aws added as a dependency :: resolving dependencies :: org.apache.spark#spark-submit-parent-4fcb9748-541d-4d17-a6d5-5e71873aa972;1.0 confs: [default] found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central found com.typesafe#config;1.4.2 in central found org.rocksdb#rocksdbjni;6.29.5 in central found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central found com.github.universal-automata#liblevenshtein;3.0.0 in central found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central found com.google.code.gson#gson;2.3 in central found it.unimi.dsi#fastutil;7.0.12 in central found org.projectlombok#lombok;1.16.8 in central found com.google.cloud#google-cloud-storage;2.20.1 in central found com.google.guava#guava;31.1-jre in central found com.google.guava#failureaccess;1.0.1 in central found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central found com.google.errorprone#error_prone_annotations;2.18.0 in central found com.google.j2objc#j2objc-annotations;1.3 in central found com.google.http-client#google-http-client;1.43.0 in central found io.opencensus#opencensus-contrib-http-util;0.31.1 in central found com.google.http-client#google-http-client-jackson2;1.43.0 in central found com.google.http-client#google-http-client-gson;1.43.0 in central found com.google.api-client#google-api-client;2.2.0 in central found commons-codec#commons-codec;1.15 in central found com.google.oauth-client#google-oauth-client;1.34.1 in central found com.google.http-client#google-http-client-apache-v2;1.43.0 in central found com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central found com.google.code.gson#gson;2.10.1 in central found com.google.cloud#google-cloud-core;2.12.0 in central 
found io.grpc#grpc-context;1.53.0 in central found com.google.auto.value#auto-value-annotations;1.10.1 in central found com.google.auto.value#auto-value;1.10.1 in central found javax.annotation#javax.annotation-api;1.3.2 in central found commons-logging#commons-logging;1.2 in central found com.google.cloud#google-cloud-core-http;2.12.0 in central found com.google.http-client#google-http-client-appengine;1.43.0 in central found com.google.api#gax-httpjson;0.108.2 in central found com.google.cloud#google-cloud-core-grpc;2.12.0 in central found io.grpc#grpc-alts;1.53.0 in central found io.grpc#grpc-grpclb;1.53.0 in central found org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central found io.grpc#grpc-auth;1.53.0 in central found io.grpc#grpc-protobuf;1.53.0 in central found io.grpc#grpc-protobuf-lite;1.53.0 in central found io.grpc#grpc-core;1.53.0 in central found com.google.api#gax;2.23.2 in central found com.google.api#gax-grpc;2.23.2 in central found com.google.auth#google-auth-library-credentials;1.16.0 in central found com.google.auth#google-auth-library-oauth2-http;1.16.0 in central found com.google.api#api-common;2.6.2 in central found io.opencensus#opencensus-api;0.31.1 in central found com.google.api.grpc#proto-google-iam-v1;1.9.2 in central found com.google.protobuf#protobuf-java;3.21.12 in central found com.google.protobuf#protobuf-java-util;3.21.12 in central found com.google.api.grpc#proto-google-common-protos;2.14.2 in central found org.threeten#threetenbp;1.6.5 in central found com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central found com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central found com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central found com.fasterxml.jackson.core#jackson-core;2.14.2 in central found com.google.code.findbugs#jsr305;3.0.2 in central found io.grpc#grpc-api;1.53.0 in central found io.grpc#grpc-stub;1.53.0 in central found org.checkerframework#checker-qual;3.31.0 
in central found io.perfmark#perfmark-api;0.26.0 in central found com.google.android#annotations;4.1.1.4 in central found org.codehaus.mojo#animal-sniffer-annotations;1.22 in central found io.opencensus#opencensus-proto;0.2.0 in central found io.grpc#grpc-services;1.53.0 in central found com.google.re2j#re2j;1.6 in central found io.grpc#grpc-netty-shaded;1.53.0 in central found io.grpc#grpc-googleapis;1.53.0 in central found io.grpc#grpc-xds;1.53.0 in central found com.navigamez#greex;1.0 in central found dk.brics.automaton#automaton;1.11-8 in central found com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central found com.microsoft.onnxruntime#onnxruntime;1.15.0 in central found org.apache.hadoop#hadoop-aws;3.2.2 in central :: resolution report :: resolve 3713ms :: artifacts dl 391ms :: modules in use: com.amazonaws#aws-java-sdk-bundle;1.11.828 from central in [default] com.fasterxml.jackson.core#jackson-core;2.14.2 from central in [default] com.github.universal-automata#liblevenshtein;3.0.0 from central in [default] com.google.android#annotations;4.1.1.4 from central in [default] com.google.api#api-common;2.6.2 from central in [default] com.google.api#gax;2.23.2 from central in [default] com.google.api#gax-grpc;2.23.2 from central in [default] com.google.api#gax-httpjson;0.108.2 from central in [default] com.google.api-client#google-api-client;2.2.0 from central in [default] com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default] com.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default] com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default] com.google.auth#google-auth-library-credentials;1.16.0 from central in [default] 
com.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default] com.google.auto.value#auto-value;1.10.1 from central in [default] com.google.auto.value#auto-value-annotations;1.10.1 from central in [default] com.google.cloud#google-cloud-core;2.12.0 from central in [default] com.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default] com.google.cloud#google-cloud-core-http;2.12.0 from central in [default] com.google.cloud#google-cloud-storage;2.20.1 from central in [default] com.google.code.findbugs#jsr305;3.0.2 from central in [default] com.google.code.gson#gson;2.10.1 from central in [default] com.google.errorprone#error_prone_annotations;2.18.0 from central in [default] com.google.guava#failureaccess;1.0.1 from central in [default] com.google.guava#guava;31.1-jre from central in [default] com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default] com.google.http-client#google-http-client;1.43.0 from central in [default] com.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default] com.google.http-client#google-http-client-appengine;1.43.0 from central in [default] com.google.http-client#google-http-client-gson;1.43.0 from central in [default] com.google.http-client#google-http-client-jackson2;1.43.0 from central in [default] com.google.j2objc#j2objc-annotations;1.3 from central in [default] com.google.oauth-client#google-oauth-client;1.34.1 from central in [default] com.google.protobuf#protobuf-java;3.21.12 from central in [default] com.google.protobuf#protobuf-java-util;3.21.12 from central in [default] com.google.re2j#re2j;1.6 from central in [default] com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 from central in [default] com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default] com.microsoft.onnxruntime#onnxruntime;1.15.0 from central in [default] com.navigamez#greex;1.0 from central in [default] com.typesafe#config;1.4.2 from central in [default] 
commons-codec#commons-codec;1.15 from central in [default] commons-logging#commons-logging;1.2 from central in [default] dk.brics.automaton#automaton;1.11-8 from central in [default] io.grpc#grpc-alts;1.53.0 from central in [default] io.grpc#grpc-api;1.53.0 from central in [default] io.grpc#grpc-auth;1.53.0 from central in [default] io.grpc#grpc-context;1.53.0 from central in [default] io.grpc#grpc-core;1.53.0 from central in [default] io.grpc#grpc-googleapis;1.53.0 from central in [default] io.grpc#grpc-grpclb;1.53.0 from central in [default] io.grpc#grpc-netty-shaded;1.53.0 from central in [default] io.grpc#grpc-protobuf;1.53.0 from central in [default] io.grpc#grpc-protobuf-lite;1.53.0 from central in [default] io.grpc#grpc-services;1.53.0 from central in [default] io.grpc#grpc-stub;1.53.0 from central in [default] io.grpc#grpc-xds;1.53.0 from central in [default] io.opencensus#opencensus-api;0.31.1 from central in [default] io.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default] io.opencensus#opencensus-proto;0.2.0 from central in [default] io.perfmark#perfmark-api;0.26.0 from central in [default] it.unimi.dsi#fastutil;7.0.12 from central in [default] javax.annotation#javax.annotation-api;1.3.2 from central in [default] org.apache.hadoop#hadoop-aws;3.2.2 from central in [default] org.checkerframework#checker-qual;3.31.0 from central in [default] org.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default] org.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default] org.projectlombok#lombok;1.16.8 from central in [default] org.rocksdb#rocksdbjni;6.29.5 from central in [default] org.threeten#threetenbp;1.6.5 from central in [default] :: evicted modules: com.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default] com.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] in [default] com.google.code.gson#gson;2.3 by 
[com.google.code.gson#gson;2.10.1] in [default] com.amazonaws#aws-java-sdk-bundle;1.11.563 by [com.amazonaws#aws-java-sdk-bundle;1.11.828] in [default] --------------------------------------------------------------------- | | modules || artifacts | | conf | number| search|dwnlded|evicted|| number|dwnlded| --------------------------------------------------------------------- | default | 77 | 0 | 0 | 4 || 73 | 0 | --------------------------------------------------------------------- :: retrieving :: org.apache.spark#spark-submit-parent-4fcb9748-541d-4d17-a6d5-5e71873aa972 confs: [default] 0 artifacts copied, 73 already retrieved (0kB/54ms) 23/11/23 19:21:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
3.4.0
# Load only the columns we actually use; `usecols` avoids materialising the
# full (wide) CSVs in memory before the projection below.
reviews_df = pd.read_csv("../../data/csv/rotten_tomatoes_movie_reviews.csv",
                         usecols=['id', 'reviewText'])
movie_df = pd.read_csv("../../data/csv/rotten_tomatoes_movies.csv",
                       usecols=['id', 'title'])
# Re-select to pin the column order regardless of the order in the files.
movie_df = movie_df[['id', 'title']]
reviews_df = reviews_df[['id', 'reviewText']]
# left outer join on id: keep every movie, attach its reviews (NaN if none)
merged_df = pd.merge(movie_df, reviews_df, on='id', how='left')
# Displayed null counts per column (notebook cell output); then drop rows
# with missing values — i.e. movies that have no review text.
merged_df.isnull().sum()
merged_df = merged_df.dropna()
def clean_text(text):
    """Normalise a review string for NLP: lowercase it, then strip
    punctuation and digits (whitespace is left untouched)."""
    lowered = text.lower()
    # Drop every character that is neither word-like nor whitespace.
    without_punct = re.sub(r'[^\w\s]', '', lowered)
    # Drop digit runs (underscores and letters survive, matching \w).
    return re.sub(r'\d+', '', without_punct)
# Clean the 'reviewText' column
merged_df['cleanedText'] = merged_df['reviewText'].apply(clean_text)
df = spark.createDataFrame(merged_df)
df = df.cache()
df.rdd.getNumPartitions()
4
# Repartition to 40 to increase parallelism for the NLP pipeline stages.
df = df.repartition(40)
# Confirm the new partition count (cell output: 40).
df.rdd.getNumPartitions()
23/11/23 19:23:42 WARN TaskSetManager: Stage 0 contains a task of very large size (94106 KiB). The maximum recommended task size is 1000 KiB. [Stage 0:> (0 + 4) / 4]
40
# Preview the first 20 rows of the merged + cleaned data.
df.show()
23/11/23 19:23:56 WARN TaskSetManager: Stage 1 contains a task of very large size (94106 KiB). The maximum recommended task size is 1000 KiB. [Stage 1:> (0 + 4) / 4]
+--------------------+--------------------+--------------------+--------------------+ | id| title| reviewText| cleanedText| +--------------------+--------------------+--------------------+--------------------+ | the_escape_2018| The Escape|Sturdy melodrama,...|sturdy melodrama ...| | the_old_guard| The Old Guard|Despite an appeal...|despite an appeal...| | anniversary_party|The Anniversary P...|Everyone gets the...|everyone gets the...| |the_secret_life_o...|The Secret Life o...|The Secret Life o...|the secret life o...| | the_mummy_2017| The Mummy|Some movies feel ...|some movies feel ...| | live_by_night| Live by Night|It's really easy ...|its really easy t...| | neruda_2016| Neruda|One of the most s...|one of the most s...| |the_snowtown_murders|The Snowtown Murders|It's the escalati...|its the escalatin...| | gods_country_2022| God's Country|Higgins and Newto...|higgins and newto...| | mogul_mowgli| Mogul Mowgli|Tariq's embrace o...|tariqs embrace of...| | cats_of_mirikitani|The Cats of Mirik...|It should not hav...|it should not hav...| | sharkwater| Sharkwater|How strange to ga...|how strange to ga...| | he_said_she_said| He Said, She Said|Two likable leads...|two likable leads...| | monsters_ball| Monster's Ball|Please read revie...|please read revie...| | summer_in_february| Summer in February|Blah Britpic offe...|blah britpic offe...| |the_best_exotic_m...|The Best Exotic M...|[Madden directs] ...|madden directs a ...| | millions| Millions|Rich, rewarding, ...|rich rewarding an...| | tully_2018| Tully|An emotionally de...|an emotionally de...| | cop_car| Cop Car|Despite the real,...|despite the real ...| |the_incredible_je...|The Incredible Je...|A comedy centered...|a comedy centered...| +--------------------+--------------------+--------------------+--------------------+ only showing top 20 rows
# Spark NLP entry point: wraps the cleaned text column into 'document'
# annotations that downstream annotators consume.
documentAssembler = DocumentAssembler()\
.setInputCol("cleanedText")\
.setOutputCol("document")
# Regex Tokenizer to break words
tokenizer = Tokenizer() \
.setInputCols(['document']) \
.setOutputCol('token')
# Normalizing and setting case insensitive to be true
normalizer = Normalizer() \
.setInputCols(['token']) \
.setOutputCol('normalized') \
.setLowercase(True)
# Lemmatizing
# LemmatizerModel.pretrained() downloads the default English model
# (run log: lemma_antbnc, ~907.6 KB) on first use.
lemmatizer = LemmatizerModel.pretrained() \
.setInputCols(['normalized']) \
.setOutputCol('lemma')
# finisher converts tokens to human-readable output
# setCleanAnnotations(False) keeps the intermediate annotation columns
# (document/token/normalized/lemma) alongside the finished output.
finisher = Finisher() \
.setInputCols(['lemma']) \
.setCleanAnnotations(False)
lemma_antbnc download started this may take some time. Approximate size to download 907.6 KB [ | ]lemma_antbnc download started this may take some time. Approximate size to download 907.6 KB Download done! Loading the resource. [OK!]
# Chain the stages and run them over the DataFrame. fit() trains/loads the
# pipeline; transform() adds the annotation columns plus 'finished_lemma'.
# NOTE(review): Pipeline is not imported explicitly in this notebook — it
# presumably arrives via one of the wildcard sparknlp imports; an explicit
# `from pyspark.ml import Pipeline` would be clearer. Confirm.
pipeline = Pipeline() \
.setStages([
documentAssembler,
tokenizer,
normalizer,
lemmatizer,
finisher
])
pipelineModel = pipeline.fit(df)
result = pipelineModel.transform(df)
WARNING: An illegal reflective access operation has occurred WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/spark-core_2.12-3.4.0.jar) to field java.util.regex.Pattern.pattern WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$ WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations WARNING: All illegal access operations will be denied in a future release
# Join the array of finished lemmas back into one space-separated string.
result = result.withColumn("final_text", F.concat_ws(" ", "finished_lemma"))
# Preview the transformed DataFrame (first 20 rows).
result.show()
23/11/23 19:25:12 WARN TaskSetManager: Stage 6 contains a task of very large size (94106 KiB). The maximum recommended task size is 1000 KiB. [Stage 6:> (0 + 4) / 4]
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | id| title| reviewText| cleanedText| document| token| normalized| lemma| finished_lemma| final_text| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | the_escape_2018| The Escape|Sturdy melodrama,...|sturdy melodrama ...|[{document, 0, 10...|[{token, 0, 5, st...|[{token, 0, 5, st...|[{token, 0, 5, st...|[sturdy, melodram...|sturdy melodrama ...| | the_old_guard| The Old Guard|Despite an appeal...|despite an appeal...|[{document, 0, 18...|[{token, 0, 6, de...|[{token, 0, 6, de...|[{token, 0, 6, de...|[despite, an, app...|despite an appeal...| | anniversary_party|The Anniversary P...|Everyone gets the...|everyone gets the...|[{document, 0, 13...|[{token, 0, 7, ev...|[{token, 0, 7, ev...|[{token, 0, 7, ev...|[everyone, get, t...|everyone get they...| |the_secret_life_o...|The Secret Life o...|The Secret Life o...|the secret life o...|[{document, 0, 18...|[{token, 0, 2, th...|[{token, 0, 2, th...|[{token, 0, 2, th...|[the, secret, lif...|the secret life o...| | the_mummy_2017| The Mummy|Some movies feel ...|some movies feel ...|[{document, 0, 15...|[{token, 0, 3, so...|[{token, 0, 3, so...|[{token, 0, 3, so...|[some, movie, fee...|some movie feel l...| | live_by_night| Live by Night|It's really easy ...|its really easy t...|[{document, 0, 11...|[{token, 0, 2, it...|[{token, 0, 2, it...|[{token, 0, 2, it...|[it, really, easy...|it really easy to...| | neruda_2016| Neruda|One of the most s...|one of the most s...|[{document, 0, 93...|[{token, 0, 2, on...|[{token, 0, 2, on...|[{token, 0, 2, on...|[one, of, the, mo...|one of the most s...| |the_snowtown_murders|The Snowtown Murders|It's the 
escalati...|its the escalatin...|[{document, 0, 21...|[{token, 0, 2, it...|[{token, 0, 2, it...|[{token, 0, 2, it...|[it, the, escalat...|it the escalate s...| | gods_country_2022| God's Country|Higgins and Newto...|higgins and newto...|[{document, 0, 80...|[{token, 0, 6, hi...|[{token, 0, 6, hi...|[{token, 0, 6, hi...|[higgins, and, ne...|higgins and newto...| | mogul_mowgli| Mogul Mowgli|Tariq's embrace o...|tariqs embrace of...|[{document, 0, 16...|[{token, 0, 5, ta...|[{token, 0, 5, ta...|[{token, 0, 5, ta...|[tariqs, embrace,...|tariqs embrace of...| | cats_of_mirikitani|The Cats of Mirik...|It should not hav...|it should not hav...|[{document, 0, 88...|[{token, 0, 1, it...|[{token, 0, 1, it...|[{token, 0, 1, it...|[it, should, not,...|it should not hav...| | sharkwater| Sharkwater|How strange to ga...|how strange to ga...|[{document, 0, 13...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[how, strange, to...|how strange to ga...| | he_said_she_said| He Said, She Said|Two likable leads...|two likable leads...|[{document, 0, 53...|[{token, 0, 2, tw...|[{token, 0, 2, tw...|[{token, 0, 2, tw...|[two, likable, le...|two likable lead ...| | monsters_ball| Monster's Ball|Please read revie...|please read revie...|[{document, 0, 50...|[{token, 0, 5, pl...|[{token, 0, 5, pl...|[{token, 0, 5, pl...|[please, read, re...|please read revie...| | summer_in_february| Summer in February|Blah Britpic offe...|blah britpic offe...|[{document, 0, 42...|[{token, 0, 3, bl...|[{token, 0, 3, bl...|[{token, 0, 3, bl...|[blah, britpic, o...|blah britpic offe...| |the_best_exotic_m...|The Best Exotic M...|[Madden directs] ...|madden directs a ...|[{document, 0, 17...|[{token, 0, 5, ma...|[{token, 0, 5, ma...|[{token, 0, 5, ma...|[madden, direct, ...|madden direct a t...| | millions| Millions|Rich, rewarding, ...|rich rewarding an...|[{document, 0, 42...|[{token, 0, 3, ri...|[{token, 0, 3, ri...|[{token, 0, 3, ri...|[rich, reward, an...|rich reward and d...| | 
tully_2018| Tully|An emotionally de...|an emotionally de...|[{document, 0, 16...|[{token, 0, 1, an...|[{token, 0, 1, an...|[{token, 0, 1, an...|[an, emotionally,...|an emotionally de...| | cop_car| Cop Car|Despite the real,...|despite the real ...|[{document, 0, 14...|[{token, 0, 6, de...|[{token, 0, 6, de...|[{token, 0, 6, de...|[despite, the, re...|despite the real ...| |the_incredible_je...|The Incredible Je...|A comedy centered...|a comedy centered...|[{document, 0, 23...|[{token, 0, 0, a,...|[{token, 0, 0, a,...|[{token, 0, 0, a,...|[a, comedy, cente...|a comedy center a...| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ only showing top 20 rows
# Persist the processed reviews to S3 as parquet via the s3a connector.
result.write.parquet("s3a://project-group34/project/sentiment_analysis/cleaned_reviews/")
23/11/23 19:26:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties 23/11/23 19:26:08 WARN TaskSetManager: Stage 9 contains a task of very large size (94106 KiB). The maximum recommended task size is 1000 KiB.
# restart kernel
# NOTE(review): presumably restarts to release the previous Spark driver JVM
# before a new session is created below — confirm this is intentional.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
import pyspark.sql.functions as F
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Re-create the local Spark session after the kernel restart (same settings
# as the first session above).
#
# FIX: Hadoop/S3A options set through SparkSession.builder.config() must be
# prefixed with "spark.hadoop." — without the prefix Spark discards the key
# (the run log showed "Warning: Ignoring non-Spark config property:
# fs.s3a.aws.credentials.provider"), and the S3A connector silently falls
# back to its default credential chain.
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory", "16G")\
    .config("spark.executor.memory", "12g")\
    .config("spark.executor.cores", "3")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")\
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider"
    )\
    .getOrCreate()
print(spark.version)
Warning: Ignoring non-Spark config property: fs.s3a.aws.credentials.provider
:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2/cache The jars for the packages stored in: /root/.ivy2/jars com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency org.apache.hadoop#hadoop-aws added as a dependency :: resolving dependencies :: org.apache.spark#spark-submit-parent-e8291c0b-a674-4016-9c62-cb05c896f65d;1.0 confs: [default] found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central found com.typesafe#config;1.4.2 in central found org.rocksdb#rocksdbjni;6.29.5 in central found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central found com.github.universal-automata#liblevenshtein;3.0.0 in central found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central found com.google.code.gson#gson;2.3 in central found it.unimi.dsi#fastutil;7.0.12 in central found org.projectlombok#lombok;1.16.8 in central found com.google.cloud#google-cloud-storage;2.20.1 in central found com.google.guava#guava;31.1-jre in central found com.google.guava#failureaccess;1.0.1 in central found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central found com.google.errorprone#error_prone_annotations;2.18.0 in central found com.google.j2objc#j2objc-annotations;1.3 in central found com.google.http-client#google-http-client;1.43.0 in central found io.opencensus#opencensus-contrib-http-util;0.31.1 in central found com.google.http-client#google-http-client-jackson2;1.43.0 in central found com.google.http-client#google-http-client-gson;1.43.0 in central found com.google.api-client#google-api-client;2.2.0 in central found commons-codec#commons-codec;1.15 in central found com.google.oauth-client#google-oauth-client;1.34.1 in central found com.google.http-client#google-http-client-apache-v2;1.43.0 in central found com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central found com.google.code.gson#gson;2.10.1 in central found com.google.cloud#google-cloud-core;2.12.0 in central 
found io.grpc#grpc-context;1.53.0 in central found com.google.auto.value#auto-value-annotations;1.10.1 in central found com.google.auto.value#auto-value;1.10.1 in central found javax.annotation#javax.annotation-api;1.3.2 in central found commons-logging#commons-logging;1.2 in central found com.google.cloud#google-cloud-core-http;2.12.0 in central found com.google.http-client#google-http-client-appengine;1.43.0 in central found com.google.api#gax-httpjson;0.108.2 in central found com.google.cloud#google-cloud-core-grpc;2.12.0 in central found io.grpc#grpc-alts;1.53.0 in central found io.grpc#grpc-grpclb;1.53.0 in central found org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central found io.grpc#grpc-auth;1.53.0 in central found io.grpc#grpc-protobuf;1.53.0 in central found io.grpc#grpc-protobuf-lite;1.53.0 in central found io.grpc#grpc-core;1.53.0 in central found com.google.api#gax;2.23.2 in central found com.google.api#gax-grpc;2.23.2 in central found com.google.auth#google-auth-library-credentials;1.16.0 in central found com.google.auth#google-auth-library-oauth2-http;1.16.0 in central found com.google.api#api-common;2.6.2 in central found io.opencensus#opencensus-api;0.31.1 in central found com.google.api.grpc#proto-google-iam-v1;1.9.2 in central found com.google.protobuf#protobuf-java;3.21.12 in central found com.google.protobuf#protobuf-java-util;3.21.12 in central found com.google.api.grpc#proto-google-common-protos;2.14.2 in central found org.threeten#threetenbp;1.6.5 in central found com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central found com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central found com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central found com.fasterxml.jackson.core#jackson-core;2.14.2 in central found com.google.code.findbugs#jsr305;3.0.2 in central found io.grpc#grpc-api;1.53.0 in central found io.grpc#grpc-stub;1.53.0 in central found org.checkerframework#checker-qual;3.31.0 
in central found io.perfmark#perfmark-api;0.26.0 in central found com.google.android#annotations;4.1.1.4 in central found org.codehaus.mojo#animal-sniffer-annotations;1.22 in central found io.opencensus#opencensus-proto;0.2.0 in central found io.grpc#grpc-services;1.53.0 in central found com.google.re2j#re2j;1.6 in central found io.grpc#grpc-netty-shaded;1.53.0 in central found io.grpc#grpc-googleapis;1.53.0 in central found io.grpc#grpc-xds;1.53.0 in central found com.navigamez#greex;1.0 in central found dk.brics.automaton#automaton;1.11-8 in central found com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central found com.microsoft.onnxruntime#onnxruntime;1.15.0 in central found org.apache.hadoop#hadoop-aws;3.2.2 in central :: resolution report :: resolve 4147ms :: artifacts dl 517ms :: modules in use: com.amazonaws#aws-java-sdk-bundle;1.11.828 from central in [default] com.fasterxml.jackson.core#jackson-core;2.14.2 from central in [default] com.github.universal-automata#liblevenshtein;3.0.0 from central in [default] com.google.android#annotations;4.1.1.4 from central in [default] com.google.api#api-common;2.6.2 from central in [default] com.google.api#gax;2.23.2 from central in [default] com.google.api#gax-grpc;2.23.2 from central in [default] com.google.api#gax-httpjson;0.108.2 from central in [default] com.google.api-client#google-api-client;2.2.0 from central in [default] com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default] com.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default] com.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default] com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default] com.google.auth#google-auth-library-credentials;1.16.0 from central in [default] 
com.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default] com.google.auto.value#auto-value;1.10.1 from central in [default] com.google.auto.value#auto-value-annotations;1.10.1 from central in [default] com.google.cloud#google-cloud-core;2.12.0 from central in [default] com.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default] com.google.cloud#google-cloud-core-http;2.12.0 from central in [default] com.google.cloud#google-cloud-storage;2.20.1 from central in [default] com.google.code.findbugs#jsr305;3.0.2 from central in [default] com.google.code.gson#gson;2.10.1 from central in [default] com.google.errorprone#error_prone_annotations;2.18.0 from central in [default] com.google.guava#failureaccess;1.0.1 from central in [default] com.google.guava#guava;31.1-jre from central in [default] com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default] com.google.http-client#google-http-client;1.43.0 from central in [default] com.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default] com.google.http-client#google-http-client-appengine;1.43.0 from central in [default] com.google.http-client#google-http-client-gson;1.43.0 from central in [default] com.google.http-client#google-http-client-jackson2;1.43.0 from central in [default] com.google.j2objc#j2objc-annotations;1.3 from central in [default] com.google.oauth-client#google-oauth-client;1.34.1 from central in [default] com.google.protobuf#protobuf-java;3.21.12 from central in [default] com.google.protobuf#protobuf-java-util;3.21.12 from central in [default] com.google.re2j#re2j;1.6 from central in [default] com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 from central in [default] com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default] com.microsoft.onnxruntime#onnxruntime;1.15.0 from central in [default] com.navigamez#greex;1.0 from central in [default] com.typesafe#config;1.4.2 from central in [default] 
commons-codec#commons-codec;1.15 from central in [default] commons-logging#commons-logging;1.2 from central in [default] dk.brics.automaton#automaton;1.11-8 from central in [default] io.grpc#grpc-alts;1.53.0 from central in [default] io.grpc#grpc-api;1.53.0 from central in [default] io.grpc#grpc-auth;1.53.0 from central in [default] io.grpc#grpc-context;1.53.0 from central in [default] io.grpc#grpc-core;1.53.0 from central in [default] io.grpc#grpc-googleapis;1.53.0 from central in [default] io.grpc#grpc-grpclb;1.53.0 from central in [default] io.grpc#grpc-netty-shaded;1.53.0 from central in [default] io.grpc#grpc-protobuf;1.53.0 from central in [default] io.grpc#grpc-protobuf-lite;1.53.0 from central in [default] io.grpc#grpc-services;1.53.0 from central in [default] io.grpc#grpc-stub;1.53.0 from central in [default] io.grpc#grpc-xds;1.53.0 from central in [default] io.opencensus#opencensus-api;0.31.1 from central in [default] io.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default] io.opencensus#opencensus-proto;0.2.0 from central in [default] io.perfmark#perfmark-api;0.26.0 from central in [default] it.unimi.dsi#fastutil;7.0.12 from central in [default] javax.annotation#javax.annotation-api;1.3.2 from central in [default] org.apache.hadoop#hadoop-aws;3.2.2 from central in [default] org.checkerframework#checker-qual;3.31.0 from central in [default] org.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default] org.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default] org.projectlombok#lombok;1.16.8 from central in [default] org.rocksdb#rocksdbjni;6.29.5 from central in [default] org.threeten#threetenbp;1.6.5 from central in [default] :: evicted modules: com.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default] com.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] in [default] com.google.code.gson#gson;2.3 by 
[com.google.code.gson#gson;2.10.1] in [default] com.amazonaws#aws-java-sdk-bundle;1.11.563 by [com.amazonaws#aws-java-sdk-bundle;1.11.828] in [default] --------------------------------------------------------------------- | | modules || artifacts | | conf | number| search|dwnlded|evicted|| number|dwnlded| --------------------------------------------------------------------- | default | 77 | 0 | 0 | 4 || 73 | 0 | --------------------------------------------------------------------- :: retrieving :: org.apache.spark#spark-submit-parent-e8291c0b-a674-4016-9c62-cb05c896f65d confs: [default] 0 artifacts copied, 73 already retrieved (0kB/169ms) 23/11/23 22:09:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
3.4.0
# Load the cleaned review data produced by the preprocessing stage.
cleaned_reviews_path = "s3a://project-group34/project/sentiment_analysis/cleaned_reviews/"
result = spark.read.parquet(cleaned_reviews_path)
23/11/23 22:09:36 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
# Sanity-check: preview the first five cleaned reviews.
result.show(n=5)
[Stage 1:> (0 + 1) / 1]
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | id| title| reviewText| cleanedText| document| token| normalized| lemma| finished_lemma| final_text| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | i_do_until_i_dont|I Do... Until I D...|If you enjoy watc...|if you enjoy watc...|[{document, 0, 59...|[{token, 0, 1, if...|[{token, 0, 1, if...|[{token, 0, 1, if...|[if, you, enjoy, ...|if you enjoy watc...| |1009716-hobsons_c...| Hobson's Choice|So much of what m...|so much of what m...|[{document, 0, 14...|[{token, 0, 1, so...|[{token, 0, 1, so...|[{token, 0, 1, so...|[so, much, of, wh...|so much of what m...| | the_bubble_2022| The Bubble|The fact that The...|the fact that the...|[{document, 0, 19...|[{token, 0, 2, th...|[{token, 0, 2, th...|[{token, 0, 2, th...|[the, fact, that,...|the fact that the...| | bright_young_things| Bright Young Things|If there was ever...|if there was ever...|[{document, 0, 12...|[{token, 0, 1, if...|[{token, 0, 1, if...|[{token, 0, 1, if...|[if, there, be, e...|if there be ever ...| | la_cage_aux_folles| Birds of a Feather|How The Birdcage ...|how the birdcage ...|[{document, 0, 51...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[how, the, birdca...|how the birdcage ...| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ only showing top 5 rows
# Wrap the lemmatized text into Spark NLP `document` annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("final_text")
    .setOutputCol("document")
)

# Local paths to pre-downloaded models (avoids pulling them from the hub).
tfhub_use_path = "../../../cache_pretrained/tfhub_use_en_2.4.0_2.4_1587136330099/"
sentimentdl_use_twitter_path = "../../../cache_pretrained/sentimentdl_use_twitter_en_2.7.1_2.4_1610983524713/"

# Universal Sentence Encoder: document -> sentence embeddings.
use = (
    UniversalSentenceEncoder.load(tfhub_use_path)
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# SentimentDL model (trained on Twitter data): embeddings -> sentiment label.
sentimentdl = (
    SentimentDLModel.load(sentimentdl_use_twitter_path)
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

# Assemble the three annotators into a single pipeline.
pipeline_stages = [documentAssembler, use, sentimentdl]
nlpPipeline = Pipeline(stages=pipeline_stages)
2023-11-23 22:10:09.324275: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-11-23 22:10:13.686569: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. 2023-11-23 22:10:13.733440: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. 2023-11-23 22:10:13.780734: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. 2023-11-23 22:10:13.825948: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. 2023-11-23 22:10:13.875391: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory. WARNING: An illegal reflective access operation has occurred WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/spark-core_2.12-3.4.0.jar) to field java.lang.ref.Reference.referent WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$ WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations WARNING: All illegal access operations will be denied in a future release
# use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
# .setInputCols(["document"])\
# .setOutputCol("sentence_embeddings")
# sentimentdl = SentimentDLModel.pretrained(name="sentimentdl_use_twitter", lang="en")\
# .setInputCols(["sentence_embeddings"])\
# .setOutputCol("sentiment")
# # sentimentdl1 = ClassifierDLModel.pretrained(name="classifierdl_use_emotion")\
# # .setInputCols(["sentence_embeddings"])\
# # .setOutputCol("sentiment_emotion")
# Fit the pipeline (the stages here are pre-trained, so fitting mostly wires
# the annotators together) and annotate every review with a sentiment label.
sentiment_model = nlpPipeline.fit(result)
result = sentiment_model.transform(result)
result.show(n=5)
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | id| title| reviewText| cleanedText| document| token| normalized| lemma| finished_lemma| final_text| sentence_embeddings| sentiment| +--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ | i_do_until_i_dont|I Do... Until I D...|If you enjoy watc...|if you enjoy watc...|[{document, 0, 52...|[{token, 0, 1, if...|[{token, 0, 1, if...|[{token, 0, 1, if...|[if, you, enjoy, ...|if you enjoy watc...|[{sentence_embedd...|[{category, 0, 52...| |1009716-hobsons_c...| Hobson's Choice|So much of what m...|so much of what m...|[{document, 0, 13...|[{token, 0, 1, so...|[{token, 0, 1, so...|[{token, 0, 1, so...|[so, much, of, wh...|so much of what m...|[{sentence_embedd...|[{category, 0, 13...| | the_bubble_2022| The Bubble|The fact that The...|the fact that the...|[{document, 0, 19...|[{token, 0, 2, th...|[{token, 0, 2, th...|[{token, 0, 2, th...|[the, fact, that,...|the fact that the...|[{sentence_embedd...|[{category, 0, 19...| | bright_young_things| Bright Young Things|If there was ever...|if there was ever...|[{document, 0, 11...|[{token, 0, 1, if...|[{token, 0, 1, if...|[{token, 0, 1, if...|[if, there, be, e...|if there be ever ...|[{sentence_embedd...|[{category, 0, 11...| | la_cage_aux_folles| Birds of a Feather|How The Birdcage ...|how the birdcage ...|[{document, 0, 46...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[{token, 0, 2, ho...|[how, the, birdca...|how the birdcage ...|[{sentence_embedd...|[{category, 0, 46...| 
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ only showing top 5 rows
# Persist the annotated reviews back to S3, replacing any previous run.
result.write.mode("overwrite").parquet(
    "s3a://project-group34/project/sentiment_analysis/sentiment_extracted/"
)
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
import pyspark.sql.functions as F
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Local-mode session sized for the notebook instance, with the Spark NLP and
# S3A connector jars on the classpath; AWS credentials come from the
# container role (SageMaker).
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.executor.memory", "12g")
    .config("spark.executor.cores", "3")
    .config("spark.jars.packages",
            "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")
    .config("fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.ContainerCredentialsProvider")
    .getOrCreate()
)
print(spark.version)
3.4.0
# Reload the sentiment-annotated reviews written by the previous session.
sentiment_extracted_path = "s3a://project-group34/project/sentiment_analysis/sentiment_extracted/"
result = spark.read.parquet(sentiment_extracted_path)
23/11/23 22:55:13 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
# Flatten the Spark NLP `sentiment` annotations into plain columns.
#
# `sentiment.result` holds the predicted label and `sentiment.metadata` the
# per-class confidence scores; `arrays_zip` keeps each label aligned with its
# scores when the array is exploded (one output row per annotation).
#
# Cleaned up from the original: one chained select instead of two passes,
# `F.col` instead of the non-idiomatic `F.expr` for bare column references,
# and no redundant re-import of `col` (already imported at the top of the
# session).
result = result.select(
    F.explode(F.arrays_zip("sentiment.result", "sentiment.metadata")).alias("cols"),
    F.col("title").alias("title"),
    F.col("cleanedText").alias("text"),
).select(
    F.col("cols.result").alias("sentiment"),
    F.col("cols.metadata")["positive"].alias("positive_score"),
    F.col("cols.metadata")["negative"].alias("negative_score"),
    "title",
    "text",
)

# Show the first few rows of the flattened DataFrame.
result.show(5)
[Stage 1:> (0 + 1) / 1]
+---------+--------------+--------------+--------------------+--------------------+ |sentiment|positive_score|negative_score| title| text| +---------+--------------+--------------+--------------------+--------------------+ | positive| 1.0| 0.0|I Do... Until I D...|if you enjoy watc...| | positive| 0.8947771| 0.10522288| Hobson's Choice|so much of what m...| | positive| 0.77628094| 0.22371912| The Bubble|the fact that the...| | positive| 1.0| 0.0| Bright Young Things|if there was ever...| | positive| 1.0| 0.0| Birds of a Feather|how the birdcage ...| +---------+--------------+--------------+--------------------+--------------------+ only showing top 5 rows
# Save the flattened sentiment table for downstream aggregation.
result.write.mode("overwrite").parquet(
    "s3a://project-group34/project/sentiment_analysis/sentiment_processed/"
)
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
import pyspark.sql.functions as F
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Session for the aggregation stage (same settings as the earlier sessions):
# local mode, Spark NLP + hadoop-aws jars, container-provided AWS credentials.
_builder = SparkSession.builder.appName("Spark NLP").master("local[*]")
for _key, _value in {
    "spark.driver.memory": "16G",
    "spark.executor.memory": "12g",
    "spark.executor.cores": "3",
    "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2",
    "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.ContainerCredentialsProvider",
}.items():
    _builder = _builder.config(_key, _value)
spark = _builder.getOrCreate()
print(spark.version)
3.4.0
# Reload the per-review sentiment scores.
result = spark.read.parquet("s3a://project-group34/project/sentiment_analysis/sentiment_processed/")

# Collapse to one row per movie: mean positive/negative confidence plus the
# number of reviews contributing to each mean.
result = result.groupBy("title").agg(
    F.avg("positive_score").alias("average_positive_score"),
    F.avg("negative_score").alias("average_negative_score"),
    F.count("positive_score").alias("num_reviews"),
)

result.show(n=5)
[Stage 29:===========================================> (3 + 1) / 4]
+--------------------+----------------------+----------------------+-----------+ | title|average_positive_score|average_negative_score|num_reviews| +--------------------+----------------------+----------------------+-----------+ | Kin| 0.7215052546283167| 0.2784947445187336| 98| |Taking Sides: Le ...| 0.8364778709963958| 0.1635221320418551| 48| | How to Deal| 0.6003653085125116| 0.3996346922415671| 90| | On My Way| 0.8729917092682926| 0.12700829512315298| 41| | Black Nativity| 0.8128262822475695| 0.1871737217950096| 90| +--------------------+----------------------+----------------------+-----------+ only showing top 5 rows
# Persist the per-movie averages.
result.write.mode("overwrite").parquet(
    "s3a://project-group34/project/sentiment_analysis/average_scores_per_movie/"
)

# NOTE(review): this shadows the builtins `min`/`max` for the rest of the
# session — fine here since later cells don't use them.
from pyspark.sql.functions import mean, median, min, max

# Summary statistics over the per-movie table. `median` is a built-in
# aggregate from Spark 3.4 onward, which is what this notebook runs.
stats = result.agg(
    mean(col("average_positive_score")).alias("mean_positive"),
    mean(col("average_negative_score")).alias("mean_negative"),
    median(col("num_reviews")).alias("median_num_reviews"),
    min(col("num_reviews")).alias("min_num_reviews"),
    max(col("num_reviews")).alias("max_num_reviews"),
)
stats.show()
[Stage 68:=============================> (2 + 2) / 4]
+------------------+-------------------+------------------+---------------+---------------+ | mean_positive| mean_negative|median_num_reviews|min_num_reviews|max_num_reviews| +------------------+-------------------+------------------+---------------+---------------+ |0.7525733910082344|0.24742646776123378| 4.0| 1| 1921| +------------------+-------------------+------------------+---------------+---------------+
# Count how many movies clear the review-volume threshold.
#
# Bug fix: the original variable name and printed message claimed "fewer than
# 4 reviews", but the filter actually selects movies with MORE than 10
# reviews. The name, message, and threshold constant now agree with the
# computation.
min_reviews = 10
num_movies_above_threshold = result.filter(col("num_reviews") > min_reviews).count()
print(f"Number of movies with more than {min_reviews} reviews: {num_movies_above_threshold}")
[Stage 62:=============================> (2 + 2) / 4]
Number of movies with fewer than 4 reviews: 19794
# Weight each movie's average scores by its review count, so titles with more
# reviews carry proportionally larger totals.
review_count = col("num_reviews")
result = (
    result
    .withColumn("normalized_positive_score", col("average_positive_score") * review_count)
    .withColumn("normalized_negative_score", col("average_negative_score") * review_count)
)
result.show(n=5)
[Stage 35:===========================================> (3 + 1) / 4]
+--------------------+----------------------+----------------------+-----------+-------------------------+-------------------------+ | title|average_positive_score|average_negative_score|num_reviews|normalized_positive_score|normalized_negative_score| +--------------------+----------------------+----------------------+-----------+-------------------------+-------------------------+ | Kin| 0.7215052546283167| 0.2784947445187336| 98| 70.70751495357504| 27.29248496283589| |Taking Sides: Le ...| 0.8364778709963958| 0.1635221320418551| 48| 40.150937807827| 7.849062338009045| | How to Deal| 0.6003653085125116| 0.3996346922415671| 90| 54.03287776612604| 35.96712230174104| | On My Way| 0.8729917092682926| 0.12700829512315298| 41| 35.79266008| 5.207340100049272| | Black Nativity| 0.8128262822475695| 0.1871737217950096| 90| 73.15436540228126| 16.845634961550864| +--------------------+----------------------+----------------------+-----------+-------------------------+-------------------------+ only showing top 5 rows
from pyspark.sql.functions import col

# Bayesian smoothing of per-movie averages (IMDB-style weighted rating):
#     WR = v/(v+m) * R  +  m/(v+m) * C
# where v = num_reviews, R = the movie's own average score, C = the global
# mean score (prior), and m = the prior strength (median review count).
C_positive = 0.7525733910082344  # Mean of average_positive_score (from stats above)
C_negative = 0.24742646776123378  # Mean of average_negative_score (from stats above)
m = 4  # Median number of reviews, can adjust as needed


def _weighted_rating(score_col, prior_mean):
    """Return the Bayesian weighted-rating Column for `score_col`.

    Blends the movie's own average with the global prior `prior_mean`,
    trusting the movie's average more as `num_reviews` grows.
    """
    v = col("num_reviews")
    return (v / (v + m)) * col(score_col) + (m / (v + m)) * prior_mean


# Same formula applied to both polarities (deduplicates the original
# copy-pasted expressions).
positive_weighted_rating_df = result.withColumn(
    "weighted_rating", _weighted_rating("average_positive_score", C_positive)
)
negative_weighted_rating_df = result.withColumn(
    "weighted_rating", _weighted_rating("average_negative_score", C_negative)
)
# Twenty most positively reviewed movies by smoothed weighted rating.
top_20_positive = positive_weighted_rating_df.orderBy(col("weighted_rating").desc()).limit(20)
top_20_positive.select("title", "weighted_rating").show()
[Stage 77:=============================> (2 + 2) / 4]
+--------------------+------------------+ | title| weighted_rating| +--------------------+------------------+ |Jim Allison: Brea...|0.9809671839237104| |Hava Nagila (The ...|0.9789505983669412| | Trifling Women|0.9786921542506863| |Blinky Bill the M...|0.9776142600840194| |Molly's Theory of...|0.9775049055462031| |Blind Willow, Sle...|0.9725081545564704| |California Typewr...|0.9717155569098395| | Half Magic| 0.969509225369288| | Things Never Said|0.9682975731008235| | I'm Leaving Now|0.9682241038760293| | The Farthest|0.9680655310978367| |Bill Cunningham N...|0.9677857000523757| |Richard Linklater...|0.9676412249723696| | Breaking Fast|0.9675419343236431| | Moonage Daydream|0.9661923959624714| | Suzi Q|0.9661492989421452| |Salvatore: Shoema...|0.9657396017786927| |Is the Man Who Is...|0.9652498774769747| |The Eyes of Orson...| 0.964369877480659| | Gentleman Jim| 0.963343549038257| +--------------------+------------------+
# Twenty most negatively reviewed movies by smoothed weighted rating.
top_20_negative = negative_weighted_rating_df.orderBy(col("weighted_rating").desc()).limit(20)
top_20_negative.select("title", "weighted_rating").show()
[Stage 89:=============================> (2 + 2) / 4]
+--------------------+------------------+ | title| weighted_rating| +--------------------+------------------+ |Dinner With the P...|0.8108478919403085| | Fangs|0.7490668759204112| | Elephant Tales|0.7477467559204113| | Arisaka| 0.745482372587078| | The Phantom Planet|0.7426795559204112| | 12 in a Box|0.6973159521044934| | Audrie & Daisy|0.6911893001508967| |A Tree of Life: T...|0.6902592416188412| | Tears of Gaza| 0.689034361316461| | Moscow Zero|0.6797639392537446| | Of Fish and Foe|0.6789700581044935| | 37|0.6702393639265335| | Gut Renovation|0.6672347836460667| | Bloody Birthday| 0.666751160094994| | Cries From Syria|0.6652684999080672| | American Relapse|0.6638483609204112| |The Bad News Bear...| 0.660546783256329| | May Fools|0.6593032419403084| | Private Violence|0.6576097419403084| | Radioland Murders|0.6575381655495395| +--------------------+------------------+
# Collect both leaderboards to the driver as pandas DataFrames (safe here:
# each is capped at 20 rows by the limit() above).
top_20_positive_pd = top_20_positive.toPandas()
top_20_negative_pd = top_20_negative.toPandas()
# Export to CSV for the report/visualization step.
top_20_positive_pd.to_csv("../../data/csv/20_positive_movieReviews.csv", index = False)
top_20_negative_pd.to_csv("../../data/csv/20_negative_movieReviews.csv", index = False)