bucket_name = "sk2224-projectdata"
!aws s3 mb s3://{bucket_name}
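Note: "aws s3 mb" errors out if the bucket already exists. An optional idempotent alternative, sketched with boto3 (assuming the notebook role is allowed to create buckets):
import boto3
from botocore.exceptions import ClientError
s3 = boto3.client("s3")
try:
    s3.head_bucket(Bucket=bucket_name)    # succeeds if the bucket exists and is accessible
except ClientError:
    s3.create_bucket(Bucket=bucket_name)  # otherwise create it (us-east-1; other regions need a LocationConstraint)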
# Setup - Run only once per Kernel App
%conda install openjdk -y
# install PySpark
%pip install pyspark==3.4.0
# install spark-nlp
%pip install spark-nlp==5.1.3
# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
# Import pyspark and build Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Spark NLP") \
    .master("local[*]") \
    .config("spark.driver.memory", "16G") \
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2") \
    .config(
        # The "spark.hadoop." prefix is required for the property to reach
        # Hadoop's S3A configuration; without it Spark ignores the key
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    ) \
    .getOrCreate()
(Ivy dependency-resolution log truncated: com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 and org.apache.hadoop#hadoop-aws;3.2.2 resolved from Maven Central with their transitive dependencies; 73 artifacts, 0 downloaded, 73 already retrieved.)
23/11/21 01:33:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
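As an optional sanity check, confirm the session picked up the pinned versions before reading any data:
print(f"Spark version: {spark.version}")           # expected: 3.4.0
print(f"Spark NLP version: {sparknlp.version()}")  # expected: 5.1.3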
%%time
bucket = "sk2224-projectdata"
session = sagemaker.Session()
output_prefix_submissions = "submissions/suggestions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_submissions}"
print(f"reading submissions from {s3_path}")
submissions = spark.read.parquet(s3_path)  # parquet stores its own schema; the CSV 'header' option does not apply
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading submissions from s3a://sk2224-projectdata/submissions/suggestions/yyyy=*
23/11/21 01:33:53 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
CPU times: user 267 ms, sys: 15.4 ms, total: 283 ms
Wall time: 8.51 s
23/11/21 01:34:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
submissions.printSchema()
root
 |-- adserver_click_url: string (nullable = true)
 |-- adserver_imp_pixel: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- brand_safe: boolean (nullable = true)
 |-- contest_mode: boolean (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- crosspost_parent: string (nullable = true)
 |-- crosspost_parent_list: array (nullable = true)
 |-- disable_comments: boolean (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- domain_override: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- embed_type: string (nullable = true)
 |-- embed_url: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- hidden: boolean (nullable = true)
 |-- hide_score: boolean (nullable = true)
 |-- href_url: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imp_pixel: string (nullable = true)
 |-- is_crosspostable: boolean (nullable = true)
 |-- is_reddit_media_domain: boolean (nullable = true)
 |-- is_self: boolean (nullable = true)
 |-- is_video: boolean (nullable = true)
 |-- link_flair_css_class: string (nullable = true)
 |-- link_flair_text: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- media: struct (nullable = true)
 |-- media_embed: struct (nullable = true)
 |-- mobile_ad_url: string (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- num_crossposts: long (nullable = true)
 |-- original_link: string (nullable = true)
 |-- over_18: boolean (nullable = true)
 |-- parent_whitelist_status: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- pinned: boolean (nullable = true)
 |-- post_hint: string (nullable = true)
 |-- preview: struct (nullable = true)
 |-- promoted: boolean (nullable = true)
 |-- promoted_by: string (nullable = true)
 |-- promoted_display_name: string (nullable = true)
 |-- promoted_url: string (nullable = true)
 |-- retrieved_on: timestamp (nullable = true)
 |-- score: long (nullable = true)
 |-- secure_media: struct (nullable = true)
 |-- secure_media_embed: struct (nullable = true)
 |-- selftext: string (nullable = true)
 |-- spoiler: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- suggested_sort: string (nullable = true)
 |-- third_party_trackers: array (nullable = true)
 |-- third_party_tracking: string (nullable = true)
 |-- third_party_tracking_2: string (nullable = true)
 |-- thumbnail: string (nullable = true)
 |-- thumbnail_height: long (nullable = true)
 |-- thumbnail_width: long (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- whitelist_status: string (nullable = true)
(nested fields of crosspost_parent_list, media, media_embed, preview, secure_media, and secure_media_embed elided)
# Filter out rows where 'selftext', 'title', or 'author' is '[deleted]' or '[removed]'
submissions_filtered = submissions.filter(
    (submissions.selftext != '[deleted]') & (submissions.selftext != '[removed]')
    & (submissions.author != '[deleted]') & (submissions.author != '[removed]')
    & (submissions.title != '[deleted]') & (submissions.title != '[removed]')
)
# Keep only the columns needed for the analysis
submissions_filtered = submissions_filtered.select("subreddit", "author", "title", "selftext",
"created_utc", "num_comments", "score",
"over_18", "media", "pinned", "locked",
"disable_comments", "domain", "hidden",
"distinguished", "hide_score")
submissions_filtered.show(5)
+----------------+-----------------+--------------------+--------------------+-------------------+------------+-----+-------+-----+------+------+----------------+--------------------+------+-------------+----------+
|       subreddit|           author|               title|            selftext|        created_utc|num_comments|score|over_18|media|pinned|locked|disable_comments|              domain|hidden|distinguished|hide_score|
+----------------+-----------------+--------------------+--------------------+-------------------+------------+-----+-------+-----+------+------+----------------+--------------------+------+-------------+----------+
|    Animesuggest|      RektoriusYT|Never watched mec...|So basically for ...|2021-02-18 15:46:14|          12|    3|  false| null| false| false|            null|   self.Animesuggest| false|         null|     false|
|    Animesuggest|           bff_op|Anime like Highsc...|Hello I need sugg...|2021-02-18 15:50:06|           6|    1|  false| null| false| false|            null|   self.Animesuggest| false|         null|     false|
|MovieSuggestions|    scottymac0707|        Blockbusters|Drop your best bl...|2021-02-18 15:50:53|           8|    3|  false| null| false| false|            null|self.MovieSuggest...| false|         null|     false|
|MovieSuggestions|       stone78221|Family Movies lik...|I like Conviction...|2021-02-18 15:51:41|           4|    1|  false| null| false| false|            null|self.MovieSuggest...| false|         null|     false|
|MovieSuggestions|Mighty_Dragon_001|Looking for movie...|I really really l...|2021-02-18 15:54:03|           5|    4|  false| null| false| false|            null|self.MovieSuggest...| false|         null|     false|
+----------------+-----------------+--------------------+--------------------+-------------------+------------+-----+-------+-----+------+------+----------------+--------------------+------+-------------+----------+
only showing top 5 rows
from pyspark.sql.functions import concat_ws
# Combine 'title' and 'selftext' columns into a new column 'RedditText'
submissions_combined = submissions_filtered.withColumn(
"RedditText", concat_ws(" ", "title", "selftext")
)
# Show the first 5 rows of the DataFrame with the new column
submissions_combined.select("subreddit", "author", "RedditText", "created_utc", "num_comments", "score").show(5)
+----------------+-----------------+--------------------+-------------------+------------+-----+
|       subreddit|           author|          RedditText|        created_utc|num_comments|score|
+----------------+-----------------+--------------------+-------------------+------------+-----+
|    Animesuggest|      RektoriusYT|Never watched mec...|2021-02-18 15:46:14|          12|    3|
|    Animesuggest|           bff_op|Anime like Highsc...|2021-02-18 15:50:06|           6|    1|
|MovieSuggestions|    scottymac0707|Blockbusters Drop...|2021-02-18 15:50:53|           8|    3|
|MovieSuggestions|       stone78221|Family Movies lik...|2021-02-18 15:51:41|           4|    1|
|MovieSuggestions|Mighty_Dragon_001|Looking for movie...|2021-02-18 15:54:03|           5|    4|
+----------------+-----------------+--------------------+-------------------+------------+-----+
only showing top 5 rows
submissions_combined = submissions_combined.select("subreddit", "author", "RedditText", "created_utc", "num_comments", "score")
Setting up a Spark NLP pipeline for text preprocessing:

DocumentAssembler:
Gathers the raw input text into Spark NLP's document format. Input Column: "RedditText". Output Column: "document".

Tokenizer:
Breaks each document down into individual words. Input Column: "document". Output Column: "token".

Normalizer:
Lowercases tokens; its default cleanup pattern also strips non-letter characters, which is why contractions like "I'm" later appear as "im". Input Column: "token". Output Column: "normalized".

Lemmatizer:
Performs lemmatization on the normalized words. Input Column: "normalized". Output Column: "lemma".

StopWordsCleaner:
Removes common English stopwords; the stopword list here is supplied from NLTK (downloaded below) rather than a Spark NLP resource. Input Column: "lemma". Output Column: "clean_lemma".

Finisher:
Converts the processed annotations back into human-readable token arrays. Input Column: "clean_lemma".

Pipeline:
Defines the sequence of stages in the NLP pipeline. The resulting "clean_lemma" column contains preprocessed text ready for further analysis or machine learning tasks.
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
documentAssembler = DocumentAssembler()\
.setInputCol("RedditText")\
.setOutputCol("document")
# Tokenizer to split each document into words
tokenizer = Tokenizer() \
.setInputCols(['document']) \
.setOutputCol('token')
# Normalize: lowercase tokens (the default cleanup pattern also strips non-letter characters)
normalizer = Normalizer() \
.setInputCols(['token']) \
.setOutputCol('normalized') \
.setLowercase(True)
!pip install nltk
Requirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (3.7)
Requirement already satisfied: click in /opt/conda/lib/python3.10/site-packages (from nltk) (8.1.7)
Requirement already satisfied: joblib in /opt/conda/lib/python3.10/site-packages (from nltk) (1.3.2)
Requirement already satisfied: regex>=2021.8.3 in /opt/conda/lib/python3.10/site-packages (from nltk) (2022.7.9)
Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from nltk) (4.64.1)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
import nltk
from nltk.corpus import stopwords
# Download stopwords data
nltk.download('stopwords')
# Now you can use stopwords from nltk.corpus
stopwords_list = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
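Because the Normalizer strips apostrophes, contraction fragments such as "im" or "dont" are not covered by NLTK's stopword list and survive cleaning (they rank high in the word counts further down). An optional extension, not applied in the runs below:
# Optional (not applied here): add apostrophe-stripped contraction fragments
# that the Normalizer produces ("I'm" -> "im"); illustrative, not exhaustive
extra_fragments = ["im", "ive", "id", "dont", "didnt", "doesnt", "isnt", "cant", "wont"]
# stopwords_list = stopwords_list + extra_fragments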
# Remove Stopwords
stopwords_cleaner = StopWordsCleaner() \
.setInputCols(['lemma']) \
.setOutputCol('clean_lemma') \
.setCaseSensitive(False) \
.setStopWords(stopwords_list)
# finisher converts tokens to human-readable output
finisher = Finisher() \
.setInputCols(['clean_lemma']) \
.setCleanAnnotations(False)
# Lemmatizing
lemmatizer = LemmatizerModel.pretrained() \
.setInputCols(['normalized']) \
.setOutputCol('lemma')
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[OK!]
from pyspark.ml import Pipeline

pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        tokenizer,
        normalizer,
        lemmatizer,         # lemmatize before removing stopwords
        stopwords_cleaner,  # input column 'lemma' was already set above
        finisher
    ])
from pyspark.ml import PipelineModel
# Fit the pipeline on an empty DataFrame that has the real schema: none of the
# stages here learn from data (the lemmatizer is pre-trained), so fitting just
# assembles the stages into a PipelineModel
empty_df = spark.createDataFrame([], schema=submissions_combined.schema)
pipeline_model = pipeline.fit(empty_df)
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/spark-core_2.12-3.4.0.jar) to field java.util.regex.Pattern.pattern
WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
# Transform your actual DataFrame using the fitted pipeline model
submission_clean = pipeline_model.transform(submissions_combined)
submission_clean.select(submission_clean.finished_clean_lemma).show(5)
+--------------------+
|finished_clean_lemma|
+--------------------+
|[never, watch, me...|
|[anime, like, hig...|
|[blockbuster, dro...|
|[family, movie, l...|
|[look, movie, lik...|
+--------------------+
only showing top 5 rows
submission_clean.columns
['subreddit', 'author', 'RedditText', 'created_utc', 'num_comments', 'score', 'document', 'token', 'normalized', 'lemma', 'clean_lemma', 'finished_clean_lemma']
# Drop the intermediate annotation columns
submission_clean = submission_clean.drop('document', 'token', 'normalized',
                                         'lemma', 'clean_lemma')
# Rename 'finished_clean_lemma' to 'clean_RedditText'
submission_clean = submission_clean.withColumnRenamed('finished_clean_lemma', 'clean_RedditText')
submission_clean.show(5)
+----------------+-----------------+--------------------+-------------------+------------+-----+--------------------+
|       subreddit|           author|          RedditText|        created_utc|num_comments|score|    clean_RedditText|
+----------------+-----------------+--------------------+-------------------+------------+-----+--------------------+
|    Animesuggest|      RektoriusYT|Never watched mec...|2021-02-18 15:46:14|          12|    3|[never, watch, me...|
|    Animesuggest|           bff_op|Anime like Highsc...|2021-02-18 15:50:06|           6|    1|[anime, like, hig...|
|MovieSuggestions|    scottymac0707|Blockbusters Drop...|2021-02-18 15:50:53|           8|    3|[blockbuster, dro...|
|MovieSuggestions|       stone78221|Family Movies lik...|2021-02-18 15:51:41|           4|    1|[family, movie, l...|
|MovieSuggestions|Mighty_Dragon_001|Looking for movie...|2021-02-18 15:54:03|           5|    4|[look, movie, lik...|
+----------------+-----------------+--------------------+-------------------+------------+-----+--------------------+
only showing top 5 rows
Conversion: The code first converts the array of strings in the 'clean_RedditText' column into a single string, using a space as the separator.
Split and Explode: It then splits the string into individual words and explodes the resulting array, creating a new row for each word.
Grouping and Counting: After that, it groups the DataFrame by the 'word' column and counts the occurrences of each word.
Sorting: Finally, it sorts the result by the count of occurrences in descending order.
# Filter to posts from r/MovieSuggestions
submission_clean_movie = submission_clean.filter(submission_clean['subreddit'] == 'MovieSuggestions')
submission_clean_movie.show(5)
+----------------+--------------------+--------------------+-------------------+------------+-----+--------------------+
|       subreddit|              author|          RedditText|        created_utc|num_comments|score|    clean_RedditText|
+----------------+--------------------+--------------------+-------------------+------------+-----+--------------------+
|MovieSuggestions|       scottymac0707|Blockbusters Drop...|2021-02-18 15:50:53|           8|    3|[blockbuster, dro...|
|MovieSuggestions|          stone78221|Family Movies lik...|2021-02-18 15:51:41|           4|    1|[family, movie, l...|
|MovieSuggestions|   Mighty_Dragon_001|Looking for movie...|2021-02-18 15:54:03|           5|    4|[look, movie, lik...|
|MovieSuggestions|  theRealestAintReal|Detective movies ...|2021-06-30 22:21:40|          13|    3|[detective, movie...|
|MovieSuggestions|Lazy-Paleontologist9|Larger than life ...|2021-06-30 22:24:27|          11|    2|[large, life, fil...|
+----------------+--------------------+--------------------+-------------------+------------+-----+--------------------+
only showing top 5 rows
from pyspark.sql.functions import explode, split, desc
#'clean_RedditText' is the column containing tokenized and cleaned words
words_column = 'clean_RedditText'
# Convert array of strings to a single string with space as a separator
submission_clean_movie = submission_clean_movie.withColumn('clean_text', concat_ws(' ', words_column))
# Split the words and explode the array to create a new row for each word
word_count = submission_clean_movie.select(explode(split('clean_text', ' ')).alias('word'))
# Group by word and count occurrences
word_count = word_count.groupBy('word').count()
# Sort by count in descending order
word_count = word_count.sort(desc('count'))
# Show the top words
word_count.show(10)
+---------+-----+
|     word|count|
+---------+-----+
|    movie|72187|
|     like|27869|
|     look|17926|
|    watch|17677|
|       im|15842|
|     film|14465|
|     good|13604|
|      see|10841|
|     love| 9366|
|something| 8858|
+---------+-----+
only showing top 10 rows
# Filter to posts from r/Animesuggest
submission_clean_anime = submission_clean.filter(submission_clean['subreddit'] == 'Animesuggest')
from pyspark.sql.functions import explode, split, desc
#'clean_RedditText' is the column containing tokenized and cleaned words
words_column = 'clean_RedditText'
# Convert array of strings to a single string with space as a separator
submission_clean_anime = submission_clean_anime.withColumn('clean_text', concat_ws(' ', words_column))
# Split the words and explode the array to create a new row for each word
word_count_a = submission_clean_anime.select(explode(split('clean_text', ' ')).alias('word'))
# Group by word and count occurrences
word_count_a = word_count_a.groupBy('word').count()
# Sort by count in descending order
word_count_a = word_count_a.sort(desc('count'))
word_count_a.show(10)
+---------+-----+
|     word|count|
+---------+-----+
|    anime|73632|
|     like|45544|
|    watch|34677|
|     look|23720|
|       im|23468|
|     good|19125|
|      one|17144|
|something|16997|
|       mc|15713|
|character|14876|
+---------+-----+
only showing top 10 rows
The most frequent words confirm that both communities spend their time asking for and discussing recommendations. Next, we move from individual words to the titles themselves: we will extract the movie and anime names mentioned in the posts to find out which titles are most popular in each community.
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
import pandas as pd
# Combine the tokenized words back into sentences for TF-IDF vectorization
# (a Python UDF; concat_ws would achieve the same without leaving the JVM)
concat_udf = udf(lambda x: ' '.join(x), StringType())
submission_clean_movie = submission_clean_movie.withColumn(
    "concatenated_text", concat_udf(submission_clean_movie["clean_RedditText"])
)
# Tokenization
tokenizer = Tokenizer(inputCol="concatenated_text", outputCol="words")
submission_clean_movie = tokenizer.transform(submission_clean_movie)
# TF: hash each word into a 20-bucket feature vector (with only 20 buckets,
# many distinct words collide into the same feature index)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
submission_clean_movie = hashingTF.transform(submission_clean_movie)
# IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(submission_clean_movie)
submission_clean_movie = idfModel.transform(submission_clean_movie)
idf_scores = idfModel.idf.toArray()
submission_clean_movie.select("features", "words").show(truncate=False)
(output truncated: each row pairs a 20-dimensional sparse TF-IDF vector, e.g. (20,[0,1,3,7,8,13,15,17,19],[0.33, 1.26, ...]), with its token list, e.g. [blockbuster, drop, good, blockbuster, movie, choice, look, ...]; only showing top 20 rows)
TF-IDF vectorization of the cleaned, tokenized movie-related posts. The tokenized words are first concatenated back into a single text column with a user-defined function (UDF), and a Tokenizer re-splits that text into individual words. HashingTF then computes term frequencies as a sparse feature vector, and the Inverse Document Frequency (IDF) weighs down common words to highlight distinctive ones. The resulting DataFrame displays the TF-IDF features alongside the original words. This gives the text a numerical representation usable for natural language processing tasks such as classification or clustering within the context of movie-related Reddit discussions; parameters (such as the number of hash buckets) and preprocessing steps may need tuning for a specific analytical goal.
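One caveat of HashingTF with numFeatures=20 is that feature indices cannot be mapped back to words and collisions are heavy. A sketch of an interpretable alternative on the same DataFrame, using CountVectorizer (parameter values here are illustrative):
from pyspark.ml.feature import CountVectorizer, IDF
# CountVectorizer keeps an explicit vocabulary, so indices map back to words
cv = CountVectorizer(inputCol="words", outputCol="cv_raw", vocabSize=10000, minDF=5)
cv_model = cv.fit(submission_clean_movie)
counted = cv_model.transform(submission_clean_movie)
idf_cv = IDF(inputCol="cv_raw", outputCol="cv_features")
tfidf_cv = idf_cv.fit(counted).transform(counted)
print(cv_model.vocabulary[:20])  # the words behind feature indices 0..19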
While TF-IDF offers valuable insight into key words and their numerical representations, its utility for our specific analysis is limited: it captures how significant a word is within a document relative to the corpus, not which movie or anime a post refers to. Since our objective is to extract and rank the specific titles mentioned in the comments, we will instead explore methods that identify movie and anime references directly, allowing us to delve deeper into the most mentioned or discussed titles.
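As a first cut at that alternative, posts can be matched against a reference list of titles. A minimal sketch, assuming a hypothetical hand-picked list (a real run would load titles from an external dataset such as an IMDb or MyAnimeList export):
from pyspark.sql.functions import lower, col
# Hypothetical title list for illustration only
titles = ["inception", "interstellar", "parasite", "your name", "steins gate"]
mention_counts = {
    t: submission_clean_movie.filter(lower(col("RedditText")).contains(t)).count()
    for t in titles
}
print(sorted(mention_counts.items(), key=lambda kv: -kv[1]))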