```python
# Setup - Run only once per Kernel App
%conda install openjdk -y

# Install PySpark
%pip install pyspark==3.2.0

# Restart the kernel so the newly installed packages are picked up
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# Import pyspark and build the Spark session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("PySparkApp")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    # Hadoop (s3a) properties need the "spark.hadoop." prefix; without it Spark
    # warns "Ignoring non-Spark config property" and drops the setting (see the
    # session output below)
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)
print(spark.version)
```
```
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 23.3.1
latest version: 23.11.0
Please update conda by running
$ conda update -n base -c defaults conda
Or to minimize the number of packages updated during conda update use
conda install conda=23.11.0
## Package Plan ##
environment location: /opt/conda
added / updated specs:
- openjdk
The following packages will be downloaded:
package | build
---------------------------|-----------------
ca-certificates-2023.08.22 | h06a4308_0 123 KB
certifi-2023.11.17 | py310h06a4308_0 158 KB
openjdk-11.0.13 | h87a67e3_0 341.0 MB
------------------------------------------------------------
Total: 341.3 MB
The following NEW packages will be INSTALLED:
openjdk pkgs/main/linux-64::openjdk-11.0.13-h87a67e3_0
The following packages will be UPDATED:
ca-certificates conda-forge::ca-certificates-2023.7.2~ --> pkgs/main::ca-certificates-2023.08.22-h06a4308_0
certifi conda-forge/noarch::certifi-2023.7.22~ --> pkgs/main/linux-64::certifi-2023.11.17-py310h06a4308_0
Downloading and Extracting Packages
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Note: you may need to restart the kernel to use updated packages.
```
```
Collecting pyspark==3.2.0
Using cached pyspark-3.2.0-py2.py3-none-any.whl
Collecting py4j==0.10.9.2 (from pyspark==3.2.0)
Using cached py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
```
```
Warning: Ignoring non-Spark config property: fs.s3a.aws.credentials.provider
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.0.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8afe1d5f-af95-49e4-af32-ce0b0801deb1;1.0
confs: [default]
found org.apache.hadoop#hadoop-aws;3.2.2 in central
found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 428ms :: artifacts dl 27ms
:: modules in use:
com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
---------------------------------------------------------------------
| | modules || artifacts |
| conf | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
| default | 2 | 0 | 0 | 0 || 2 | 0 |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-8afe1d5f-af95-49e4-af32-ce0b0801deb1
confs: [default]
0 artifacts copied, 2 already retrieved (0kB/22ms)
23/12/05 07:31:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
3.2.0
```
## Import packages

```python
# Import packages
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    sum as _sum, mean, stddev, max as _max, min as _min, count,
    percentile_approx, year, month, dayofmonth, ceil, col, dayofweek, hour,
    explode, date_format, lower, size, split, regexp_replace, isnan, when,
)
```
```python
%%time
import sagemaker

session = sagemaker.Session()
bucket = session.default_bucket()
output_prefix_data_comments = "project/comments/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
# s3_path = "s3a://sagemaker-us-east-1-038932893404/project/comments/yyyy=2021/part-00000-90796409-5783-4705-92c0-27c27eda8c4c-c000.snappy.parquet"
print(f"reading comments from {s3_path}")
comments = spark.read.parquet(s3_path, header=True)
print(f"shape of the comments dataframe is {comments.count():,}x{len(comments.columns)}")
```
```
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading comments from s3a://sagemaker-us-east-1-300610919477/project/comments/yyyy=*
shape of the comments dataframe is 7,356,042x21
CPU times: user 3.1 s, sys: 364 ms, total: 3.47 s
Wall time: 20min 20s
```
```python
%%time
output_prefix_data_submissions = "project/submissions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
print(f"reading submissions from {s3_path}")
submissions = spark.read.parquet(s3_path, header=True)
print(f"shape of the submissions dataframe is {submissions.count():,}x{len(submissions.columns)}")
```
```
reading submissions from s3a://sagemaker-us-east-1-300610919477/project/submissions/yyyy=*
23/12/05 06:10:21 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
shape of the submissions dataframe is 404,298x68
CPU times: user 187 ms, sys: 33.1 ms, total: 221 ms
Wall time: 8min 37s
```
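The raw comments were previewed next. The exact preview cell is a guess; a select over the displayed columns would produce the table below:

```python
# Hypothetical preview cell: select the core comment fields and show them
comments.select(
    "subreddit", "author", "author_flair_text", "created_utc", "body",
    "controversiality", "score", "parent_id", "stickied", "link_id", "id",
).show()
```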
```
+---------+-----------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|subreddit| author| author_flair_text| created_utc| body|controversiality|score| parent_id|stickied| link_id| id|
+---------+-----------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
| anime| DonaldJenkins| null|2021-11-14 04:39:47| I sent it to ya ;)| 0| 1|t1_hk0whi9| false|t3_ov07rq|hkjr7uj|
| anime| DonMo999|:MAL:https://myan...|2021-11-14 04:40:25|Displate has some...| 0| 1| t3_qtgc12| false|t3_qtgc12|hkjralc|
| anime| OrangeBanana38|:AMQ::STAR::AL:ht...|2021-11-14 04:41:01|That sounds like ...| 0| 3|t1_hkjq6wn| false|t3_qryjfm|hkjrd4w|
| anime| ClBanjai| null|2021-11-14 04:41:03|what kind of ques...| 0| 1| t3_qth8ql| false|t3_qth8ql|hkjrdae|
| anime| helsaabiart| null|2021-11-14 04:42:02|Today on Shokugek...| 0| 4| t3_qt8p0u| false|t3_qt8p0u|hkjrhg6|
| anime| Lezoux|:MAL:https://myan...|2021-11-14 04:42:08| It's easy enough.| 0| 3|t1_hkjrd4w| false|t3_qryjfm|hkjrhv3|
| anime| AutoModerator| null|2021-11-14 04:42:39|Hello! If you eve...| 0| 1| t3_qti7iu| false|t3_qti7iu|hkjrk63|
| anime| AutoModerator| null|2021-11-14 04:42:39|Hi xxcile, it see...| 0| 1| t3_qti7iu| false|t3_qti7iu|hkjrk6q|
| anime| Terra246| null|2021-11-14 04:42:47|I did see amagi b...| 0| 4|t1_hkjm3z4| false|t3_qtgzu3|hkjrkr1|
| anime|ZaphodBeebblebrox|:S3::AL:https://a...|2021-11-14 04:43:39|Which is your Mad...| 0| 3|t1_hkjpckv| false|t3_qryjfm|hkjrogp|
| anime| GreekFire242| null|2021-11-14 04:43:47| Demon Slayer| 0| 2|t1_hkjpe43| false|t3_qtgcp8|hkjrp3s|
| anime| Terra246| null|2021-11-14 04:43:52|I mean, it is one...| 0| 2|t1_hkjpa0q| false|t3_qtgzu3|hkjrpgf|
| anime| MakotoPrince| null|2021-11-14 04:44:32|Yet another good ...| 0| 3| t3_qtg0z3| false|t3_qtg0z3|hkjrsa2|
| anime| Gryse_Blacolar| null|2021-11-14 04:44:41|That's basically ...| 0| 2| t3_qsz91x| false|t3_qsz91x|hkjrsvy|
| anime| Junnielocked| null|2021-11-14 04:44:47|Looked up the ani...| 0| 2| t3_qt7yff| false|t3_qt7yff|hkjrtas|
| anime| kubabubba| null|2021-11-14 04:45:01| How about now?| 0| 24|t1_hkihkib| false|t3_qt7yff|hkjrual|
| anime| alotmorealots| null|2021-11-14 04:45:08|Your post could d...| 0| 1| t3_qtgpcl| false|t3_qtgpcl|hkjrutb|
| anime| heimdal77| null|2021-11-14 04:45:17|Depends is it lik...| 0| 2|t1_hkj9jju| false|t3_qtfmin|hkjrvfv|
| anime| jackofslayers| null|2021-11-14 04:45:35|I have my own sus...| 0| 7|t1_hkjcx61| false|t3_qt5igg|hkjrwsz|
| anime| SarcasmUndefined| null|2021-11-14 04:45:59|Looking submissiv...| 0| 4|t1_hkhunza| false|t3_qt3ovl|hkjrykr|
+---------+-----------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
only showing top 20 rows
```
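The data-quality and cleaning cells that follow refer to the dataframes by the shorter names `sub` and `com`. A minimal aliasing step, assumed here since the original renaming cell is not shown:

```python
# Shorter aliases used by the cleaning cells below (assumed step)
sub = submissions
com = comments
```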
## Data Quality Check

Before diving into the analysis, we first check the data quality and clean the data accordingly. The quality checks we perform are:

- **Missing values**: counting the missing values for each feature.
- **Duplicates**: verifying whether the data contains any duplicate rows (see the sketch after the missing-value check).
- **Corrupted data points**: some submissions or comments may have been deleted by their authors, or the authors' accounts removed, so the corresponding records may be empty or contain only placeholder text (also sketched below).
### 1. Check missing values
```python
# Count missing (null or empty-string) values in each column
sub_miss = sub.select(
    [count(when((col(c).isNull()) | (col(c) == ""), c)).alias(c) for c in sub.columns]
)
sub_miss.show()
```
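The duplicate and corrupted-record checks can follow the same pattern. A minimal sketch, assuming deleted posts and accounts leave Reddit's usual "[deleted]"/"[removed]" placeholders in the text fields:

```python
# 2. Duplicates: compare the full row count against the distinct row count
print(f"duplicate submissions: {sub.count() - sub.dropDuplicates().count():,}")
print(f"duplicate comments:    {com.count() - com.dropDuplicates().count():,}")

# 3. Corrupted records: deleted content leaves placeholder text behind
placeholders = ["[deleted]", "[removed]"]
print(f"corrupted submissions: {sub.filter(col('selftext').isin(placeholders)).count():,}")
print(f"corrupted comments:    {com.filter(col('body').isin(placeholders)).count():,}")
```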
```python
sub_cleaned = (
    sub
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd"))  # create date column
    .withColumn("created_hour", hour("created_utc"))                       # create hour column
    .withColumn("created_week", dayofweek("created_utc"))                  # create day-of-week column
    .withColumn("created_month", month("created_utc"))                     # create month column
    .withColumn("created_year", year("created_utc"))                       # create year column
    .withColumn("title", lower(col("title")))                              # text cleaning: lowercase
    .withColumn("selftext", lower(col("selftext")))                        # text cleaning: lowercase
    .withColumn("cleaned_title", regexp_replace(col("title"), r"[^a-zA-Z0-9\s]", ""))        # keep only words and numbers
    .withColumn("cleaned_title", regexp_replace(col("cleaned_title"), r"\s+", " "))          # collapse extra whitespace
    .withColumn("title_wordCount", size(split(col("cleaned_title"), " ")))                   # word count
    .withColumn("cleaned_selftext", regexp_replace(col("selftext"), r"[^a-zA-Z0-9\s]", ""))  # keep only words and numbers
    .withColumn("cleaned_selftext", regexp_replace(col("cleaned_selftext"), r"\s+", " "))    # collapse extra whitespace
    .withColumn("selftext_wordCount", size(split(col("cleaned_selftext"), " ")))             # word count
    .withColumn("contain_pokemon", col("cleaned_title").rlike("(?i)pokemon|(?i)pokémon"))    # dummy variable on title
)
```
```python
com_cleaned = (
    com
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd"))  # create date column
    .withColumn("created_hour", hour("created_utc"))                       # create hour column
    .withColumn("created_week", dayofweek("created_utc"))                  # create day-of-week column
    .withColumn("created_month", month("created_utc"))                     # create month column
    .withColumn("created_year", year("created_utc"))                       # create year column
    .withColumn("body", lower(col("body")))                                # text cleaning: lowercase
    .withColumn("cleaned", regexp_replace(col("body"), r"[^a-zA-Z0-9\s]", ""))    # keep only words and numbers
    .withColumn("cleaned", regexp_replace(col("cleaned"), r"\s+", " "))           # collapse extra whitespace
    .withColumn("body_wordCount", size(split(col("cleaned"), " ")))               # word count
    .withColumn("contain_pokemon", col("body").rlike("(?i)pokemon|(?i)pokémon"))  # dummy variable on body
)
```
```python
# Need to cache() the full cleaned dataset
# sub_cleaned.cache()
sub_cleaned.show()
```
```python
output_sub = "project/cleaned/sub"
s3_path_sub = f"s3a://{bucket}/{output_sub}"
print(f"writing cleaned submission to {s3_path_sub}")
sub_cleaned.write.parquet(s3_path_sub, mode="overwrite")
```

```
writing cleaned submission to s3a://sagemaker-us-east-1-300610919477/project/cleaned/sub
```
```python
output_com = "project/cleaned/com"
s3_path_com = f"s3a://{bucket}/{output_com}"
print(f"writing cleaned comments to {s3_path_com}")
com_cleaned.write.parquet(s3_path_com, mode="overwrite")
```

```
writing cleaned comments to s3a://sagemaker-us-east-1-300610919477/project/cleaned/com
```
```python
# Read cleaned data from parquet
import sagemaker

session = sagemaker.Session()
bucket = session.default_bucket()
# bucket = 'sagemaker-us-east-1-315969085594'
sub_bucket_path = f"s3a://{bucket}/project/cleaned/sub"
com_bucket_path = f"s3a://{bucket}/project/cleaned/com"

print(f"reading submissions from {sub_bucket_path}")
sub = spark.read.parquet(sub_bucket_path, header=True)
print(f"shape of the sub dataframe is {sub.count():,}x{len(sub.columns)}")

print(f"reading comments from {com_bucket_path}")
com = spark.read.parquet(com_bucket_path, header=True)
print(f"shape of the com dataframe is {com.count():,}x{len(com.columns)}")
```
```
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading submissions from s3a://sagemaker-us-east-1-300610919477/project/cleaned/sub
```
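The cleaned comments were then previewed; the column set in the table below matches a plain show() on the `com` dataframe (the producing cell is assumed):

```python
# Hypothetical preview cell for the cleaned comments
com.show()
```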
```
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+---------------+
|subreddit| author| author_flair_text| created_utc| body|controversiality|score| parent_id|stickied| link_id| id|created_date|created_hour|created_week|created_month|created_year| cleaned|body_wordCount|contain_pokemon|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+---------------+
| anime| cocksmongler| null|2022-12-20 22:50:59| she does too| 0| 16|t1_j11628o| false|t3_zqtkyg|j11989t| 2022-12-20| 22| 3| 12| 2022| she does too| 3| false|
| anime| Raiking02|:Id::Ie::If::M:ht...|2022-12-20 22:51:14|*finally decides ...| 0| 8| t3_zn0m5r| false|t3_zn0m5r|j1199j8| 2022-12-20| 22| 3| 12| 2022|finally decides t...| 32| false|
| anime| MonoFauz| null|2022-12-20 22:52:34|this bette be an ...| 0| 1| t3_zqchel| false|t3_zqchel|j119gf5| 2022-12-20| 22| 3| 12| 2022|this bette be an ...| 15| false|
| anime| eruciform| null|2022-12-20 22:52:46|cardcaptor sakura...| 0| 6| t3_zr280b| false|t3_zr280b|j119hfj| 2022-12-20| 22| 3| 12| 2022|cardcaptor sakura...| 14| false|
| anime| Karkava| null|2022-12-20 22:52:46|and even then, hi...| 0| 3|t1_j0zxz09| false|t3_zqtkyg|j119hfr| 2022-12-20| 22| 3| 12| 2022|and even then his...| 14| false|
| anime| HelioA|:MAL:https://myan...|2022-12-20 22:53:00| no u| 0| 2|t1_j1188rf| false|t3_zn0m5r|j119ip2| 2022-12-20| 22| 3| 12| 2022| no u| 2| false|
| anime| Random_Useless_Tips| null|2022-12-20 22:53:06|in english it can...| 0| 29|t1_j0zxnir| false|t3_zqtkyg|j119j7e| 2022-12-20| 22| 3| 12| 2022|in english it can...| 83| false|
| anime| ComfortablyRotten|:BC::BD::U::M:htt...|2022-12-20 22:53:11|[i think i have m...| 0| 4|t1_j1162s1| false|t3_zn0m5r|j119jnw| 2022-12-20| 22| 3| 12| 2022|i think i have mo...| 95| false|
| anime| Eventhorrizon| null|2022-12-20 22:53:26|"started watching...| 0| 8| t3_zr280b| false|t3_zr280b|j119l1k| 2022-12-20| 22| 3| 12| 2022|started watching ...| 58| false|
| anime| JustAnswerAQuestion|:CS::CT::I::M:htt...|2022-12-20 22:53:28|i meant to reply ...| 0| 3|t1_j1190s6| false|t3_zn0m5r|j119l6e| 2022-12-20| 22| 3| 12| 2022|i meant to reply ...| 6| false|
| anime| electric_anteater| null|2022-12-20 22:53:38| bet what| 0| 6|t1_j116w30| false|t3_zqtkyg|j119m3c| 2022-12-20| 22| 3| 12| 2022| bet what| 2| false|
| anime| CrewOrdinary8872| null|2022-12-20 22:53:40|not the same swor...| 0| 15|t1_j118awx| false|t3_zqtkyg|j119m92| 2022-12-20| 22| 3| 12| 2022|not the same swor...| 17| false|
| anime| AutoModerator| null|2022-12-20 22:53:57|hi reason-local! ...| 0| 1| t3_zr2dxt| false|t3_zr2dxt|j119noi| 2022-12-20| 22| 3| 12| 2022|hi reasonlocal yo...| 151| false|
| anime|equalopurtunityotter| null|2022-12-20 22:54:03|are there any ani...| 0| 1| t3_zqkjyo| false|t3_zqkjyo|j119o93| 2022-12-20| 22| 3| 12| 2022|are there any ani...| 51| false|
| anime| mutDD| null|2022-12-20 22:54:07|see my reply to s...| 0| 0|t1_j114rml| false|t3_zr1fxi|j119olf| 2022-12-20| 22| 3| 12| 2022|see my reply to s...| 32| false|
| anime| Vikkio92|:K:https://kitsu....|2022-12-20 22:54:36|i have no idea wh...| 0| 6| t3_zqtkyg| false|t3_zqtkyg|j119r1s| 2022-12-20| 22| 3| 12| 2022|i have no idea wh...| 11| false|
| anime| Karkava| null|2022-12-20 22:54:41|[the midway point...| 0| 2|t1_j10dofz| false|t3_zqtkyg|j119rf6| 2022-12-20| 22| 3| 12| 2022|the midway point ...| 11| false|
| anime| HelioA|:MAL:https://myan...|2022-12-20 22:55:33|any particular re...| 0| 2|t1_j1188rf| false|t3_zn0m5r|j119vue| 2022-12-20| 22| 3| 12| 2022|any particular re...| 8| false|
| anime| LilyGinnyBlack| null|2022-12-20 22:55:34|fruits basket (re...| 0| 2| t3_zqy8ii| false|t3_zqy8ii|j119vw8| 2022-12-20| 22| 3| 12| 2022|fruits basket reb...| 5| false|
| anime| Raiking02|:Id::Ie::If::M:ht...|2022-12-20 22:55:48|oh god, it's over...| 0| 8|t1_j118mxd| false|t3_zn0m5r|j119x1q| 2022-12-20| 22| 3| 12| 2022|oh god its over 8...| 14| false|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+---------------+
only showing top 20 rows
```
## EDA on Anime
```python
%pip install plotly
%pip install wordcloud

import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "plotly_mimetype+notebook_connected"

import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
```
```
Requirement already satisfied: plotly in /opt/conda/lib/python3.10/site-packages (5.9.0)
Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.10/site-packages (from plotly) (8.0.1)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
Collecting wordcloud
Obtaining dependency information for wordcloud from https://files.pythonhosted.org/packages/22/0d/bb4eccd60d272b33cbc79c661c60acc604f1688cfc922deb9d3eb5be640a/wordcloud-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
Using cached wordcloud-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Requirement already satisfied: numpy>=1.6.1 in /opt/conda/lib/python3.10/site-packages (from wordcloud) (1.26.0)
Requirement already satisfied: pillow in /opt/conda/lib/python3.10/site-packages (from wordcloud) (10.0.1)
Requirement already satisfied: matplotlib in /opt/conda/lib/python3.10/site-packages (from wordcloud) (3.5.2)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.10/site-packages (from matplotlib->wordcloud) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib->wordcloud) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib->wordcloud) (1.4.2)
Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from matplotlib->wordcloud) (21.3)
Requirement already satisfied: pyparsing>=2.2.1 in /opt/conda/lib/python3.10/site-packages (from matplotlib->wordcloud) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->wordcloud) (2.8.2)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
Using cached wordcloud-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (455 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.2
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
True
```
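The WordCloud and NLTK stopwords imports are presumably used for a word cloud later in the EDA. A minimal sketch of that use, assuming the `cleaned_title` column from the cleaned submissions (illustrative only):

```python
# Build one text blob from the cleaned titles (skipping nulls) and
# render a word cloud with English stopwords removed
stop_words = set(stopwords.words("english"))
titles = " ".join(
    row["cleaned_title"]
    for row in sub.select("cleaned_title").collect()
    if row["cleaned_title"]
)
wc = WordCloud(width=800, height=400, stopwords=stop_words,
               background_color="white").generate(titles)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
```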
### The number of comments and submissions for each day
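The plotting cell below relies on `sub_daily_count` and `com_daily_count`, which are not defined in the preceding cells. A minimal aggregation that would produce them (an assumption; the original cell is not shown):

```python
# Daily counts of submissions and comments, converted to pandas for plotting
sub_daily_count = (
    sub.groupBy("created_date")
    .agg(count("*").alias("total_submissions"))
    .orderBy("created_date")
    .toPandas()
)
com_daily_count = (
    com.groupBy("created_date")
    .agg(count("*").alias("total_comments"))
    .orderBy("created_date")
    .toPandas()
)
```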
```python
# Create a subplot with two y-axes
fig = sp.make_subplots(specs=[[{"secondary_y": True}]])

# Add traces for submissions and comments to the respective y-axes
fig.add_trace(
    go.Scatter(
        x=sub_daily_count["created_date"],
        y=sub_daily_count["total_submissions"],
        marker_color="#42a1b9",
        opacity=0.65,
        name="submissions",
    ),
    secondary_y=False,
)
fig.add_trace(
    go.Scatter(
        x=com_daily_count["created_date"],
        y=com_daily_count["total_comments"],
        marker_color="#d13a47",
        opacity=0.65,
        name="comments",
    ),
    secondary_y=True,
)

# Update the layout for the whole figure
fig.update_layout(
    title="The number of comments and submissions for each day",
    xaxis={"title": "Date"},
)

# Update the y-axis labels
fig.update_yaxes(title_text="# Submissions", secondary_y=False)
fig.update_yaxes(title_text="# Comments", secondary_y=True)

# Show the figure
fig.show()
```
### Boxplot of word count by stickied or not for submissions
```python
# Get the names of the boolean columns
bool_columns = [c[0] for c in sub.dtypes if c[1] == "boolean"]

# Cast booleans to 0/1 so they plot cleanly, treating nulls as 0
for col_name in bool_columns:
    sub = sub.withColumn(col_name, F.coalesce(F.col(col_name).cast("int"), F.lit(0)))

# sub.printSchema()
sub_df = sub.toPandas()

# Create a box plot using Plotly
fig = px.box(sub_df, x="stickied", y="title_wordCount",
             title="Box Plot of title_wordCount by stickied")
fig.show()
```
### The number of comments on pokemon-related submissions
```python
ax = sns.histplot(
    data=sub_df[sub_df["contain_pokemon"] == True],
    x="num_comments",
    hue="contain_pokemon",
    binwidth=7,
    color="#d5ecf1",
)
ax.set_title("Histogram of the number of comments for pokemon-related submissions")
ax.set_xlabel("# comments")
ax.get_legend().remove()
plt.savefig("../website-source/images/anime_histogram_submissions.png")
plt.show()
```
### Word Count vs. Score
#### Submissions: Title Word Count vs. Score
```python
# Re-run: cast any remaining boolean columns to 0/1 and refresh the pandas copy
bool_columns = [c[0] for c in sub.dtypes if c[1] == "boolean"]
for col_name in bool_columns:
    sub = sub.withColumn(col_name, F.coalesce(F.col(col_name).cast("int"), F.lit(0)))

# sub.printSchema()
sub_df = sub.toPandas()
```
```python
ax = sub_df.plot.scatter("score", "title_wordCount", s=3, c="#d13a47")
ax.set_title("Scatter plot of title word count and score for submissions")
ax.set_xscale("log")
plt.savefig("../website-source/images/anime_submissions_wordcount_score_scatterplot.png")
plt.show()
```
#### Comments: Body Word Count vs. Score
```python
com_wordcount_score_df = com.select("body_wordCount", "score").toPandas()
ax = com_wordcount_score_df.plot.scatter("score", "body_wordCount", s=3, c="#42a1b9")
ax.set_title("Scatter plot of body word count and score for comments")
ax.set_xscale("log")
ax.set_yscale("log")
plt.savefig("../website-source/images/anime_comments_wordcount_score_scatterplot.png")
plt.show()
```
### Comments