Code: EDA-Franchise

# Display the active SparkSession; it is not created in this notebook, so it is
# presumably injected by the notebook kernel.
spark
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 6, Finished, Available)

SparkSession - hive

SparkContext

Spark UI

Version
v3.2.2.5.1-100879434
Master
yarn
AppName
Azure ML Experiment

Read full reddit data

# Azure Blob Storage account and container that hold the source Reddit data.
blob_account_name = "marckvnonprodblob"
blob_container_name = "bigdata"
# Read-only SAS token for the container above.
# NOTE(review): this is a credential committed in plain text, and its `se=`
# field shows it expired on 2024-01-02. Load a current token from a secret
# store or environment variable instead of hard-coding it here.
blob_sas_token = "?sv=2021-10-04&st=2023-10-04T01%3A42%3A59Z&se=2024-01-02T02%3A42%3A00Z&sr=c&sp=rlf&sig=w3CH9MbCOpwO7DtHlrahc7AlRPxSZZb8MOgS6TaXLzI%3D"

# Host name of the storage account, shared by the base URL and the config key.
_blob_host = f"{blob_account_name}.blob.core.windows.net"

# Root wasbs:// URL of the container; dataset paths are appended to it.
wasbs_base_url = f"wasbs://{blob_container_name}@{_blob_host}/"

# Register the SAS token so Spark can authenticate against the container.
spark.conf.set(f"fs.azure.sas.{blob_container_name}.{_blob_host}", blob_sas_token)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 7, Finished, Available)
# Locations of the Reddit parquet datasets inside the blob container.
comments_path, submissions_path = (
    "reddit-parquet/comments/",
    "reddit-parquet/submissions/",
)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 8, Finished, Available)
# Load the full comments and submissions datasets from blob storage.
comments_df, submissions_df = (
    spark.read.parquet(wasbs_base_url + dataset_path)
    for dataset_path in (comments_path, submissions_path)
)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 9, Finished, Available)
import pyspark.sql.functions as F
from pyspark.sql.functions import sum as _sum, mean, stddev, max as _max, min as _min, count, percentile_approx, year, month, dayofmonth, ceil, col, dayofweek, hour, explode, date_format, lower, size, split, regexp_replace, isnan, when
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 10, Finished, Available)
# Franchise subreddits to keep, as a single comma-separated string.
subreddits = "pokemon, Naruto, dbz, OnePiece, yugioh, digimon, StardustCrusaders, Gundam, DetectiveConan, OneTruthPrevails, DemonSlayerAnime, attackontitan, TokyoGhoul, swordartonline, NeonGenesisEvangelion, ShokugekiNoSoma, OnePunchMan, KillLaKill, Kaguya_sama"

# Split the string into a list of subreddit names.
subreddit_list = subreddits.split(", ")

# Membership test on the subreddit column. `isin` expresses the same predicate
# as OR-ing one equality per name, without hand-building the expression.
# `filter_condition` is reused below to filter the comments DataFrame.
filter_condition = col("subreddit").isin(subreddit_list)

# Keep only submissions posted in the selected subreddits.
submissions = submissions_df.filter(filter_condition)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 11, Finished, Available)
# Keep only comments posted in the selected subreddits, reusing the
# predicate built for the submissions.
comments = comments_df.where(filter_condition)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 12, Finished, Available)
# Number of submissions per subreddit, printed as a table.
submissions_per_subreddit = submissions.groupBy('subreddit').count()
submissions_per_subreddit.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 6, 13, Finished, Available)
+--------------------+------+
|           subreddit| count|
+--------------------+------+
|NeonGenesisEvange...|  5994|
|         Kaguya_sama| 15163|
|             pokemon|355231|
|   StardustCrusaders| 79635|
|              yugioh| 85497|
|     ShokugekiNoSoma|  2872|
|            OnePiece|311736|
|          TokyoGhoul| 12751|
|       attackontitan| 89525|
|    OneTruthPrevails|  8755|
|      swordartonline| 16315|
|              Gundam| 46466|
|                 dbz| 53576|
|         OnePunchMan| 87300|
|             digimon| 41472|
|              Naruto|148262|
|    DemonSlayerAnime| 37018|
|          KillLaKill|  9934|
|      DetectiveConan|  2334|
+--------------------+------+
# Submission fields carried forward into cleaning and analysis.
submission_columns = [
    "subreddit",
    "author",
    "author_flair_text",
    "created_utc",
    "title",
    "selftext",
    "num_comments",
    "num_crossposts",
    "over_18",
    "score",
    "stickied",
    "id",
]
sub = submissions.select(*submission_columns)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 13, Finished, Available)
# Preview the first 20 selected submissions (long values are truncated).
sub.show(n=20, truncate=True)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 14, Finished, Available)
+-----------------+-------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
|        subreddit|             author|   author_flair_text|        created_utc|               title|            selftext|num_comments|num_crossposts|over_18|score|stickied|     id|
+-----------------+-------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
|          digimon|           kuroi110|                null|2023-02-22 21:08:07|Visiting Japan in...|From what I under...|          11|             0|  false|   10|   false|119d7f0|
|StardustCrusaders|        Atticus1301|                null|2023-02-22 21:08:41|          OC [by me]|                    |          11|             0|  false|  590|   false|119d878|
|          pokemon|    OneWhoGetsBread|                null|2023-02-22 21:08:56|the PokemonTogeth...|So several days a...|           0|             0|  false|    6|   false|119d8jo|
|         OnePiece|Fluid_Implement9799|                null|2023-02-22 21:09:58|the royal rumble ...|           [removed]|           1|             0|  false|    1|   false|119d9z9|
|         OnePiece|         hockeystew|                null|2023-02-22 21:11:29|Can someone help ...|https://mangabudd...|           0|             0|  false|    1|   false|119dcdb|
|           Gundam|          ncswisher|                null|2023-02-22 21:11:53|Gundam Design Fee...|           [removed]|           0|             0|  false|    1|   false|119dcyw|
|          pokemon|            lumixod|                null|2023-02-22 21:13:47|Wholesome friendship|                    |          69|             0|  false| 1947|   false|119dfv9|
|         OnePiece|        lampione784|                null|2023-02-22 21:15:21|What One piece ga...|The creator of th...|           2|             0|  false|    1|   false|119dicx|
|         OnePiece|      Many_Line9136|                null|2023-02-22 21:15:28|Oda needs to stop...|           [removed]|           1|             0|  false|    1|   false|119dijp|
|          pokemon|           KubfuKid|                null|2023-02-22 21:16:45|(OC) Diego the Gh...|                    |           2|             0|  false|   18|   false|119dkh9|
|          pokemon|  blackjackgabbiani|                null|2023-02-22 21:16:58|Who's a non-villa...|For me, Tyme insp...|         372|             0|  false|  406|   false|119dkrj|
|          pokemon|          [deleted]|                null|2023-02-22 21:17:16|Zard Hunting in B...|           [removed]|           0|             0|  false|    1|   false|119dl6k|
|              dbz|        Badj0jo_009|                null|2023-02-22 21:17:30|What do you guys ...|                    |           0|             0|  false|    1|   false|119dli5|
|          pokemon|  Totallynotttegegg|                null|2023-02-22 21:17:58|i have a realization|​\n\n[...|           0|             0|  false|    5|   false|119dm5g|
|         OnePiece|          [deleted]|                null|2023-02-22 21:18:02| My pirate-kittens!!|           [removed]|           2|             0|  false|    3|   false|119dm8c|
|StardustCrusaders|      SombraDragonv|                null|2023-02-22 21:18:19|How to take ZkSyn...|           [removed]|           0|             0|  false|    1|   false|119dmn4|
|          pokemon|            MeiLei-|:906::722::495::2...|2023-02-22 21:18:19|bored in class. h...|                    |           6|             0|  false|    0|   false|119dmnh|
|         OnePiece|          [deleted]|                null|2023-02-22 21:19:29|        BONK !!!😍😆|           [removed]|           1|             0|  false|    0|   false|119dobb|
|          pokemon|          [deleted]|                null|2023-02-22 21:19:53|Original 151, cau...|           [deleted]|          15|             0|  false|  153|   false|119dp0i|
|         OnePiece|          [deleted]|                null|2023-02-22 21:20:24|          SHANKS ART|           [removed]|           0|             0|  false|    1|   false|119dpwd|
+-----------------+-------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
only showing top 20 rows
# Number of comments per subreddit, printed as a table.
comments_per_subreddit = comments.groupBy('subreddit').count()
comments_per_subreddit.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 7, 17, Finished, Available)
+--------------------+-------+
|           subreddit|  count|
+--------------------+-------+
|NeonGenesisEvange...|  31571|
|         Kaguya_sama| 229630|
|             pokemon|5703713|
|   StardustCrusaders| 763751|
|              yugioh|1238280|
|     ShokugekiNoSoma|  18705|
|            OnePiece|7166824|
|          TokyoGhoul|  81979|
|       attackontitan| 691710|
|    OneTruthPrevails|  67709|
|      swordartonline| 211009|
|              Gundam| 842233|
|                 dbz| 553935|
|         OnePunchMan|1848632|
|          KillLaKill|  82956|
|             digimon| 528217|
|      DetectiveConan|  11357|
|              Naruto|2146971|
|    DemonSlayerAnime| 324778|
+--------------------+-------+
# Comment fields carried forward into cleaning and analysis.
comment_columns = [
    "subreddit",
    "author",
    "author_flair_text",
    "created_utc",
    "body",
    "controversiality",
    "score",
    "parent_id",
    "stickied",
    "link_id",
    "id",
]
com = comments.select(*comment_columns)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 14, Finished, Available)
# Preview the first 20 selected comments (long values are truncated).
com.show(n=20, truncate=True)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 16, Finished, Available)
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|subreddit|              author|   author_flair_text|        created_utc|                body|controversiality|score| parent_id|stickied|  link_id|     id|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|   Gundam|        Kn1ght0fv01d|                null|2021-06-23 08:32:41|Looks at the extr...|               0|    1| t3_o5obrx|   false|t3_o5obrx|h2qxm9h|
| OnePiece|            kitay123|       Bounty Hunter|2021-06-23 08:32:48|What's so funny a...|               1|   -4| t3_o63f5c|   false|t3_o63f5c|h2qxmj8|
| OnePiece|          Darkmithra|                null|2021-06-23 08:32:50|They want to mono...|               0|    3|t1_h2qwtee|   false|t3_o5l1y6|h2qxmlb|
| OnePiece|            nehc9050|                null|2021-06-23 08:32:55|I'm confused, I'm...|               0|    1|t1_h2q924w|   false|t3_o5l1y6|h2qxmrn|
| OnePiece| Sea-Improvement5038|                null|2021-06-23 08:32:55|nor do i get why ...|               0|    1|t1_h2qwsd4|   false|t3_o5l1y6|h2qxmrv|
|   Naruto|            aleky254|                    |2021-06-23 08:32:59|Lol. Nagato is a ...|               0|    1|t1_h2qw5vx|   false|t3_o51egf|h2qxmvx|
| OnePiece|          AnudeStart|                null|2021-06-23 08:33:10|Meth. Coke isn’t ...|               0|    4|t1_h2qws0o|   false|t3_o5l1y6|h2qxnb1|
|   Naruto|        EDR-Basement|                    |2021-06-23 08:33:14|Kakashi, Neji, Sh...|               0|    2| t3_o66ix9|   false|t3_o66ix9|h2qxnhc|
|  pokemon|            KatonRyu|                null|2021-06-23 08:33:18|I still play like...|               0|    1| t3_o5h6hg|   false|t3_o5h6hg|h2qxnm1|
|   yugioh|        Katze1Punkt0|:att-water: Iced ...|2021-06-23 08:33:26|Thats because *it...|               0|    1| t3_o62oyc|   false|t3_o62oyc|h2qxnwt|
| OnePiece|The_Edgiest_Edgelord|                null|2021-06-23 08:33:34|The jacket jacket...|               0|    4|t1_h2qxmlb|   false|t3_o5l1y6|h2qxo85|
| OnePiece|           sekhon_98|                null|2021-06-23 08:33:36|Theoretically the...|               0|    1|t1_h2qwtee|   false|t3_o5l1y6|h2qxoac|
| OnePiece|  KnowledgeNorth6337|                null|2021-06-23 08:33:37|The no one knowin...|               0|    6| t3_o5l1y6|   false|t3_o5l1y6|h2qxobx|
| OnePiece|      mathemagician0|              Lurker|2021-06-23 08:33:40|still don't like ...|               0|    5|t1_h2qww6e|   false|t3_o5l1y6|h2qxogn|
|   Naruto|        KhaoticTwist|                    |2021-06-23 08:33:55|Isshiki is defini...|               0|   -3| t3_o67sfh|   false|t3_o67sfh|h2qxp0j|
|   Naruto|         WeedyNaruto|                    |2021-06-23 08:34:00|> did you not ...|               0|    1|t1_h2qx89o|   false|t3_o5k2rd|h2qxp7h|
| OnePiece|          JollyBlaze|                null|2021-06-23 08:34:01|If the CP9 kept t...|               0|   -1| t3_o5l1y6|   false|t3_o5l1y6|h2qxp9h|
| OnePiece|        Dumpling2104|                null|2021-06-23 08:34:03|Maybe that’s why ...|               0|    2|t1_h2qu81f|   false|t3_o5l1y6|h2qxpce|
|  pokemon|        purejackbaby|                null|2021-06-23 08:34:09|Could you like, g...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qxpjq|
| OnePiece|         Mr_Lectures|                null|2021-06-23 08:34:13|well sadly i saw ...|               0|    1|t1_h2qx75y|   false|t3_o5l1y6|h2qxppz|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
only showing top 20 rows

Data Cleaning

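  1. Filter out unusable rows (empty, [deleted], or [removed] text, and [deleted] or [removed] authors)
  2. Clean text data (lowercase, remove non-alphanumeric characters, collapse extra whitespace)
  3. Create new columns (date, year, hour, day of week, month, cleaned text, word count)
    • Compute the word count on the cleaned text for each row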
# Values Reddit leaves behind when content or an account is gone.
_sub_placeholders = ["[deleted]", "[removed]"]
# Text fields are also rejected when they are empty.
_sub_unusable_text = ["", *_sub_placeholders]

# Keep submissions whose title, selftext and author are all usable.
# Rows with a null in any of these columns fail the predicate and are dropped.
sub = sub.where(
    ~col('title').isin(_sub_unusable_text)
    & ~col('selftext').isin(_sub_unusable_text)
    & ~col('author').isin(_sub_placeholders)
)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 15, Finished, Available)
# Values Reddit leaves behind when content or an account is gone.
_com_placeholders = ["[deleted]", "[removed]"]

# Keep comments with a non-empty, non-placeholder body and a real author.
# Rows with a null body or author fail the predicate and are dropped.
com = com.where(
    ~col('body').isin(["", *_com_placeholders])
    & ~col('author').isin(_com_placeholders)
)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 16, Finished, Available)
# Derive date parts, cleaned text and word counts for every submission.
#
# Removing punctuation can leave a leading/trailing space, and text made up
# only of symbols becomes an empty string. Splitting such text on ' ' yields
# empty tokens, so the cleaned text is trimmed and an empty text is counted as
# 0 words rather than 1.
sub_cleaned = (
    sub
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd")) # date as yyyy-MM-dd string
    .withColumn("created_hour", hour("created_utc")) # hour of day
    .withColumn("created_week", dayofweek("created_utc")) # day of the week (1 = Sunday ... 7 = Saturday)
    .withColumn("created_month", month("created_utc")) # month of the year
    .withColumn("created_year", year("created_utc")) # year
    .withColumn("title", lower(col('title'))) # text cleaning: lowercase
    .withColumn("selftext", lower(col('selftext'))) # text cleaning: lowercase
    .withColumn("cleaned_title", regexp_replace(col('title'), r'[^a-zA-Z0-9\s]', '')) # keep only letters, digits and whitespace
    .withColumn("cleaned_title", F.trim(regexp_replace(col('cleaned_title'), r'\s+', ' '))) # collapse whitespace runs and trim the ends
    .withColumn(
        'title_wordCount',
        when(col('cleaned_title') == '', 0).otherwise(size(split(col('cleaned_title'), ' '))),
    ) # word count; empty text counts as 0
    .withColumn("cleaned_selftext", regexp_replace(col('selftext'), r'[^a-zA-Z0-9\s]', '')) # keep only letters, digits and whitespace
    .withColumn("cleaned_selftext", F.trim(regexp_replace(col('cleaned_selftext'), r'\s+', ' '))) # collapse whitespace runs and trim the ends
    .withColumn(
        'selftext_wordCount',
        when(col('cleaned_selftext') == '', 0).otherwise(size(split(col('cleaned_selftext'), ' '))),
    ) # word count; empty text counts as 0
)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 17, Finished, Available)
# Derive date parts, cleaned text and a word count for every comment.
#
# Removing punctuation can leave a leading/trailing space, and a body made up
# only of symbols becomes an empty string. Splitting such text on ' ' yields
# empty tokens, so the cleaned text is trimmed and an empty text is counted as
# 0 words rather than 1.
com_cleaned = (
    com
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd")) # date as yyyy-MM-dd string
    .withColumn("created_hour", hour("created_utc")) # hour of day
    .withColumn("created_week", dayofweek("created_utc")) # day of the week (1 = Sunday ... 7 = Saturday)
    .withColumn("created_month", month("created_utc")) # month of the year
    .withColumn("created_year", year("created_utc")) # year
    .withColumn("body", lower(col('body'))) # text cleaning: lowercase
    .withColumn("cleaned", regexp_replace(col('body'), r'[^a-zA-Z0-9\s]', '')) # keep only letters, digits and whitespace
    .withColumn("cleaned", F.trim(regexp_replace(col('cleaned'), r'\s+', ' '))) # collapse whitespace runs and trim the ends
    .withColumn(
        'body_wordCount',
        when(col('cleaned') == '', 0).otherwise(size(split(col('cleaned'), ' '))),
    ) # word count; empty text counts as 0
)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 18, Finished, Available)
# Mark the cleaned submissions for caching so later actions can reuse them.
sub_cleaned.cache()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 21, Finished, Available)
DataFrame[subreddit: string, author: string, author_flair_text: string, created_utc: timestamp, title: string, selftext: string, num_comments: bigint, num_crossposts: bigint, over_18: boolean, score: bigint, stickied: boolean, id: string, created_date: string, created_hour: int, created_week: int, created_month: int, created_year: int, cleaned_title: string, title_wordCount: int, cleaned_selftext: string, selftext_wordCount: int]
# Preview the first 20 cleaned submissions (long values are truncated).
sub_cleaned.show(n=20, truncate=True)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 22, Finished, Available)
+-----------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
|        subreddit|              author|   author_flair_text|        created_utc|               title|            selftext|num_comments|num_crossposts|over_18|score|stickied|     id|created_date|created_hour|created_week|created_month|created_year|       cleaned_title|title_wordCount|    cleaned_selftext|selftext_wordCount|
+-----------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
|          digimon|            kuroi110|                null|2023-02-22 21:08:07|visiting japan in...|from what i under...|          11|             0|  false|   10|   false|119d7f0|  2023-02-22|          21|           4|            2|        2023|visiting japan in...|             19|from what i under...|                47|
|          pokemon|     OneWhoGetsBread|                null|2023-02-22 21:08:56|the pokemontogeth...|so several days a...|           0|             0|  false|    6|   false|119d8jo|  2023-02-22|          21|           4|            2|        2023|the pokemontogeth...|              3|so several days a...|               241|
|         OnePiece|          hockeystew|                null|2023-02-22 21:11:29|can someone help ...|https://mangabudd...|           0|             0|  false|    1|   false|119dcdb|  2023-02-22|          21|           4|            2|        2023|can someone help ...|             11|httpsmangabuddyco...|                30|
|         OnePiece|         lampione784|                null|2023-02-22 21:15:21|what one piece ga...|the creator of th...|           2|             0|  false|    1|   false|119dicx|  2023-02-22|          21|           4|            2|        2023|what one piece ga...|              6|the creator of th...|                22|
|          pokemon|   blackjackgabbiani|                null|2023-02-22 21:16:58|who's a non-villa...|for me, tyme insp...|         372|             0|  false|  406|   false|119dkrj|  2023-02-22|          21|           4|            2|        2023|whos a nonvillain...|              9|for me tyme inspi...|               122|
|          pokemon|   Totallynotttegegg|                null|2023-02-22 21:17:58|i have a realization|​\n\n[...|           0|             0|  false|    5|   false|119dm5g|  2023-02-22|          21|           4|            2|        2023|i have a realization|              4|ampx200b brock is...|                 6|
|         OnePiece|         Minigeneius|                null|2023-02-22 21:20:28|thoughts on 10th ...|i've read all of ...|           3|             0|  false|    0|   false|119dpzv|  2023-02-22|          21|           4|            2|        2023|thoughts on 10th ...|              5|ive read all of o...|                72|
|          pokemon|     coolnessAlert39|                null|2023-02-22 21:26:11|name any bug type...|ok now we’re doin...|          32|             0|  false|    0|   false|119dymy|  2023-02-22|          21|           4|            2|        2023|name any bug type...|             15|ok now were doing...|                81|
|         OnePiece|             vixnvox|              Marine|2023-02-22 21:30:18|wtf is going on w...|he is such a rand...|           1|             0|  false|    0|   false|119e4cs|  2023-02-22|          21|           4|            2|        2023|wtf is going on w...|              6|he is such a rand...|                30|
|          pokemon|          Ill-Ad3844|customise me! :02...|2023-02-21 05:28:13|my starters for e...|gen 1: charizard ...|           1|             0|  false|    0|   false|117vi5f|  2023-02-21|           5|           3|            2|        2023|my starters for e...|              5|gen 1 charizard g...|                50|
|          pokemon|       DenseRead9852|                null|2023-02-21 05:33:56|i really fucking ...|i feel like it's ...|          13|             0|  false|    0|   false|117vluc|  2023-02-21|           5|           3|            2|        2023|i really fucking ...|             24|i feel like its f...|                65|
|         OnePiece|    VA_Monkey_D_Garp|                null|2023-02-21 05:38:14|just a rant about...|garp quite litera...|           0|             0|  false|    8|   false|117vohs|  2023-02-21|           5|           3|            2|        2023|just a rant about...|              5|garp quite litera...|               274|
|       TokyoGhoul|           panling69|                null|2023-02-21 05:39:30|if anyone wants a...|hello everyone, j...|           5|             0|  false|    7|   false|117vpbl|  2023-02-21|           5|           3|            2|        2023|if anyone wants a...|             18|hello everyone ju...|                43|
|         OnePiece| hopefulindiegamedev|                null|2023-02-21 05:44:42|can't tell if its...|[https://www.yout...|           0|             0|  false|    1|   false|117vsl4|  2023-02-21|           5|           3|            2|        2023|cant tell if its ...|             28|httpswwwyoutubeco...|                12|
|         OnePiece|The-seven-deadly-sin|                null|2023-02-21 05:44:52|could a fishman l...|               title|           8|             0|  false|    2|   false|117vsoi|  2023-02-21|           5|           3|            2|        2023|could a fishman l...|             24|               title|                 1|
|           Naruto|          FosterPupz|                    |2023-02-21 05:45:12|       just a giggle|i just wanted to ...|           3|             0|  false|    4|   false|117vsx3|  2023-02-21|           5|           3|            2|        2023|       just a giggle|              3|i just wanted to ...|               128|
| DemonSlayerAnime|   Vivid-Balance9658|                null|2023-02-21 05:57:54|giyushino fanfict...|hello everyone! i...|           2|             0|  false|    1|   false|117w0xl|  2023-02-21|           5|           3|            2|        2023|giyushino fanfiction|              2|hello everyone i ...|                57|
|           Naruto|        juankruh1250|                    |2023-02-21 05:58:17|what are some of ...|i'll start with t...|          22|             0|  false|    4|   false|117w15s|  2023-02-21|           5|           3|            2|        2023|what are some of ...|             12|ill start with th...|                54|
|StardustCrusaders|       Many_Line9136|                null|2023-02-21 05:58:58|arrow saga takeaw...|i was walking bac...|           0|             0|  false|    5|   false|117w1ks|  2023-02-21|           5|           3|            2|        2023|arrow saga takeaw...|              4|i was walking bac...|               227|
|         OnePiece|      chiep-the-riep|                null|2023-02-21 06:00:22|    laughtale island|laughtale is just...|           3|             0|  false|    2|   false|117w2jw|  2023-02-21|           6|           3|            2|        2023|    laughtale island|              2|laughtale is just...|               164|
+-----------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
only showing top 20 rows
# Mark the cleaned comments for caching so later actions can reuse them.
com_cleaned.cache()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 23, Finished, Available)
DataFrame[subreddit: string, author: string, author_flair_text: string, created_utc: timestamp, body: string, controversiality: bigint, score: bigint, parent_id: string, stickied: boolean, link_id: string, id: string, created_date: string, created_hour: int, created_week: int, created_month: int, created_year: int, cleaned: string, body_wordCount: int]
# Preview the first 20 cleaned comments (long values are truncated).
com_cleaned.show(n=20, truncate=True)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 24, Finished, Available)
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|subreddit|              author|   author_flair_text|        created_utc|                body|controversiality|score| parent_id|stickied|  link_id|     id|created_date|created_hour|created_week|created_month|created_year|             cleaned|body_wordCount|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|   Gundam|        Kn1ght0fv01d|                null|2021-06-23 08:32:41|looks at the extr...|               0|    1| t3_o5obrx|   false|t3_o5obrx|h2qxm9h|  2021-06-23|           8|           4|            6|        2021|looks at the extr...|            16|
| OnePiece|            kitay123|       Bounty Hunter|2021-06-23 08:32:48|what's so funny a...|               1|   -4| t3_o63f5c|   false|t3_o63f5c|h2qxmj8|  2021-06-23|           8|           4|            6|        2021|whats so funny ab...|             6|
| OnePiece|          Darkmithra|                null|2021-06-23 08:32:50|they want to mono...|               0|    3|t1_h2qwtee|   false|t3_o5l1y6|h2qxmlb|  2021-06-23|           8|           4|            6|        2021|they want to mono...|            15|
| OnePiece|            nehc9050|                null|2021-06-23 08:32:55|i'm confused, i'm...|               0|    1|t1_h2q924w|   false|t3_o5l1y6|h2qxmrn|  2021-06-23|           8|           4|            6|        2021|im confused im ju...|            12|
| OnePiece| Sea-Improvement5038|                null|2021-06-23 08:32:55|nor do i get why ...|               0|    1|t1_h2qwsd4|   false|t3_o5l1y6|h2qxmrv|  2021-06-23|           8|           4|            6|        2021|nor do i get why ...|             8|
|   Naruto|            aleky254|                    |2021-06-23 08:32:59|lol. nagato is a ...|               0|    1|t1_h2qw5vx|   false|t3_o51egf|h2qxmvx|  2021-06-23|           8|           4|            6|        2021|lol nagato is a b...|             9|
| OnePiece|          AnudeStart|                null|2021-06-23 08:33:10|meth. coke isn’t ...|               0|    4|t1_h2qws0o|   false|t3_o5l1y6|h2qxnb1|  2021-06-23|           8|           4|            6|        2021|meth coke isnt li...|             5|
|   Naruto|        EDR-Basement|                    |2021-06-23 08:33:14|kakashi, neji, sh...|               0|    2| t3_o66ix9|   false|t3_o66ix9|h2qxnhc|  2021-06-23|           8|           4|            6|        2021|kakashi neji shin...|             6|
|  pokemon|            KatonRyu|                null|2021-06-23 08:33:18|i still play like...|               0|    1| t3_o5h6hg|   false|t3_o5h6hg|h2qxnm1|  2021-06-23|           8|           4|            6|        2021|i still play like...|            28|
|   yugioh|        Katze1Punkt0|:att-water: Iced ...|2021-06-23 08:33:26|thats because *it...|               0|    1| t3_o62oyc|   false|t3_o62oyc|h2qxnwt|  2021-06-23|           8|           4|            6|        2021|thats because it ...|             5|
| OnePiece|The_Edgiest_Edgelord|                null|2021-06-23 08:33:34|the jacket jacket...|               0|    4|t1_h2qxmlb|   false|t3_o5l1y6|h2qxo85|  2021-06-23|           8|           4|            6|        2021|the jacket jacket...|             8|
| OnePiece|           sekhon_98|                null|2021-06-23 08:33:36|theoretically the...|               0|    1|t1_h2qwtee|   false|t3_o5l1y6|h2qxoac|  2021-06-23|           8|           4|            6|        2021|theoretically the...|            29|
| OnePiece|  KnowledgeNorth6337|                null|2021-06-23 08:33:37|the no one knowin...|               0|    6| t3_o5l1y6|   false|t3_o5l1y6|h2qxobx|  2021-06-23|           8|           4|            6|        2021|the no one knowin...|            51|
| OnePiece|      mathemagician0|              Lurker|2021-06-23 08:33:40|still don't like ...|               0|    5|t1_h2qww6e|   false|t3_o5l1y6|h2qxogn|  2021-06-23|           8|           4|            6|        2021|still dont like w...|            78|
|   Naruto|        KhaoticTwist|                    |2021-06-23 08:33:55|isshiki is defini...|               0|   -3| t3_o67sfh|   false|t3_o67sfh|h2qxp0j|  2021-06-23|           8|           4|            6|        2021|isshiki is defini...|           150|
|   Naruto|         WeedyNaruto|                    |2021-06-23 08:34:00|> did you not ...|               0|    1|t1_h2qx89o|   false|t3_o5k2rd|h2qxp7h|  2021-06-23|           8|           4|            6|        2021|gt did you not se...|            86|
| OnePiece|          JollyBlaze|                null|2021-06-23 08:34:01|if the cp9 kept t...|               0|   -1| t3_o5l1y6|   false|t3_o5l1y6|h2qxp9h|  2021-06-23|           8|           4|            6|        2021|if the cp9 kept t...|            61|
| OnePiece|        Dumpling2104|                null|2021-06-23 08:34:03|maybe that’s why ...|               0|    2|t1_h2qu81f|   false|t3_o5l1y6|h2qxpce|  2021-06-23|           8|           4|            6|        2021|maybe thats why s...|            10|
|  pokemon|        purejackbaby|                null|2021-06-23 08:34:09|could you like, g...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qxpjq|  2021-06-23|           8|           4|            6|        2021|could you like go...|            28|
| OnePiece|         Mr_Lectures|                null|2021-06-23 08:34:13|well sadly i saw ...|               0|    1|t1_h2qx75y|   false|t3_o5l1y6|h2qxppz|  2021-06-23|           8|           4|            6|        2021|well sadly i saw ...|             9|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
only showing top 20 rows

Save cleaned data

# AzureML datastore URI prefix for the workspace blob store.
datastore = 'azureml://datastores/workspaceblobstore/paths'

# Storage account and container backing the workspace blob store.
workspace_default_storage_account = "group09astorage08f5ea16c"
workspace_default_container = "azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"

# Host name of the workspace storage account.
_workspace_host = f"{workspace_default_storage_account}.blob.core.windows.net"

# https://<STORAGE-ACCOUNT>.blob.core.windows.net/<CONTAINER-NAME>
Storage_URI = f"https://{_workspace_host}/{workspace_default_container}"

# Root wasbs:// URL of the workspace container (ends with "/").
workspace_wasbs_base_url = f"wasbs://{workspace_default_container}@{_workspace_host}/"
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 25, Finished, Available)
# Persist the cleaned submissions to the workspace container, replacing the
# output of any earlier run. The base URL already ends with "/", so the file
# name is appended directly to avoid a "//" in the path.
sub_cleaned.write.mode("overwrite").parquet(f"{workspace_wasbs_base_url}franchise_sub_cleaned.parquet")
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 7, 27, Finished, Available)
# Persist the cleaned comments to the workspace container. "overwrite" mode
# matches the submissions write and lets the cell be re-run; without it the
# write fails once the path exists. The base URL already ends with "/", so the
# file name is appended directly to avoid a "//" in the path.
com_cleaned.write.mode("overwrite").parquet(f"{workspace_wasbs_base_url}franchise_com_cleaned.parquet")
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 7, 29, Finished, Available)
# AzureML datastore URI prefix for the workspace blob store.
datastore = 'azureml://datastores/workspaceblobstore/paths'

# Reload the saved cleaned datasets: submissions first, then comments.
submissions, comments = (
    spark.read.parquet(f"{datastore}/{parquet_name}")
    for parquet_name in ("franchise_sub_cleaned.parquet", "franchise_com_cleaned.parquet")
)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 8, 21, Finished, Available)
# Point `submissions` and `comments` at the cleaned DataFrames built in this
# session; this replaces whatever those names were bound to before.
submissions, comments = sub_cleaned, com_cleaned
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 19, Finished, Available)
# Notebook magic (not plain Python): make sure plotly is installed in the
# kernel environment before the plotting imports below.
%pip install plotly
StatementMeta(, , -1, Finished, Available)
Requirement already satisfied: plotly in /home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages (4.14.3)
Requirement already satisfied: retrying>=1.3.3 in /home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages (from plotly) (1.3.3)
Requirement already satisfied: six in /home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages (from plotly) (1.16.0)
WARNING: You are using pip version 22.0.4; however, version 23.3.1 is available.
You should consider upgrading via the '/nfs4/pyenv-a84f12ee-d053-42ac-befe-f5a4ce7804a6/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
Warning: PySpark kernel has been restarted to use updated packages.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
from plotly.subplots import make_subplots
import pyspark.sql.types as T
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 20, Finished, Available)
# Preview the comments DataFrame (prints the first rows as a text table)
comments.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 8, 22, Finished, Available)
+---------+--------------------+-----------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|subreddit|              author|author_flair_text|        created_utc|                body|controversiality|score| parent_id|stickied|  link_id|     id|created_date|created_hour|created_week|created_month|created_year|             cleaned|body_wordCount|
+---------+--------------------+-----------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|  pokemon|        BucketHeadJr|             null|2021-02-26 15:35:03|do they? the enco...|               0|    8|t1_gou6k94|   false|t3_lszxq0|gou70b1|  2021-02-26|          15|           6|            2|        2021|do they the encou...|            23|
|  pokemon|           salgadosp|             null|2021-02-26 15:35:04|yeah but how will...|               0|    9|t1_gou6w8h|   false|t3_lt01hi|gou70bq|  2021-02-26|          15|           6|            2|        2021|yeah but how will...|            13|
|  pokemon|            moiraixo|             null|2021-02-26 15:35:04|i was meant to vo...|               0|    1| t3_lt01hi|   false|t3_lt01hi|gou70bt|  2021-02-26|          15|           6|            2|        2021|i was meant to vo...|            10|
| OnePiece|         tylionheart|       Cipher Pol|2021-02-26 15:35:04|well hope you wer...|               0|    0|t1_gou6j3l|   false|t3_lsta9h|gou70d0|  2021-02-26|          15|           6|            2|        2021|well hope you wer...|            12|
|  pokemon|             Ethacon|             null|2021-02-26 15:35:05|i just hope it ha...|               0|    1| t3_lszxq0|   false|t3_lszxq0|gou70ds|  2021-02-26|          15|           6|            2|        2021|i just hope it ha...|            12|
|  pokemon|            fedemasa|             null|2021-02-26 15:35:05|drifloon is an aw...|               0|    6|t1_gotw6di|   false|t3_lsycl4|gou70eg|  2021-02-26|          15|           6|            2|        2021|drifloon is an aw...|             5|
|  pokemon|JustAHipsterInDenial|             null|2021-02-26 15:35:05|rowlet is my one ...|               0|   22|t1_gou5i4t|   false|t3_lsztja|gou70f8|  2021-02-26|          15|           6|            2|        2021|rowlet is my one ...|            27|
|  pokemon|     PlusUltraTaylor|             null|2021-02-26 15:35:06|it's not made by ...|               0|    1| t3_lt03jh|   false|t3_lt03jh|gou70hi|  2021-02-26|          15|           6|            2|        2021|its not made by g...|             6|
|  pokemon|        coolfreshguy|             null|2021-02-26 15:35:06|definitely gonna ...|               0|    1|t1_gou5uzs|   false|t3_lsycl4|gou70hr|  2021-02-26|          15|           6|            2|        2021|definitely gonna ...|             4|
|  pokemon|            Mastreos|             null|2021-02-26 15:35:06|tbh i love it. i'...|               0|    1| t3_lszzxu|   false|t3_lszzxu|gou70ij|  2021-02-26|          15|           6|            2|        2021|tbh i love it ive...|            11|
|  pokemon|         thebombyboi|             null|2021-02-26 15:35:06|also the animatio...|               0|    2|t1_gou6vvi|   false|t3_lsztja|gou70iv|  2021-02-26|          15|           6|            2|        2021|also the animatio...|            19|
|  pokemon|  WhiteSilverDragoon|             null|2021-02-26 15:35:07|i hate the artsty...|               0|    1| t3_lszxq0|   false|t3_lszxq0|gou70kv|  2021-02-26|          15|           6|            2|        2021|i hate the artsty...|            43|
| OnePiece|         Florissssss|             null|2021-02-26 15:35:07|after wci if sanj...|               0|   14| t3_lsta9h|   false|t3_lsta9h|gou70l5|  2021-02-26|          15|           6|            2|        2021|after wci if sanj...|            98|
|  pokemon|               TPGPC|             null|2021-02-26 15:35:07|honestly terrible...|               0|   14| t3_lszzxu|   false|t3_lszzxu|gou70l8|  2021-02-26|          15|           6|            2|        2021|honestly terrible...|            43|
|  pokemon|             Zenn-13|             null|2021-02-26 15:35:07|yup. i guess i ca...|               0|   14|t1_gou6q3c|   false|t3_lsycl4|gou70lv|  2021-02-26|          15|           6|            2|        2021|yup i guess i can...|            16|
|  pokemon|          T_Raycroft|             null|2021-02-26 15:35:07|isn’t that what p...|               0|   21| t3_lt03qw|   false|t3_lt03qw|gou70m1|  2021-02-26|          15|           6|            2|        2021|isnt that what pe...|            17|
|  pokemon|           Velocirob|             null|2021-02-26 15:35:08|no battle frontie...|               0|    5|t1_gou5qsg|   false|t3_lsztwl|gou70nj|  2021-02-26|          15|           6|            2|        2021|no battle frontie...|            13|
|  pokemon|  LookaLookaKooLaLey|             null|2021-02-26 15:35:08|seriously!!!! sws...|               1|    5| t3_lt00a8|   false|t3_lt00a8|gou70pq|  2021-02-26|          15|           6|            2|        2021|seriously swsh ar...|            32|
|  pokemon|            Dewottle|             null|2021-02-26 15:35:09|i thought this wa...|               0|    2| t3_lszz5o|   false|t3_lszz5o|gou70q2|  2021-02-26|          15|           6|            2|        2021|i thought this wa...|            23|
|  pokemon|            Oddish03|             null|2021-02-26 15:35:08|john hanke from n...|               0|    2|t1_gou3d9l|   false|t3_lsycl4|gou70q3|  2021-02-26|          15|           6|            2|        2021|john hanke from n...|             6|
+---------+--------------------+-----------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
only showing top 20 rows

Table: Comparative Analysis of Reddit Subreddits’ Activity

# Row totals for both datasets; reused below as the denominators of the percentage columns
total_comments, total_submissions = comments.count(), submissions.count()

print("Total Comments:", total_comments)
print("Total Submissions:", total_submissions)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 32, Finished, Available)
# Comments: per-subreddit row count, share of all comments (percent) and mean score
comments_grouped = (
    comments
    .groupBy('subreddit')
    .agg(
        count('subreddit').alias('comments_count'),
        (count('subreddit') / total_comments * 100).alias('comments_percentage'),
        mean('score').alias('avg_comments_score'),
    )
)

# Submissions: the same three metrics
submissions_grouped = (
    submissions
    .groupBy('subreddit')
    .agg(
        count('subreddit').alias('submissions_count'),
        (count('subreddit') / total_submissions * 100).alias('submissions_percentage'),
        mean('score').alias('avg_submissions_score'),
    )
)

# Outer join keeps subreddits present on only one side; their missing
# numeric metrics come back null and are replaced with 0.
result_table = (
    comments_grouped
    .join(submissions_grouped, on='subreddit', how='outer')
    .fillna(0)
)
result_table.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 33, Finished, Available)
# Collect the comparison table to the driver and keep a CSV copy of it
result_table_pandas = result_table.toPandas()
result_table_pandas.to_csv(path_or_buf='comparative_franchise_result_table.csv', index=False)

# Continue from the CSV kept under the project's data/csv folder.
# NOTE(review): this file name differs from the one written just above -
# confirm it holds the same table.
file_path = 'Users/hw487/fall-2023-reddit-project-team-09/data/csv/top_animes_avg_score_vs_count.csv'
result_table_pandas = pd.read_csv(filepath_or_buffer=file_path)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 36, Finished, Available)
# Inspect the header exactly as read from the CSV (reveals stray whitespace in names)
print("Original column names:", result_table_pandas.columns)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 39, Finished, Available)
Original column names: Index(['           subreddit', 'comments_count', 'comments_percentage',
       'avg_comments_score', 'submissions_count', 'submissions_percentage',
       'avg_submissions_score'],
      dtype='object')
# Trim leading/trailing whitespace from every column name, in place
result_table_pandas.columns = list(map(str.strip, result_table_pandas.columns))

# Confirm the cleaned header
print("Updated column names:", result_table_pandas.columns)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 40, Finished, Available)
Updated column names: Index(['subreddit', 'comments_count', 'comments_percentage',
       'avg_comments_score', 'submissions_count', 'submissions_percentage',
       'avg_submissions_score'],
      dtype='object')
# Write the table (now with cleaned column names) back to CSV, without the index column
result_table_pandas.to_csv(path_or_buf='comparative_franchise_result_table.csv', index=False)

EDA: Subreddit Activity Bubble Plot with Average Score Metrics

# Bubble sizes: comment counts rescaled so the largest subreddit maps to size_scale
size_scale = 10000  # Adjust this scale to get appropriately sized bubbles
comment_counts = result_table_pandas['comments_count']
bubble_size = (comment_counts / comment_counts.max()) * size_scale

# Axis / colour-bar captions shown instead of the raw column names
axis_labels = {
    'avg_comments_score': 'Average Comment Score',
    'avg_submissions_score': 'Average Submission Score',
    'comments_count': 'Comment Count',
}

# Average comment score (x) against average submission score (y);
# bubble size and colour both encode the comment count.
fig = px.scatter(
    result_table_pandas,
    x='avg_comments_score',
    y='avg_submissions_score',
    size=bubble_size,
    color='comments_count',
    color_continuous_scale='Purpor',
    hover_name='subreddit',  # subreddit name appears on hover
    labels=axis_labels,
    title='Bubble Plot of Reddit Activity by Subreddit',
    template='plotly_white',
)

fig.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 44, Finished, Available)

EDA: Comparative Monthly Comment Activity Across Selected Subreddits

# Count comments per (month, subreddit) in Spark, then collect the small result to pandas
monthly_comment_counts = (
    comments
    .withColumn('year_month', date_format('created_utc', 'yyyy-MM'))
    .groupBy('year_month', 'subreddit')
    .count()
)
comments_by_month_subreddit = monthly_comment_counts.toPandas()

# Subreddits to keep in the monthly comparison
subreddits_of_interest = ['pokemon', 'OnePiece', 'Naruto', 'OnePunchMan', 'yugioh', 'Gundam',
                          'StardustCrusaders', 'attackontitan', 'dbz', 'digimon', 'DemonSlayerAnime',
                          'Kaguya_sama', 'swordartonline', 'KillLaKill', 'TokyoGhoul']
is_of_interest = comments_by_month_subreddit['subreddit'].isin(subreddits_of_interest)
filtered_df = comments_by_month_subreddit[is_of_interest]

# Wide layout: one row per month (sorted ascending), one column per subreddit
pivot_df = filtered_df.pivot(index='year_month', columns='subreddit', values='count').sort_index()

# Hand-picked base colours; topped up below when more are needed
color_palette = ["#42a63c", "#42a1b9", "#d13a47", "#f7c200"]

desired_saturation = 0.5  # Decreased saturation

# If the base colours do not cover every subreddit of interest, append
# reduced-saturation HUSL colours (converted from RGB tuples to hex strings).
missing_colors = len(subreddits_of_interest) - len(color_palette)
if missing_colors > 0:
    color_palette.extend(
        mcolors.to_hex(rgb)
        for rgb in sns.husl_palette(missing_colors, s=desired_saturation)
    )

# Sanity check: one colour available per subreddit of interest
assert len(color_palette) >= len(subreddits_of_interest), "Not enough colors for the number of subreddits"

# Alternative palettes that were tried:
# blended_color_palette = sns.blend_palette(["#42a63c","#42a1b9","#967bb6","#d13a47","#f7c200"], 15)
# blended_color_palette = sns.blend_palette(["#42a63c","#f7c200","#d13a47","#967bb6","#42a1b9"], 15)

# Line chart: one trace per subreddit column of pivot_df
fig = go.Figure()

for i, subreddit in enumerate(pivot_df.columns):
    # Colours cycle through the palette if there are more traces than colours
    trace = go.Scatter(
        x=pivot_df.index,
        y=pivot_df[subreddit],
        mode='lines+markers',
        name=subreddit,
        line={'color': color_palette[i % len(color_palette)]},
    )
    fig.add_trace(trace)

# Titles, rotated month labels, and a horizontal legend placed below the plot area
fig.update_layout(
    title='Number of Comments Per Month by Subreddit',
    xaxis_title='Month',
    xaxis_tickangle=45,
    yaxis_title='Number of Comments',
    legend={'x': 0.5, 'y': -0.2, 'xanchor': 'center', 'orientation': 'h'},
    margin={'b': 150},  # extra bottom margin leaves room for the legend
)

fig.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 56, Finished, Available)
# Export the filtered monthly counts to CSV (index column omitted)
filtered_df.to_csv(path_or_buf='comments_monthly_filtered_cross_franchise.csv', index=False)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 99, Finished, Available)
# Read the filtered monthly counts back from the CSV under the project's data/csv folder
comments_filtered_path = 'Users/hw487/fall-2023-reddit-project-team-09/data/csv/comments_monthly_filtered_cross_franchise.csv'
filtered_df = pd.read_csv(filepath_or_buffer=comments_filtered_path)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 111, Finished, Available)
# Wide layout: one row per month (sorted ascending), one column per subreddit
pivot_df = filtered_df.pivot(index='year_month', columns='subreddit', values='count').sort_index()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 112, Finished, Available)
# Hand-picked base colours; topped up below when more are needed
color_palette = ["#42a63c", "#42a1b9", "#d13a47", "#f7c200"]

desired_saturation = 0.5  # Decreased saturation

# If the base colours do not cover every subreddit of interest, append
# reduced-saturation HUSL colours (converted from RGB tuples to hex strings).
missing_colors = len(subreddits_of_interest) - len(color_palette)
if missing_colors > 0:
    color_palette.extend(
        mcolors.to_hex(rgb)
        for rgb in sns.husl_palette(missing_colors, s=desired_saturation)
    )

# Sanity check: one colour available per subreddit of interest
assert len(color_palette) >= len(subreddits_of_interest), "Not enough colors for the number of subreddits"

# Start from an empty figure; traces are added in the following cell
fig = go.Figure()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 113, Finished, Available)
# How many consecutive traces (subreddits) share one legend group
traces_per_group = 3

# One line-with-markers trace per subreddit column
for i, subreddit in enumerate(pivot_df.columns):
    trace = go.Scatter(
        x=pivot_df.index,
        y=pivot_df[subreddit],
        mode='lines+markers',
        name=subreddit,
        # colours cycle through the palette when traces outnumber colours
        line={'color': color_palette[i % len(color_palette)]},
        # integer division buckets consecutive traces into the same group
        legendgroup=f'group{i // traces_per_group}',
    )
    fig.add_trace(trace)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 114, Finished, Available)
# Horizontal legend centred below the plot, with a gap between legend groups
legend_options = {
    'x': 0.5,
    'y': -0.5,
    'xanchor': 'center',
    'orientation': 'h',
    'tracegroupgap': 100,  # Adjust for spacing between groups
}

# Titles, rotated month labels, legend placement and a larger bottom margin
fig.update_layout(
    title='Number of Comments Per Month by Subreddit',
    xaxis_title='Month',
    xaxis_tickangle=45,
    yaxis_title='Number of Comments',
    legend=legend_options,
    margin={'b': 200},  # May need adjustment for legend
)

fig.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 115, Finished, Available)
# 15 colours interpolated between five anchor colours.
# Other anchor orders that were tried:
# blended_color_palette = sns.blend_palette(["#42a63c","#42a1b9","#967bb6","#d13a47","#f7c200"], 15)
# blended_color_palette = sns.blend_palette(['#d13a47', '#967bb6', '#42a1b9', '#42a63c', '#f7c200'], 15)
blended_color_palette = sns.blend_palette(
    ["#42a63c", "#f7c200", "#d13a47", "#967bb6", "#42a1b9"],
    n_colors=15,
)

# Plotly is given hex strings here, so convert each RGB tuple
hex_color_palette = list(map(mcolors.to_hex, blended_color_palette))
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 116, Finished, Available)
import plotly.graph_objects as go

# Same monthly line chart, coloured from the blended hex palette
fig = go.Figure()

for i, subreddit in enumerate(pivot_df.columns):
    trace = go.Scatter(
        x=pivot_df.index,
        y=pivot_df[subreddit],
        mode='lines+markers',
        name=subreddit,
        # colours cycle through the palette when traces outnumber colours
        line={'color': hex_color_palette[i % len(hex_color_palette)]},
        # every three consecutive traces share one legend group
        legendgroup=f'group{i // 3}',
    )
    fig.add_trace(trace)

# Horizontal legend centred below the plot
legend_options = {
    'x': 0.5,
    'y': -0.5,  # Adjust as needed
    'xanchor': 'center',
    'orientation': 'h',
    'tracegroupgap': 135,  # spacing between legend groups
    'itemsizing': 'constant',  # Keeps legend symbols the same size
}

fig.update_layout(
    title='Number of Comments Per Month by Subreddit',
    xaxis_title='Month',
    xaxis_tickangle=45,
    yaxis_title='Number of Comments',
    legend=legend_options,
    margin={'b': 200},  # extra bottom margin leaves room for the legend
)

fig.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 117, Finished, Available)

EDA: Pie Chart - Dominant Subreddits in Comment Proportions

# Hard-coded (subreddit, comment count) pairs used for the pie chart
subreddit_comment_counts = [
    ('NeonGenesisEvangelion', 31571),
    ('Kaguya_sama', 229630),
    ('pokemon', 5703713),
    ('StardustCrusaders', 763751),
    ('yugioh', 1238280),
    ('ShokugekiNoSoma', 18705),
    ('OnePiece', 7166824),
    ('TokyoGhoul', 81979),
    ('attackontitan', 691710),
    ('OneTruthPrevails', 67709),
    ('swordartonline', 211009),
    ('Gundam', 842233),
    ('dbz', 553935),
    ('OnePunchMan', 1848632),
    ('KillLaKill', 82956),
    ('digimon', 528217),
    ('DetectiveConan', 11357),
    ('Naruto', 2146971),
    ('DemonSlayerAnime', 324778),
]

# Column-oriented form expected by the DataFrame constructor
data = {
    'subreddit': [name for name, _ in subreddit_comment_counts],
    'count': [n_comments for _, n_comments in subreddit_comment_counts],
}

df = pd.DataFrame(data)

# Each subreddit's share of the summed counts, expressed in percent
total_count = df['count'].sum()
df['proportion'] = (df['count'] / total_count) * 100
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 21, Finished, Available)
# One colour per row of df, interpolated between four pastel anchor colours
blended_colors = sns.blend_palette(
    ["#d5ecf1", "#f4d7da", "#fdf3cb", "#d7edd9"],
    n_colors=len(df),
)

# Convert each RGB tuple to a hex string for use as the Plotly colour sequence
custom_colors = list(map(mcolors.to_hex, blended_colors))
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 22, Finished, Available)
# Donut chart (hole=0.3) of each subreddit's share of comments
fig = px.pie(
    df,
    names='subreddit',
    values='proportion',
    color_discrete_sequence=custom_colors,
    title='Proportion of Comments by Subreddit',
    hole=0.3,
)

# Print percentage and subreddit name inside each slice
fig.update_traces(textinfo='percent+label', textposition='inside')

# Horizontal legend centred below the chart
legend_options = {
    'orientation': 'h',
    'yanchor': 'bottom',
    'y': -0.7,  # Adjust position of legend
    'xanchor': 'center',
    'x': 0.5,
}
fig.update_layout(
    showlegend=True,
    legend_title_text='Subreddits',
    legend=legend_options,
)

fig.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 26, Finished, Available)

EDA: Boxplot Analysis - Score Distribution by Subreddit

import seaborn as sns
import matplotlib.pyplot as plt

# Theme and palette must be set BEFORE the figure/axes are created and the
# plots are drawn; calling them afterwards only affects later figures.
sns.set_style("whitegrid")
sns.set_palette("pastel")

# 10% sample of the comments, drawn without replacement with a fixed seed
sampled_comments = comments.sample(False, 0.1, seed=12)

# Only 'subreddit' and 'score' are needed; collect them to pandas for plotting
subreddit_scores = sampled_comments.select('subreddit', 'score').toPandas()

plt.figure(figsize=(14, 7))

# Horizontal box plot per subreddit, with outlier markers suppressed
sns.boxplot(x='score', y='subreddit', data=subreddit_scores, orient='h', showfliers=False)

# Overlay the individual sampled scores as small, semi-transparent grey points
sns.stripplot(x='score', y='subreddit', data=subreddit_scores, orient='h',
              color='grey', alpha=0.5, size=3)

# Set a log scale for the x-axis if needed
# plt.xscale('log') # Uncomment this line if score distribution is highly skewed

plt.title('Distribution of Scores Across Subreddits')
plt.xlabel('Score')
plt.ylabel('Subreddit')
plt.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 31, Finished, Available)

# Theme and palette must be set BEFORE the figure/axes are created and the
# plot is drawn; calling them afterwards only affects later figures.
sns.set_style("whitegrid")
sns.set_palette("pastel")

plt.figure(figsize=(14, 7))

# Horizontal box plot of score per subreddit, with outlier markers suppressed
sns.boxplot(x='score', y='subreddit', data=subreddit_scores, orient='h', showfliers=False)

plt.title('Distribution of Scores Across Subreddits')
plt.xlabel('Score')
plt.ylabel('Subreddit')
plt.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 42, Finished, Available)