blob_account_name = "marckvnonprodblob"
blob_container_name = "bigdata"
# read only
blob_sas_token = "?sv=2021-10-04&st=2023-10-04T01%3A42%3A59Z&se=2024-01-02T02%3A42%3A00Z&sr=c&sp=rlf&sig=w3CH9MbCOpwO7DtHlrahc7AlRPxSZZb8MOgS6TaXLzI%3D"
wasbs_base_url = (
    f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/"
)
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)
comments_path = "reddit-parquet/comments/"
submissions_path = "reddit-parquet/submissions/"StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 51, 8, Finished, Available)
comments_df = spark.read.parquet(f"{wasbs_base_url}{comments_path}")
submissions_df = spark.read.parquet(f"{wasbs_base_url}{submissions_path}")
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    sum as _sum, mean, stddev, max as _max, min as _min, count, percentile_approx,
    year, month, dayofmonth, ceil, col, dayofweek, hour, explode, date_format,
    lower, size, split, regexp_replace, isnan, when,
)
# Top list of subreddits as a string
subreddits = "pokemon, Naruto, dbz, OnePiece, yugioh, digimon, StardustCrusaders, Gundam, DetectiveConan, OneTruthPrevails, DemonSlayerAnime, attackontitan, TokyoGhoul, swordartonline, NeonGenesisEvangelion, ShokugekiNoSoma, OnePunchMan, KillLaKill, Kaguya_sama"
# Split the string into a list of subreddit names
subreddit_list = subreddits.split(", ")
# Build the filter condition
filter_condition = col("subreddit") == subreddit_list[0]
for subreddit in subreddit_list[1:]:
    filter_condition = filter_condition | (col("subreddit") == subreddit)
# Filter the submissions DataFrame
submissions = submissions_df.filter(filter_condition)
# Filter the comments DataFrame
comments = comments_df.filter(filter_condition)
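# An equivalent, more concise filter (optional sketch): PySpark's isin() expresses the
# same membership test without the manual loop; the *_isin names are illustrative only
# and are not used elsewhere in this notebook.
submissions_isin = submissions_df.filter(col("subreddit").isin(subreddit_list))
comments_isin = comments_df.filter(col("subreddit").isin(subreddit_list))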
submissions.groupBy('subreddit').count().show()
+--------------------+------+
| subreddit| count|
+--------------------+------+
|NeonGenesisEvange...| 5994|
| Kaguya_sama| 15163|
| pokemon|355231|
| StardustCrusaders| 79635|
| yugioh| 85497|
| ShokugekiNoSoma| 2872|
| OnePiece|311736|
| TokyoGhoul| 12751|
| attackontitan| 89525|
| OneTruthPrevails| 8755|
| swordartonline| 16315|
| Gundam| 46466|
| dbz| 53576|
| OnePunchMan| 87300|
| digimon| 41472|
| Naruto|148262|
| DemonSlayerAnime| 37018|
| KillLaKill| 9934|
| DetectiveConan| 2334|
+--------------------+------+
sub = submissions.select("subreddit", "author", "author_flair_text", "created_utc", "title", "selftext", "num_comments", "num_crossposts", "over_18", "score", "stickied", "id")
sub.show()
+-----------------+-------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
| subreddit| author| author_flair_text| created_utc| title| selftext|num_comments|num_crossposts|over_18|score|stickied| id|
+-----------------+-------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
| digimon| kuroi110| null|2023-02-22 21:08:07|Visiting Japan in...|From what I under...| 11| 0| false| 10| false|119d7f0|
|StardustCrusaders| Atticus1301| null|2023-02-22 21:08:41| OC [by me]| | 11| 0| false| 590| false|119d878|
| pokemon| OneWhoGetsBread| null|2023-02-22 21:08:56|the PokemonTogeth...|So several days a...| 0| 0| false| 6| false|119d8jo|
| OnePiece|Fluid_Implement9799| null|2023-02-22 21:09:58|the royal rumble ...| [removed]| 1| 0| false| 1| false|119d9z9|
| OnePiece| hockeystew| null|2023-02-22 21:11:29|Can someone help ...|https://mangabudd...| 0| 0| false| 1| false|119dcdb|
| Gundam| ncswisher| null|2023-02-22 21:11:53|Gundam Design Fee...| [removed]| 0| 0| false| 1| false|119dcyw|
| pokemon| lumixod| null|2023-02-22 21:13:47|Wholesome friendship| | 69| 0| false| 1947| false|119dfv9|
| OnePiece| lampione784| null|2023-02-22 21:15:21|What One piece ga...|The creator of th...| 2| 0| false| 1| false|119dicx|
| OnePiece| Many_Line9136| null|2023-02-22 21:15:28|Oda needs to stop...| [removed]| 1| 0| false| 1| false|119dijp|
| pokemon| KubfuKid| null|2023-02-22 21:16:45|(OC) Diego the Gh...| | 2| 0| false| 18| false|119dkh9|
| pokemon| blackjackgabbiani| null|2023-02-22 21:16:58|Who's a non-villa...|For me, Tyme insp...| 372| 0| false| 406| false|119dkrj|
| pokemon| [deleted]| null|2023-02-22 21:17:16|Zard Hunting in B...| [removed]| 0| 0| false| 1| false|119dl6k|
| dbz| Badj0jo_009| null|2023-02-22 21:17:30|What do you guys ...| | 0| 0| false| 1| false|119dli5|
| pokemon| Totallynotttegegg| null|2023-02-22 21:17:58|i have a realization|​\n\n[...| 0| 0| false| 5| false|119dm5g|
| OnePiece| [deleted]| null|2023-02-22 21:18:02| My pirate-kittens!!| [removed]| 2| 0| false| 3| false|119dm8c|
|StardustCrusaders| SombraDragonv| null|2023-02-22 21:18:19|How to take ZkSyn...| [removed]| 0| 0| false| 1| false|119dmn4|
| pokemon| MeiLei-|:906::722::495::2...|2023-02-22 21:18:19|bored in class. h...| | 6| 0| false| 0| false|119dmnh|
| OnePiece| [deleted]| null|2023-02-22 21:19:29| BONK !!!😍😆| [removed]| 1| 0| false| 0| false|119dobb|
| pokemon| [deleted]| null|2023-02-22 21:19:53|Original 151, cau...| [deleted]| 15| 0| false| 153| false|119dp0i|
| OnePiece| [deleted]| null|2023-02-22 21:20:24| SHANKS ART| [removed]| 0| 0| false| 1| false|119dpwd|
+-----------------+-------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
only showing top 20 rows
comments.groupBy('subreddit').count().show()
+--------------------+-------+
| subreddit| count|
+--------------------+-------+
|NeonGenesisEvange...| 31571|
| Kaguya_sama| 229630|
| pokemon|5703713|
| StardustCrusaders| 763751|
| yugioh|1238280|
| ShokugekiNoSoma| 18705|
| OnePiece|7166824|
| TokyoGhoul| 81979|
| attackontitan| 691710|
| OneTruthPrevails| 67709|
| swordartonline| 211009|
| Gundam| 842233|
| dbz| 553935|
| OnePunchMan|1848632|
| KillLaKill| 82956|
| digimon| 528217|
| DetectiveConan| 11357|
| Naruto|2146971|
| DemonSlayerAnime| 324778|
+--------------------+-------+
com = comments.select("subreddit", "author", "author_flair_text", "created_utc", "body", "controversiality", "score", "parent_id", "stickied", "link_id", "id")
com.show()
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|subreddit| author| author_flair_text| created_utc| body|controversiality|score| parent_id|stickied| link_id| id|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
| Gundam| Kn1ght0fv01d| null|2021-06-23 08:32:41|Looks at the extr...| 0| 1| t3_o5obrx| false|t3_o5obrx|h2qxm9h|
| OnePiece| kitay123| Bounty Hunter|2021-06-23 08:32:48|What's so funny a...| 1| -4| t3_o63f5c| false|t3_o63f5c|h2qxmj8|
| OnePiece| Darkmithra| null|2021-06-23 08:32:50|They want to mono...| 0| 3|t1_h2qwtee| false|t3_o5l1y6|h2qxmlb|
| OnePiece| nehc9050| null|2021-06-23 08:32:55|I'm confused, I'm...| 0| 1|t1_h2q924w| false|t3_o5l1y6|h2qxmrn|
| OnePiece| Sea-Improvement5038| null|2021-06-23 08:32:55|nor do i get why ...| 0| 1|t1_h2qwsd4| false|t3_o5l1y6|h2qxmrv|
| Naruto| aleky254| |2021-06-23 08:32:59|Lol. Nagato is a ...| 0| 1|t1_h2qw5vx| false|t3_o51egf|h2qxmvx|
| OnePiece| AnudeStart| null|2021-06-23 08:33:10|Meth. Coke isn’t ...| 0| 4|t1_h2qws0o| false|t3_o5l1y6|h2qxnb1|
| Naruto| EDR-Basement| |2021-06-23 08:33:14|Kakashi, Neji, Sh...| 0| 2| t3_o66ix9| false|t3_o66ix9|h2qxnhc|
| pokemon| KatonRyu| null|2021-06-23 08:33:18|I still play like...| 0| 1| t3_o5h6hg| false|t3_o5h6hg|h2qxnm1|
| yugioh| Katze1Punkt0|:att-water: Iced ...|2021-06-23 08:33:26|Thats because *it...| 0| 1| t3_o62oyc| false|t3_o62oyc|h2qxnwt|
| OnePiece|The_Edgiest_Edgelord| null|2021-06-23 08:33:34|The jacket jacket...| 0| 4|t1_h2qxmlb| false|t3_o5l1y6|h2qxo85|
| OnePiece| sekhon_98| null|2021-06-23 08:33:36|Theoretically the...| 0| 1|t1_h2qwtee| false|t3_o5l1y6|h2qxoac|
| OnePiece| KnowledgeNorth6337| null|2021-06-23 08:33:37|The no one knowin...| 0| 6| t3_o5l1y6| false|t3_o5l1y6|h2qxobx|
| OnePiece| mathemagician0| Lurker|2021-06-23 08:33:40|still don't like ...| 0| 5|t1_h2qww6e| false|t3_o5l1y6|h2qxogn|
| Naruto| KhaoticTwist| |2021-06-23 08:33:55|Isshiki is defini...| 0| -3| t3_o67sfh| false|t3_o67sfh|h2qxp0j|
| Naruto| WeedyNaruto| |2021-06-23 08:34:00|> did you not ...| 0| 1|t1_h2qx89o| false|t3_o5k2rd|h2qxp7h|
| OnePiece| JollyBlaze| null|2021-06-23 08:34:01|If the CP9 kept t...| 0| -1| t3_o5l1y6| false|t3_o5l1y6|h2qxp9h|
| OnePiece| Dumpling2104| null|2021-06-23 08:34:03|Maybe that’s why ...| 0| 2|t1_h2qu81f| false|t3_o5l1y6|h2qxpce|
| pokemon| purejackbaby| null|2021-06-23 08:34:09|Could you like, g...| 0| 1| t3_o63atm| false|t3_o63atm|h2qxpjq|
| OnePiece| Mr_Lectures| null|2021-06-23 08:34:13|well sadly i saw ...| 0| 1|t1_h2qx75y| false|t3_o5l1y6|h2qxppz|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
only showing top 20 rows
sub = sub.filter(
    (col('title') != '') &
    (col('title') != '[deleted]') &
    (col('title') != '[removed]') &
    (col('selftext') != '') &
    (col('selftext') != '[deleted]') &
    (col('selftext') != '[removed]') &
    (col('author') != '[deleted]') &
    (col('author') != '[removed]')
)
com = com.filter(
    (col('body') != '') &
    (col('body') != '[deleted]') &
    (col('body') != '[removed]') &
    (col('author') != '[deleted]') &
    (col('author') != '[removed]')
)
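# The same removals can be written with a negated isin() — an alternative sketch;
# com_alt is an illustrative name and is not used later in this notebook.
com_alt = com.filter(
    (~col('body').isin('', '[deleted]', '[removed]')) &
    (~col('author').isin('[deleted]', '[removed]'))
)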
sub_cleaned = (
    sub
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd"))  # date column
    .withColumn("created_hour", hour("created_utc"))  # hour of day
    .withColumn("created_week", dayofweek("created_utc"))  # day of the week
    .withColumn("created_month", month("created_utc"))  # month of the year
    .withColumn("created_year", year("created_utc"))  # year
    .withColumn("title", lower(col('title')))  # text cleaning: lowercase
    .withColumn("selftext", lower(col('selftext')))  # text cleaning: lowercase
    .withColumn("cleaned_title", regexp_replace(col('title'), r'[^a-zA-Z0-9\s]', ''))  # keep only letters, digits, and whitespace
    .withColumn("cleaned_title", regexp_replace(col('cleaned_title'), r'\s+', ' '))  # collapse repeated whitespace
    .withColumn('title_wordCount', size(split(col('cleaned_title'), ' ')))  # word count
    .withColumn("cleaned_selftext", regexp_replace(col('selftext'), r'[^a-zA-Z0-9\s]', ''))  # keep only letters, digits, and whitespace
    .withColumn("cleaned_selftext", regexp_replace(col('cleaned_selftext'), r'\s+', ' '))  # collapse repeated whitespace
    .withColumn('selftext_wordCount', size(split(col('cleaned_selftext'), ' ')))  # word count
)
com_cleaned = (
    com
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd"))  # date column
    .withColumn("created_hour", hour("created_utc"))  # hour of day
    .withColumn("created_week", dayofweek("created_utc"))  # day of the week
    .withColumn("created_month", month("created_utc"))  # month of the year
    .withColumn("created_year", year("created_utc"))  # year
    .withColumn("body", lower(col('body')))  # text cleaning: lowercase
    .withColumn("cleaned", regexp_replace(col('body'), r'[^a-zA-Z0-9\s]', ''))  # keep only letters, digits, and whitespace
    .withColumn("cleaned", regexp_replace(col('cleaned'), r'\s+', ' '))  # collapse repeated whitespace
    .withColumn('body_wordCount', size(split(col('cleaned'), ' ')))  # word count
)
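# The submission and comment cleaning chains above are nearly identical. A small helper
# like the sketch below could factor out that duplication; add_time_and_text_features
# and text_cols are illustrative names, not part of the original pipeline. (For comments
# it would emit cleaned_body rather than the column named cleaned above.)
def add_time_and_text_features(df, text_cols):
    # derive the date/time breakdown columns once
    df = (
        df
        .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd"))
        .withColumn("created_hour", hour("created_utc"))
        .withColumn("created_week", dayofweek("created_utc"))
        .withColumn("created_month", month("created_utc"))
        .withColumn("created_year", year("created_utc"))
    )
    # lowercase, strip non-alphanumerics, collapse whitespace, and count words per text column
    for c in text_cols:
        df = (
            df
            .withColumn(c, lower(col(c)))
            .withColumn(f"cleaned_{c}", regexp_replace(col(c), r'[^a-zA-Z0-9\s]', ''))
            .withColumn(f"cleaned_{c}", regexp_replace(col(f"cleaned_{c}"), r'\s+', ' '))
            .withColumn(f"{c}_wordCount", size(split(col(f"cleaned_{c}"), ' ')))
        )
    return df
# e.g. add_time_and_text_features(sub, ["title", "selftext"]) would reproduce sub_cleaned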
sub_cleaned.cache()
DataFrame[subreddit: string, author: string, author_flair_text: string, created_utc: timestamp, title: string, selftext: string, num_comments: bigint, num_crossposts: bigint, over_18: boolean, score: bigint, stickied: boolean, id: string, created_date: string, created_hour: int, created_week: int, created_month: int, created_year: int, cleaned_title: string, title_wordCount: int, cleaned_selftext: string, selftext_wordCount: int]
sub_cleaned.show()
+-----------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
| subreddit| author| author_flair_text| created_utc| title| selftext|num_comments|num_crossposts|over_18|score|stickied| id|created_date|created_hour|created_week|created_month|created_year| cleaned_title|title_wordCount| cleaned_selftext|selftext_wordCount|
+-----------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
| digimon| kuroi110| null|2023-02-22 21:08:07|visiting japan in...|from what i under...| 11| 0| false| 10| false|119d7f0| 2023-02-22| 21| 4| 2| 2023|visiting japan in...| 19|from what i under...| 47|
| pokemon| OneWhoGetsBread| null|2023-02-22 21:08:56|the pokemontogeth...|so several days a...| 0| 0| false| 6| false|119d8jo| 2023-02-22| 21| 4| 2| 2023|the pokemontogeth...| 3|so several days a...| 241|
| OnePiece| hockeystew| null|2023-02-22 21:11:29|can someone help ...|https://mangabudd...| 0| 0| false| 1| false|119dcdb| 2023-02-22| 21| 4| 2| 2023|can someone help ...| 11|httpsmangabuddyco...| 30|
| OnePiece| lampione784| null|2023-02-22 21:15:21|what one piece ga...|the creator of th...| 2| 0| false| 1| false|119dicx| 2023-02-22| 21| 4| 2| 2023|what one piece ga...| 6|the creator of th...| 22|
| pokemon| blackjackgabbiani| null|2023-02-22 21:16:58|who's a non-villa...|for me, tyme insp...| 372| 0| false| 406| false|119dkrj| 2023-02-22| 21| 4| 2| 2023|whos a nonvillain...| 9|for me tyme inspi...| 122|
| pokemon| Totallynotttegegg| null|2023-02-22 21:17:58|i have a realization|​\n\n[...| 0| 0| false| 5| false|119dm5g| 2023-02-22| 21| 4| 2| 2023|i have a realization| 4|ampx200b brock is...| 6|
| OnePiece| Minigeneius| null|2023-02-22 21:20:28|thoughts on 10th ...|i've read all of ...| 3| 0| false| 0| false|119dpzv| 2023-02-22| 21| 4| 2| 2023|thoughts on 10th ...| 5|ive read all of o...| 72|
| pokemon| coolnessAlert39| null|2023-02-22 21:26:11|name any bug type...|ok now we’re doin...| 32| 0| false| 0| false|119dymy| 2023-02-22| 21| 4| 2| 2023|name any bug type...| 15|ok now were doing...| 81|
| OnePiece| vixnvox| Marine|2023-02-22 21:30:18|wtf is going on w...|he is such a rand...| 1| 0| false| 0| false|119e4cs| 2023-02-22| 21| 4| 2| 2023|wtf is going on w...| 6|he is such a rand...| 30|
| pokemon| Ill-Ad3844|customise me! :02...|2023-02-21 05:28:13|my starters for e...|gen 1: charizard ...| 1| 0| false| 0| false|117vi5f| 2023-02-21| 5| 3| 2| 2023|my starters for e...| 5|gen 1 charizard g...| 50|
| pokemon| DenseRead9852| null|2023-02-21 05:33:56|i really fucking ...|i feel like it's ...| 13| 0| false| 0| false|117vluc| 2023-02-21| 5| 3| 2| 2023|i really fucking ...| 24|i feel like its f...| 65|
| OnePiece| VA_Monkey_D_Garp| null|2023-02-21 05:38:14|just a rant about...|garp quite litera...| 0| 0| false| 8| false|117vohs| 2023-02-21| 5| 3| 2| 2023|just a rant about...| 5|garp quite litera...| 274|
| TokyoGhoul| panling69| null|2023-02-21 05:39:30|if anyone wants a...|hello everyone, j...| 5| 0| false| 7| false|117vpbl| 2023-02-21| 5| 3| 2| 2023|if anyone wants a...| 18|hello everyone ju...| 43|
| OnePiece| hopefulindiegamedev| null|2023-02-21 05:44:42|can't tell if its...|[https://www.yout...| 0| 0| false| 1| false|117vsl4| 2023-02-21| 5| 3| 2| 2023|cant tell if its ...| 28|httpswwwyoutubeco...| 12|
| OnePiece|The-seven-deadly-sin| null|2023-02-21 05:44:52|could a fishman l...| title| 8| 0| false| 2| false|117vsoi| 2023-02-21| 5| 3| 2| 2023|could a fishman l...| 24| title| 1|
| Naruto| FosterPupz| |2023-02-21 05:45:12| just a giggle|i just wanted to ...| 3| 0| false| 4| false|117vsx3| 2023-02-21| 5| 3| 2| 2023| just a giggle| 3|i just wanted to ...| 128|
| DemonSlayerAnime| Vivid-Balance9658| null|2023-02-21 05:57:54|giyushino fanfict...|hello everyone! i...| 2| 0| false| 1| false|117w0xl| 2023-02-21| 5| 3| 2| 2023|giyushino fanfiction| 2|hello everyone i ...| 57|
| Naruto| juankruh1250| |2023-02-21 05:58:17|what are some of ...|i'll start with t...| 22| 0| false| 4| false|117w15s| 2023-02-21| 5| 3| 2| 2023|what are some of ...| 12|ill start with th...| 54|
|StardustCrusaders| Many_Line9136| null|2023-02-21 05:58:58|arrow saga takeaw...|i was walking bac...| 0| 0| false| 5| false|117w1ks| 2023-02-21| 5| 3| 2| 2023|arrow saga takeaw...| 4|i was walking bac...| 227|
| OnePiece| chiep-the-riep| null|2023-02-21 06:00:22| laughtale island|laughtale is just...| 3| 0| false| 2| false|117w2jw| 2023-02-21| 6| 3| 2| 2023| laughtale island| 2|laughtale is just...| 164|
+-----------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
only showing top 20 rows
com_cleaned.cache()
DataFrame[subreddit: string, author: string, author_flair_text: string, created_utc: timestamp, body: string, controversiality: bigint, score: bigint, parent_id: string, stickied: boolean, link_id: string, id: string, created_date: string, created_hour: int, created_week: int, created_month: int, created_year: int, cleaned: string, body_wordCount: int]
com_cleaned.show()
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|subreddit| author| author_flair_text| created_utc| body|controversiality|score| parent_id|stickied| link_id| id|created_date|created_hour|created_week|created_month|created_year| cleaned|body_wordCount|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
| Gundam| Kn1ght0fv01d| null|2021-06-23 08:32:41|looks at the extr...| 0| 1| t3_o5obrx| false|t3_o5obrx|h2qxm9h| 2021-06-23| 8| 4| 6| 2021|looks at the extr...| 16|
| OnePiece| kitay123| Bounty Hunter|2021-06-23 08:32:48|what's so funny a...| 1| -4| t3_o63f5c| false|t3_o63f5c|h2qxmj8| 2021-06-23| 8| 4| 6| 2021|whats so funny ab...| 6|
| OnePiece| Darkmithra| null|2021-06-23 08:32:50|they want to mono...| 0| 3|t1_h2qwtee| false|t3_o5l1y6|h2qxmlb| 2021-06-23| 8| 4| 6| 2021|they want to mono...| 15|
| OnePiece| nehc9050| null|2021-06-23 08:32:55|i'm confused, i'm...| 0| 1|t1_h2q924w| false|t3_o5l1y6|h2qxmrn| 2021-06-23| 8| 4| 6| 2021|im confused im ju...| 12|
| OnePiece| Sea-Improvement5038| null|2021-06-23 08:32:55|nor do i get why ...| 0| 1|t1_h2qwsd4| false|t3_o5l1y6|h2qxmrv| 2021-06-23| 8| 4| 6| 2021|nor do i get why ...| 8|
| Naruto| aleky254| |2021-06-23 08:32:59|lol. nagato is a ...| 0| 1|t1_h2qw5vx| false|t3_o51egf|h2qxmvx| 2021-06-23| 8| 4| 6| 2021|lol nagato is a b...| 9|
| OnePiece| AnudeStart| null|2021-06-23 08:33:10|meth. coke isn’t ...| 0| 4|t1_h2qws0o| false|t3_o5l1y6|h2qxnb1| 2021-06-23| 8| 4| 6| 2021|meth coke isnt li...| 5|
| Naruto| EDR-Basement| |2021-06-23 08:33:14|kakashi, neji, sh...| 0| 2| t3_o66ix9| false|t3_o66ix9|h2qxnhc| 2021-06-23| 8| 4| 6| 2021|kakashi neji shin...| 6|
| pokemon| KatonRyu| null|2021-06-23 08:33:18|i still play like...| 0| 1| t3_o5h6hg| false|t3_o5h6hg|h2qxnm1| 2021-06-23| 8| 4| 6| 2021|i still play like...| 28|
| yugioh| Katze1Punkt0|:att-water: Iced ...|2021-06-23 08:33:26|thats because *it...| 0| 1| t3_o62oyc| false|t3_o62oyc|h2qxnwt| 2021-06-23| 8| 4| 6| 2021|thats because it ...| 5|
| OnePiece|The_Edgiest_Edgelord| null|2021-06-23 08:33:34|the jacket jacket...| 0| 4|t1_h2qxmlb| false|t3_o5l1y6|h2qxo85| 2021-06-23| 8| 4| 6| 2021|the jacket jacket...| 8|
| OnePiece| sekhon_98| null|2021-06-23 08:33:36|theoretically the...| 0| 1|t1_h2qwtee| false|t3_o5l1y6|h2qxoac| 2021-06-23| 8| 4| 6| 2021|theoretically the...| 29|
| OnePiece| KnowledgeNorth6337| null|2021-06-23 08:33:37|the no one knowin...| 0| 6| t3_o5l1y6| false|t3_o5l1y6|h2qxobx| 2021-06-23| 8| 4| 6| 2021|the no one knowin...| 51|
| OnePiece| mathemagician0| Lurker|2021-06-23 08:33:40|still don't like ...| 0| 5|t1_h2qww6e| false|t3_o5l1y6|h2qxogn| 2021-06-23| 8| 4| 6| 2021|still dont like w...| 78|
| Naruto| KhaoticTwist| |2021-06-23 08:33:55|isshiki is defini...| 0| -3| t3_o67sfh| false|t3_o67sfh|h2qxp0j| 2021-06-23| 8| 4| 6| 2021|isshiki is defini...| 150|
| Naruto| WeedyNaruto| |2021-06-23 08:34:00|> did you not ...| 0| 1|t1_h2qx89o| false|t3_o5k2rd|h2qxp7h| 2021-06-23| 8| 4| 6| 2021|gt did you not se...| 86|
| OnePiece| JollyBlaze| null|2021-06-23 08:34:01|if the cp9 kept t...| 0| -1| t3_o5l1y6| false|t3_o5l1y6|h2qxp9h| 2021-06-23| 8| 4| 6| 2021|if the cp9 kept t...| 61|
| OnePiece| Dumpling2104| null|2021-06-23 08:34:03|maybe that’s why ...| 0| 2|t1_h2qu81f| false|t3_o5l1y6|h2qxpce| 2021-06-23| 8| 4| 6| 2021|maybe thats why s...| 10|
| pokemon| purejackbaby| null|2021-06-23 08:34:09|could you like, g...| 0| 1| t3_o63atm| false|t3_o63atm|h2qxpjq| 2021-06-23| 8| 4| 6| 2021|could you like go...| 28|
| OnePiece| Mr_Lectures| null|2021-06-23 08:34:13|well sadly i saw ...| 0| 1|t1_h2qx75y| false|t3_o5l1y6|h2qxppz| 2021-06-23| 8| 4| 6| 2021|well sadly i saw ...| 9|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
only showing top 20 rows
datastore = 'azureml://datastores/workspaceblobstore/paths'
# https://<STORAGE-ACCOUNT>.blob.core.windows.net/<CONTAINER-NAME>
Storage_URI="https://group09astorage08f5ea16c.blob.core.windows.net/azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"
workspace_default_storage_account = "group09astorage08f5ea16c"
workspace_default_container = "azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"
workspace_wasbs_base_url = (
    f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/"
)
sub_cleaned.write.mode("overwrite").parquet(f"{workspace_wasbs_base_url}/franchise_sub_cleaned.parquet")
com_cleaned.write.parquet(f"{workspace_wasbs_base_url}/franchise_com_cleaned.parquet")
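# Note: the comments write above uses the default mode ("errorifexists"), so re-running
# that cell would fail once franchise_com_cleaned.parquet already exists. A sketch of the
# overwrite variant, mirroring the submissions write:
# com_cleaned.write.mode("overwrite").parquet(f"{workspace_wasbs_base_url}/franchise_com_cleaned.parquet")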
datastore = 'azureml://datastores/workspaceblobstore/paths'
submissions = spark.read.parquet(f"{datastore}/franchise_sub_cleaned.parquet")
comments = spark.read.parquet(f"{datastore}/franchise_com_cleaned.parquet")
submissions = sub_cleaned
comments = com_cleaned
%pip install plotly
Requirement already satisfied: plotly in /home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages (4.14.3)
Requirement already satisfied: retrying>=1.3.3 in /home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages (from plotly) (1.3.3)
Requirement already satisfied: six in /home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages (from plotly) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
Warning: PySpark kernel has been restarted to use updated packages.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
from plotly.subplots import make_subplots
import pyspark.sql.types as T
comments.show()
+---------+--------------------+-----------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|subreddit| author|author_flair_text| created_utc| body|controversiality|score| parent_id|stickied| link_id| id|created_date|created_hour|created_week|created_month|created_year| cleaned|body_wordCount|
+---------+--------------------+-----------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
| pokemon| BucketHeadJr| null|2021-02-26 15:35:03|do they? the enco...| 0| 8|t1_gou6k94| false|t3_lszxq0|gou70b1| 2021-02-26| 15| 6| 2| 2021|do they the encou...| 23|
| pokemon| salgadosp| null|2021-02-26 15:35:04|yeah but how will...| 0| 9|t1_gou6w8h| false|t3_lt01hi|gou70bq| 2021-02-26| 15| 6| 2| 2021|yeah but how will...| 13|
| pokemon| moiraixo| null|2021-02-26 15:35:04|i was meant to vo...| 0| 1| t3_lt01hi| false|t3_lt01hi|gou70bt| 2021-02-26| 15| 6| 2| 2021|i was meant to vo...| 10|
| OnePiece| tylionheart| Cipher Pol|2021-02-26 15:35:04|well hope you wer...| 0| 0|t1_gou6j3l| false|t3_lsta9h|gou70d0| 2021-02-26| 15| 6| 2| 2021|well hope you wer...| 12|
| pokemon| Ethacon| null|2021-02-26 15:35:05|i just hope it ha...| 0| 1| t3_lszxq0| false|t3_lszxq0|gou70ds| 2021-02-26| 15| 6| 2| 2021|i just hope it ha...| 12|
| pokemon| fedemasa| null|2021-02-26 15:35:05|drifloon is an aw...| 0| 6|t1_gotw6di| false|t3_lsycl4|gou70eg| 2021-02-26| 15| 6| 2| 2021|drifloon is an aw...| 5|
| pokemon|JustAHipsterInDenial| null|2021-02-26 15:35:05|rowlet is my one ...| 0| 22|t1_gou5i4t| false|t3_lsztja|gou70f8| 2021-02-26| 15| 6| 2| 2021|rowlet is my one ...| 27|
| pokemon| PlusUltraTaylor| null|2021-02-26 15:35:06|it's not made by ...| 0| 1| t3_lt03jh| false|t3_lt03jh|gou70hi| 2021-02-26| 15| 6| 2| 2021|its not made by g...| 6|
| pokemon| coolfreshguy| null|2021-02-26 15:35:06|definitely gonna ...| 0| 1|t1_gou5uzs| false|t3_lsycl4|gou70hr| 2021-02-26| 15| 6| 2| 2021|definitely gonna ...| 4|
| pokemon| Mastreos| null|2021-02-26 15:35:06|tbh i love it. i'...| 0| 1| t3_lszzxu| false|t3_lszzxu|gou70ij| 2021-02-26| 15| 6| 2| 2021|tbh i love it ive...| 11|
| pokemon| thebombyboi| null|2021-02-26 15:35:06|also the animatio...| 0| 2|t1_gou6vvi| false|t3_lsztja|gou70iv| 2021-02-26| 15| 6| 2| 2021|also the animatio...| 19|
| pokemon| WhiteSilverDragoon| null|2021-02-26 15:35:07|i hate the artsty...| 0| 1| t3_lszxq0| false|t3_lszxq0|gou70kv| 2021-02-26| 15| 6| 2| 2021|i hate the artsty...| 43|
| OnePiece| Florissssss| null|2021-02-26 15:35:07|after wci if sanj...| 0| 14| t3_lsta9h| false|t3_lsta9h|gou70l5| 2021-02-26| 15| 6| 2| 2021|after wci if sanj...| 98|
| pokemon| TPGPC| null|2021-02-26 15:35:07|honestly terrible...| 0| 14| t3_lszzxu| false|t3_lszzxu|gou70l8| 2021-02-26| 15| 6| 2| 2021|honestly terrible...| 43|
| pokemon| Zenn-13| null|2021-02-26 15:35:07|yup. i guess i ca...| 0| 14|t1_gou6q3c| false|t3_lsycl4|gou70lv| 2021-02-26| 15| 6| 2| 2021|yup i guess i can...| 16|
| pokemon| T_Raycroft| null|2021-02-26 15:35:07|isn’t that what p...| 0| 21| t3_lt03qw| false|t3_lt03qw|gou70m1| 2021-02-26| 15| 6| 2| 2021|isnt that what pe...| 17|
| pokemon| Velocirob| null|2021-02-26 15:35:08|no battle frontie...| 0| 5|t1_gou5qsg| false|t3_lsztwl|gou70nj| 2021-02-26| 15| 6| 2| 2021|no battle frontie...| 13|
| pokemon| LookaLookaKooLaLey| null|2021-02-26 15:35:08|seriously!!!! sws...| 1| 5| t3_lt00a8| false|t3_lt00a8|gou70pq| 2021-02-26| 15| 6| 2| 2021|seriously swsh ar...| 32|
| pokemon| Dewottle| null|2021-02-26 15:35:09|i thought this wa...| 0| 2| t3_lszz5o| false|t3_lszz5o|gou70q2| 2021-02-26| 15| 6| 2| 2021|i thought this wa...| 23|
| pokemon| Oddish03| null|2021-02-26 15:35:08|john hanke from n...| 0| 2|t1_gou3d9l| false|t3_lsycl4|gou70q3| 2021-02-26| 15| 6| 2| 2021|john hanke from n...| 6|
+---------+--------------------+-----------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
only showing top 20 rows
total_comments = comments.count()
total_submissions = submissions.count()
print("Total Comments:", total_comments)
print("Total Submissions:", total_submissions)StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 32, Finished, Available)
# Group by subreddit and calculate counts and average scores for comments
comments_grouped = comments.groupBy('subreddit') \
    .agg(
        count('subreddit').alias('comments_count'),
        (count('subreddit') / total_comments * 100).alias('comments_percentage'),
        mean('score').alias('avg_comments_score')
    )
# Group by subreddit and calculate counts and average scores for submissions
submissions_grouped = submissions.groupBy('subreddit') \
    .agg(
        count('subreddit').alias('submissions_count'),
        (count('subreddit') / total_submissions * 100).alias('submissions_percentage'),
        mean('score').alias('avg_submissions_score')
    )
result_table = comments_grouped.join(submissions_grouped, 'subreddit', 'outer')
# Fill nulls with zeros
result_table = result_table.na.fill(0)
result_table.show()
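# Optional sanity check (a quick sketch): each percentage column should sum to roughly
# 100 across all subreddits.
result_table.agg(
    _sum('comments_percentage').alias('total_comments_pct'),
    _sum('submissions_percentage').alias('total_submissions_pct')
).show()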
result_table_pandas = result_table.toPandas()
result_table_pandas.to_csv('comparative_franchise_result_table.csv', index=False)
# Load the data
file_path = 'Users/hw487/fall-2023-reddit-project-team-09/data/csv/top_animes_avg_score_vs_count.csv'
result_table_pandas = pd.read_csv(file_path)
# Check the column names
print("Original column names:", result_table_pandas.columns)StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 39, Finished, Available)
Original column names: Index([' subreddit', 'comments_count', 'comments_percentage',
'avg_comments_score', 'submissions_count', 'submissions_percentage',
'avg_submissions_score'],
dtype='object')
result_table_pandas.columns = [col.strip() for col in result_table_pandas.columns]
# result_table_pandas.rename(columns={' subreddit': 'subreddit'}, inplace=True)
# Check the column names after renaming
print("Updated column names:", result_table_pandas.columns)StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 9, 40, Finished, Available)
Updated column names: Index(['subreddit', 'comments_count', 'comments_percentage',
'avg_comments_score', 'submissions_count', 'submissions_percentage',
'avg_submissions_score'],
dtype='object')
result_table_pandas.to_csv('comparative_franchise_result_table.csv', index=False)
# Normalize the size of the bubbles for comments count
bubble_size = result_table_pandas['comments_count'].copy()
size_scale = 10000 # Adjust this scale to get appropriately sized bubbles
bubble_size = (bubble_size / bubble_size.max()) * size_scale
# Create the Plotly bubble chart
fig = px.scatter(
    result_table_pandas,
    x='avg_comments_score',
    y='avg_submissions_score',
    size=bubble_size,  # sets the bubble sizes
    color='comments_count',  # sets the bubble colors based on comment count
    color_continuous_scale='Purpor',
    hover_name='subreddit',  # shows subreddit name when hovering over bubbles
    labels={
        'avg_comments_score': 'Average Comment Score',
        'avg_submissions_score': 'Average Submission Score',
        'comments_count': 'Comment Count'
    },
    title='Bubble Plot of Reddit Activity by Subreddit',
    template='plotly_white'
)
# Show the plot
fig.show()
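# An alternative sketch: plotly express can scale the markers itself if given the raw
# counts plus a size cap (size_max), which would replace the manual bubble_size
# normalization above; fig_alt is an illustrative name.
fig_alt = px.scatter(
    result_table_pandas,
    x='avg_comments_score',
    y='avg_submissions_score',
    size='comments_count',
    size_max=60,
    color='comments_count',
    color_continuous_scale='Purpor',
    hover_name='subreddit',
    title='Bubble Plot of Reddit Activity by Subreddit (auto-scaled sizes)',
    template='plotly_white'
)
# fig_alt.show()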
# Aggregate the number of comments per month using existing columns
comments_monthly_count = comments.groupBy('created_year', 'created_month') \
    .count() \
    .withColumn('year_month', F.concat_ws('-', 'created_year', 'created_month')) \
    .toPandas()
# Do the same for submissions
submissions_monthly_count = submissions.groupBy('created_year', 'created_month') \
    .count() \
    .withColumn('year_month', F.concat_ws('-', 'created_year', 'created_month')) \
    .toPandas()
# Sort both DataFrames by year_month
comments_monthly_count.sort_values('year_month', inplace=True)
submissions_monthly_count.sort_values('year_month', inplace=True)
# Adjust year_month format and sort in Pandas for comments
comments_monthly_count['year_month'] = comments_monthly_count['created_year'].astype(str) + '-' + comments_monthly_count['created_month'].astype(str).str.zfill(2)
comments_monthly_count.sort_values('year_month', inplace=True)
# Do the same for submissions
submissions_monthly_count['year_month'] = submissions_monthly_count['created_year'].astype(str) + '-' + submissions_monthly_count['created_month'].astype(str).str.zfill(2)
submissions_monthly_count.sort_values('year_month', inplace=True)
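# The re-derivation above is needed because concat_ws('-', created_year, created_month)
# yields values like '2021-6' that do not sort chronologically. A hedged alternative:
# zero-pad the month directly in Spark with F.format_string (the _padded name below is
# illustrative, not from the original notebook).
comments_monthly_count_padded = comments.groupBy('created_year', 'created_month') \
    .count() \
    .withColumn('year_month', F.format_string('%d-%02d', 'created_year', 'created_month')) \
    .toPandas()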
# Save the data
comments_monthly_count.to_csv('comments_monthly_count_franchise.csv', index=False)
submissions_monthly_count.to_csv('submissions_monthly_count_franchise.csv', index=False)
# Load the data
comments_file_path = 'Users/hw487/fall-2023-reddit-project-team-09/data/csv/comments_monthly_count_franchise.csv'
submissions_file_path = 'Users/hw487/fall-2023-reddit-project-team-09/data/csv/submissions_monthly_count_franchise.csv'
comments_monthly_count = pd.read_csv(comments_file_path)
submissions_monthly_count = pd.read_csv(submissions_file_path)
# Create a subplot with two y-axes
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add a line trace for submissions
fig.add_trace(
    go.Scatter(
        x=submissions_monthly_count['year_month'],
        y=submissions_monthly_count['count'],
        name='Submissions',
        marker_color='#42a1b9'
    ),
    secondary_y=False
)
# Add a line trace for comments
fig.add_trace(
    go.Scatter(
        x=comments_monthly_count['year_month'],
        y=comments_monthly_count['count'],
        name='Comments',
        marker_color='#d13a47'
    ),
    secondary_y=True
)
# Update the layout
fig.update_layout(
    title='Monthly Trends of Comments and Submissions',
    xaxis_title='Month',
    xaxis_tickangle=45
)
# Update y-axes titles
fig.update_yaxes(title_text='Number of Submissions', secondary_y=False)
fig.update_yaxes(title_text='Number of Comments', secondary_y=True)
# Show the figure
fig.show()
comments_by_month_subreddit = comments \
    .withColumn('year_month', date_format('created_utc', 'yyyy-MM')) \
    .groupBy('year_month', 'subreddit') \
    .count() \
    .toPandas()
# Filter the dataframe for the subreddits of interest
subreddits_of_interest = ['pokemon', 'OnePiece', 'Naruto', 'OnePunchMan', 'yugioh', 'Gundam',
                          'StardustCrusaders', 'attackontitan', 'dbz', 'digimon', 'DemonSlayerAnime',
                          'Kaguya_sama', 'swordartonline', 'KillLaKill', 'TokyoGhoul']
filtered_df = comments_by_month_subreddit[comments_by_month_subreddit['subreddit'].isin(subreddits_of_interest)]
# Pivot the DataFrame so that each subreddit has its own column
pivot_df = filtered_df.pivot(index='year_month', columns='subreddit', values='count')
# Sort the DataFrame by year_month
pivot_df.sort_index(inplace=True)
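# If a subreddit has months with no comments, the pivot holds NaN in those cells and the
# plotted line will show gaps there; an optional fix would be pivot_df = pivot_df.fillna(0).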
# Custom color palette
color_palette = [
    "#42a63c", "#42a1b9",
    "#d13a47", "#f7c200",
]
desired_saturation = 0.5 # Decreased saturation
# Generate more colors if there are more subreddits than colors in the custom palette
if len(subreddits_of_interest) > len(color_palette):
    # Create additional colors with decreased saturation
    additional_colors = sns.husl_palette(len(subreddits_of_interest) - len(color_palette), s=desired_saturation)
    # Convert RGB to hex
    additional_colors_hex = [mcolors.to_hex(color) for color in additional_colors]
    # Add the additional colors to our color palette
    color_palette.extend(additional_colors_hex)
# Ensure there are enough colors for the subreddits
assert len(color_palette) >= len(subreddits_of_interest), "Not enough colors for the number of subreddits"
# blended_color_palette = sns.blend_palette(["#42a63c","#42a1b9","#967bb6","#d13a47","#f7c200"], 15)
# blended_color_palette = sns.blend_palette(["#42a63c","#f7c200","#d13a47","#967bb6","#42a1b9"], 15)
# Create a Plotly figure
fig = go.Figure()
# Loop through each subreddit in the columns and add them as a separate line in the plot
for i, subreddit in enumerate(pivot_df.columns):
    fig.add_trace(go.Scatter(
        x=pivot_df.index,
        y=pivot_df[subreddit],
        mode='lines+markers',  # Line with markers
        name=subreddit,  # Legend name
        line=dict(color=color_palette[i % len(color_palette)])  # Assign color from the palette / blended_color_palette
    ))
# Update layout with titles and labels
fig.update_layout(
    title='Number of Comments Per Month by Subreddit',
    xaxis_title='Month',
    xaxis_tickangle=45,  # Rotate x-axis labels
    yaxis_title='Number of Comments',
    legend=dict(x=0.5, y=-0.2, xanchor='center', orientation='h'),  # Position the legend
    margin=dict(b=150)  # Adjust bottom margin to accommodate the legend
)
# Show the plot
fig.show()
# Save the data
filtered_df.to_csv('comments_monthly_filtered_cross_franchise.csv', index=False)
# Load the data
comments_filtered_path = 'Users/hw487/fall-2023-reddit-project-team-09/data/csv/comments_monthly_filtered_cross_franchise.csv'
filtered_df = pd.read_csv(comments_filtered_path)
# Pivot the DataFrame so that each subreddit has its own column
pivot_df = filtered_df.pivot(index='year_month', columns='subreddit', values='count')
# Sort the DataFrame by year_month
pivot_df.sort_index(inplace=True)
# Custom color palette
color_palette = [
    "#42a63c", "#42a1b9",
    "#d13a47", "#f7c200",
]
desired_saturation = 0.5 # Decreased saturation
# Generate more colors if there are more subreddits than colors in the custom palette
if len(subreddits_of_interest) > len(color_palette):
    # Create additional colors with decreased saturation
    additional_colors = sns.husl_palette(len(subreddits_of_interest) - len(color_palette), s=desired_saturation)
    # Convert RGB to hex
    additional_colors_hex = [mcolors.to_hex(color) for color in additional_colors]
    # Add the additional colors to our color palette
    color_palette.extend(additional_colors_hex)
# Ensure there are enough colors for the subreddits
assert len(color_palette) >= len(subreddits_of_interest), "Not enough colors for the number of subreddits"
# Create a Plotly figure
fig = go.Figure()
# Define the number of traces (subreddits) per legend group
traces_per_group = 3
# Loop through each subreddit and add them as a separate line in the plot
for i, subreddit in enumerate(pivot_df.columns):
    legend_group = f'group{i // traces_per_group}'  # Define legend group
    color = color_palette[i % len(color_palette)]  # Assign color from the palette
    fig.add_trace(go.Scatter(
        x=pivot_df.index,
        y=pivot_df[subreddit],
        mode='lines+markers',
        name=subreddit,
        line=dict(color=color),
        legendgroup=legend_group  # Assign to a legend group
    ))
# Update layout with titles, labels, and adjusted legend
fig.update_layout(
    title='Number of Comments Per Month by Subreddit',
    xaxis_title='Month',
    xaxis_tickangle=45,
    yaxis_title='Number of Comments',
    legend=dict(
        x=0.5, y=-0.5, xanchor='center', orientation='h',
        tracegroupgap=100  # Adjust for spacing between groups
    ),
    margin=dict(b=200)  # May need adjustment for legend
)
# Show the plot
fig.show()
# Create a Seaborn blended color palette
# blended_color_palette = sns.blend_palette(["#42a63c","#42a1b9","#967bb6","#d13a47","#f7c200"], 15)
blended_color_palette = sns.blend_palette(["#42a63c","#f7c200","#d13a47","#967bb6","#42a1b9"], 15)
# blended_color_palette = sns.blend_palette(['#d13a47', '#967bb6', '#42a1b9', '#42a63c', '#f7c200'], 15)
# Convert RGB tuples to hexadecimal format
hex_color_palette = [mcolors.to_hex(color) for color in blended_color_palette]
import plotly.graph_objects as go
# Create a Plotly figure
fig = go.Figure()
# Loop through each subreddit in the columns and add them as a separate line in the plot
for i, subreddit in enumerate(pivot_df.columns):
    fig.add_trace(go.Scatter(
        x=pivot_df.index,
        y=pivot_df[subreddit],
        mode='lines+markers',
        name=subreddit,
        line=dict(color=hex_color_palette[i % len(hex_color_palette)]),
        legendgroup=f'group{i // 3}'  # Assigning a legend group based on index
    ))
# Update layout with titles and labels
fig.update_layout(
    title='Number of Comments Per Month by Subreddit',
    xaxis_title='Month',
    xaxis_tickangle=45,
    yaxis_title='Number of Comments',
    legend=dict(
        x=0.5,
        y=-0.5,  # Adjust as needed
        xanchor='center',
        orientation='h',
        tracegroupgap=135,  # Adjust this value for spacing between legend groups
        itemsizing='constant'  # Keeps legend symbols the same size
    ),
    margin=dict(b=200)  # Adjust bottom margin to accommodate the legend
)
# Show the plot
fig.show()
data = {
    'subreddit': ['NeonGenesisEvangelion', 'Kaguya_sama', 'pokemon', 'StardustCrusaders', 'yugioh',
                  'ShokugekiNoSoma', 'OnePiece', 'TokyoGhoul', 'attackontitan', 'OneTruthPrevails',
                  'swordartonline', 'Gundam', 'dbz', 'OnePunchMan', 'KillLaKill', 'digimon',
                  'DetectiveConan', 'Naruto', 'DemonSlayerAnime'],
    'count': [31571, 229630, 5703713, 763751, 1238280, 18705, 7166824, 81979, 691710, 67709,
              211009, 842233, 553935, 1848632, 82956, 528217, 11357, 2146971, 324778]
}
df = pd.DataFrame(data)
# Calculate the total count
total_count = df['count'].sum()
# Calculate the proportion for each subreddit
df['proportion'] = (df['count'] / total_count) * 100
# Generate a blended color palette
blended_colors = sns.blend_palette(["#d5ecf1","#f4d7da","#fdf3cb","#d7edd9"], len(df))
# Convert the RGB tuples to hex colors for Plotly
custom_colors = [mcolors.to_hex(color) for color in blended_colors]
# Create the Plotly pie chart
fig = px.pie(
    df,
    names='subreddit',
    values='proportion',
    color_discrete_sequence=custom_colors,  # Use the custom color sequence
    title='Proportion of Comments by Subreddit',
    hole=0.3  # Optional: Create a donut chart by specifying a hole size
)
# Customizing the labels and layout
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.update_layout(
    showlegend=True,
    legend_title_text='Subreddits',
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=-0.7,  # Adjust position of legend
        xanchor='center',
        x=0.5
    )
)
# Show the plot
fig.show()
import seaborn as sns
import matplotlib.pyplot as plt
sampled_comments = comments.sample(False, 0.1, seed=12)
# Select only the 'subreddit' and 'score' columns and convert to Pandas DataFrame
subreddit_scores = sampled_comments.select('subreddit', 'score').toPandas()
plt.figure(figsize=(14, 7))
# Boxplot without outliers
sns.boxplot(x='score', y='subreddit', data=subreddit_scores, orient='h', showfliers=False)
# Adding a strip plot to overlay the actual scores
sns.stripplot(x='score', y='subreddit', data=subreddit_scores, orient='h',
              color='grey', alpha=0.5, size=3)
# Customize the appearance
sns.set_style("whitegrid")
sns.set_palette("pastel")
# Set a log scale for the x-axis if needed
# plt.xscale('log') # Uncomment this line if score distribution is highly skewed
plt.title('Distribution of Scores Across Subreddits')
plt.xlabel('Score')
plt.ylabel('Subreddit')
plt.show()
plt.figure(figsize=(14, 7))
# Boxplot without outliers
sns.boxplot(x='score', y='subreddit', data=subreddit_scores, orient='h', showfliers=False)
# Customize the appearance
sns.set_style("whitegrid")
sns.set_palette("pastel")
plt.title('Distribution of Scores Across Subreddits')
plt.xlabel('Score')
plt.ylabel('Subreddit')
plt.show()