sparkStatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 6, Finished, Available)
SparkSession - hive
sparkStatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 6, Finished, Available)
SparkSession - hive
# PySpark SQL function imports used throughout this notebook.
# NOTE(review): the notebook export fused Synapse "StatementMeta(...)"
# cell residue onto the end of this import line; removed here.
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    sum as _sum,
    mean,
    stddev,
    max as _max,
    min as _min,
    count,
    percentile_approx,
    year,
    month,
    dayofmonth,
    ceil,
    col,
    dayofweek,
    hour,
    explode,
    date_format,
    lower,
    size,
    split,
    regexp_replace,
    isnan,
    when,
)
# Azure Blob Storage location of the raw Reddit parquet dumps.
blob_account_name = "marckvnonprodblob"
blob_container_name = "bigdata"
# Read-only SAS token for the container.
# NOTE(review): a SAS token is a credential — prefer loading it from a
# secret store / environment variable instead of committing it; it also
# carries an expiry (se=2024-01-02) after which reads will start failing.
blob_sas_token = "?sv=2021-10-04&st=2023-10-04T01%3A42%3A59Z&se=2024-01-02T02%3A42%3A00Z&sr=c&sp=rlf&sig=w3CH9MbCOpwO7DtHlrahc7AlRPxSZZb8MOgS6TaXLzI%3D"
# wasbs:// URL prefix for every path inside the container.
wasbs_base_url = (
    "wasbs://" + blob_container_name + "@" + blob_account_name + ".blob.core.windows.net/"
)
# Register the SAS token with Spark so wasbs:// reads against this
# container/account authenticate.  (Fused "StatementMeta(...)" notebook
# residue removed from the closing line.)
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)
# Paths (relative to the container root) of the two Reddit parquet dumps.
# (StatementMeta residue removed from the second line.)
comments_path = "reddit-parquet/comments/"
submissions_path = "reddit-parquet/submissions/"
# Load the full comments and submissions datasets from blob storage.
# (StatementMeta residue removed from the second line.)
comments_df = spark.read.parquet(f"{wasbs_base_url}{comments_path}")
submissions_df = spark.read.parquet(f"{wasbs_base_url}{submissions_path}")
# Restrict both datasets to r/pokemon, then keep only the submission
# columns used in the analysis below.  (StatementMeta residue removed
# from every line of this span.)
submissions = submissions_df.filter(submissions_df.subreddit == "pokemon")
comments = comments_df.filter(comments_df.subreddit == "pokemon")
sub = submissions.select(
    "subreddit", "author", "author_flair_text", "created_utc", "title",
    "selftext", "num_comments", "num_crossposts", "over_18", "score",
    "stickied", "id",
)
sub.show()
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
|subreddit| author| author_flair_text| created_utc| title| selftext|num_comments|num_crossposts|over_18|score|stickied| id|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
| pokemon| OneWhoGetsBread| null|2023-02-22 21:08:56|the PokemonTogeth...|So several days a...| 0| 0| false| 6| false|119d8jo|
| pokemon| lumixod| null|2023-02-22 21:13:47|Wholesome friendship| | 69| 0| false| 1947| false|119dfv9|
| pokemon| KubfuKid| null|2023-02-22 21:16:45|(OC) Diego the Gh...| | 2| 0| false| 18| false|119dkh9|
| pokemon|blackjackgabbiani| null|2023-02-22 21:16:58|Who's a non-villa...|For me, Tyme insp...| 372| 0| false| 406| false|119dkrj|
| pokemon| [deleted]| null|2023-02-22 21:17:16|Zard Hunting in B...| [removed]| 0| 0| false| 1| false|119dl6k|
| pokemon|Totallynotttegegg| null|2023-02-22 21:17:58|i have a realization|​\n\n[...| 0| 0| false| 5| false|119dm5g|
| pokemon| MeiLei-|:906::722::495::2...|2023-02-22 21:18:19|bored in class. h...| | 6| 0| false| 0| false|119dmnh|
| pokemon| [deleted]| null|2023-02-22 21:19:53|Original 151, cau...| [deleted]| 15| 0| false| 153| false|119dp0i|
| pokemon| gravemind9| null|2023-02-22 21:25:09|not a single thou...| | 3| 0| false| 6| false|119dx5q|
| pokemon| coolnessAlert39| null|2023-02-22 21:26:11|Name any Bug type...|Ok now we’re doin...| 32| 0| false| 0| false|119dymy|
| pokemon| War_Daddy_992| null|2023-02-22 21:26:21|Team Magma create...| | 42| 0| false| 262| false|119dyv4|
| pokemon| Redd_ddd| null|2023-02-22 21:28:13|I gave Ghetsis a ...| | 28| 0| false| 88| false|119e1e5|
| pokemon| Ill-Ad3844|customise me! :02...|2023-02-21 05:28:13|My starters for e...|Gen 1: Charizard ...| 1| 0| false| 0| false|117vi5f|
| pokemon| DenseRead9852| null|2023-02-21 05:33:56|I really fucking ...|I feel like it's ...| 13| 0| false| 0| false|117vluc|
| pokemon| [deleted]| null|2023-02-21 05:36:05|Opinion on Pokemo...| [removed]| 0| 0| false| 1| false|117vn8n|
| pokemon| UncleQnobi| null|2023-02-21 05:43:01| Team Gyarados🥶| | 0| 0| false| 1| false|117vrkl|
| pokemon| eyemcantoeknees| null|2023-02-21 05:45:19|I made Tinkaton o...| | 10| 0| false| 192| false|117vszo|
| pokemon| [deleted]| null|2023-02-21 05:48:49|Tough question;If...| [removed]| 0| 0| false| 1| false|117vv70|
| pokemon| [deleted]| null|2023-02-21 05:49:41| Vaporeon...?| [deleted]| 1| 0| false| 0| false|117vvqi|
| pokemon| Attakrus| null|2023-02-21 05:51:46|Tough question;If...| [removed]| 29| 0| false| 1| false|117vx29|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
only showing top 20 rows
# Comment-level columns used in the analysis below.
# (StatementMeta residue removed.)
com = comments.select(
    "subreddit", "author", "author_flair_text", "created_utc", "body",
    "controversiality", "score", "parent_id", "stickied", "link_id", "id",
)
com.show()
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|subreddit| author| author_flair_text| created_utc| body|controversiality|score| parent_id|stickied| link_id| id|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
| pokemon| KatonRyu| null|2021-06-23 08:33:18|I still play like...| 0| 1| t3_o5h6hg| false|t3_o5h6hg|h2qxnm1|
| pokemon| purejackbaby| null|2021-06-23 08:34:09|Could you like, g...| 0| 1| t3_o63atm| false|t3_o63atm|h2qxpjq|
| pokemon|Nothingyoudoisawaste| null|2021-06-23 08:34:16|It felt forced - ...| 0| 2|t1_gzxgl8w| false|t3_nnyg0q|h2qxpu5|
| pokemon| PuxOf_| null|2021-06-23 08:34:19|doing challenge m...| 0| 1| t3_o5j81l| false|t3_o5j81l|h2qxpxh|
| pokemon| [deleted]| null|2021-06-23 08:34:34| [removed]| 0| 1| t3_o67wf5| false|t3_o67wf5|h2qxqik|
| pokemon| ProbablyPuma| I am Rain Boi|2021-06-23 08:37:47|Yall its supposed...| 0| 1| t3_o66cwy| false|t3_o66cwy|h2qxxrd|
| pokemon| [deleted]| null|2021-06-23 08:38:13| [removed]| 0| 1|t1_h2pogtp| false|t3_o609sa|h2qxyp7|
| pokemon| Fishsticks03|:gm: Avenge the F...|2021-06-23 08:38:24| demon prawn| 0| 1| t3_o5myo5| false|t3_o5myo5|h2qxz3e|
| pokemon| DeadpoolCroatia| null|2021-06-23 08:40:30|Miltank was never...| 0| 1| t3_o5j81l| false|t3_o5j81l|h2qy3pv|
| pokemon| lofi_rico| null|2021-06-23 08:40:43|I can't believe N...| 0| 1| t3_o63atm| false|t3_o63atm|h2qy479|
| pokemon|Aggravating-Public15| null|2021-06-23 08:41:25|I guess that’s tr...| 0| 1|t1_h2pynet| false|t3_o589jq|h2qy5nj|
| pokemon| ElGatoGuerrero72| null|2021-06-23 08:42:15|I feel like Steph...| 0| 9| t3_o65117| false|t3_o65117|h2qy7d7|
| pokemon| Long__Jump| null|2021-06-23 08:42:45|Shiny Bibarel as ...| 0| 4| t3_o5wra5| false|t3_o5wra5|h2qy8d4|
| pokemon| Aryallie_18| null|2021-06-23 08:43:25|These are adorabl...| 0| 1| t3_o63atm| false|t3_o63atm|h2qy9tl|
| pokemon| Jankufood| null|2021-06-23 08:44:46|If the original P...| 0| 1| t3_o63atm| false|t3_o63atm|h2qycm4|
| pokemon| Snoo_58305| null|2021-06-23 08:44:47|That’s really ama...| 0| 1| t3_o63atm| false|t3_o63atm|h2qycnm|
| pokemon| Bringbackdigimon| null|2021-06-23 08:44:51|Don’t give gamefr...| 0| 1| t3_o63atm| false|t3_o63atm|h2qycs5|
| pokemon|Aggravating-Public15| null|2021-06-23 08:44:55|I loved gen 5 lol...| 0| 4| t3_o67fvr| false|t3_o67fvr|h2qycy3|
| pokemon| [deleted]| null|2021-06-23 08:45:21| [removed]| 0| 1| t3_o5j81l| false|t3_o5j81l|h2qydtu|
| pokemon| MoonLightScreen|WHY WASN'T I IN MOON|2021-06-23 08:45:49|Well, naturally. ...| 0| 3| t3_o64dri| false|t3_o64dri|h2qyeuv|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
only showing top 20 rows
# Drop empty and "[deleted]"/"[removed]" placeholder submissions/comments.
# Rewritten with Column.isin() instead of six chained inequality tests;
# the backslash continuations were redundant inside parentheses, and the
# trailing StatementMeta residue is removed.  Null semantics match the
# original: a NULL column yields NULL under both != and ~isin(), so the
# row is filtered out either way.
_placeholders = ["[deleted]", "[removed]"]
sub = sub.filter(
    (col("title") != "")
    & ~col("title").isin(_placeholders)
    & (col("selftext") != "")
    & ~col("selftext").isin(_placeholders)
    & ~col("author").isin(_placeholders)
)
com = com.filter(
    (col("body") != "")
    & ~col("body").isin(_placeholders)
    & ~col("author").isin(_placeholders)
)
# Submission feature engineering: calendar fields, lowercasing, text
# normalization, and word counts.  (StatementMeta residue removed.)
sub_cleaned = (
    sub
    # Calendar breakdown of the posting timestamp.
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd"))
    .withColumn("created_hour", hour("created_utc"))
    # NOTE(review): "created_week" actually holds day-of-week (1=Sunday),
    # not a week number — name kept for downstream compatibility.
    .withColumn("created_week", dayofweek("created_utc"))
    .withColumn("created_month", month("created_utc"))
    .withColumn("created_year", year("created_utc"))
    # Lowercase the raw text.
    .withColumn("title", lower(col("title")))
    .withColumn("selftext", lower(col("selftext")))
    # Keep only alphanumerics/whitespace, collapse whitespace runs, then
    # count words by splitting on single spaces.
    .withColumn("cleaned_title", regexp_replace(col("title"), r"[^a-zA-Z0-9\s]", ""))
    .withColumn("cleaned_title", regexp_replace(col("cleaned_title"), r"\s+", " "))
    .withColumn("title_wordCount", size(split(col("cleaned_title"), " ")))
    .withColumn("cleaned_selftext", regexp_replace(col("selftext"), r"[^a-zA-Z0-9\s]", ""))
    .withColumn("cleaned_selftext", regexp_replace(col("cleaned_selftext"), r"\s+", " "))
    .withColumn("selftext_wordCount", size(split(col("cleaned_selftext"), " ")))
)
# Comment feature engineering, mirroring sub_cleaned: calendar fields,
# lowercasing, text normalization, and word counts.  (StatementMeta
# residue removed.)
com_cleaned = (
    com
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd"))
    .withColumn("created_hour", hour("created_utc"))
    # NOTE(review): "created_week" actually holds day-of-week (1=Sunday).
    .withColumn("created_week", dayofweek("created_utc"))
    .withColumn("created_month", month("created_utc"))
    .withColumn("created_year", year("created_utc"))
    .withColumn("body", lower(col("body")))
    # Alphanumerics/whitespace only, collapse whitespace, count words.
    .withColumn("cleaned", regexp_replace(col("body"), r"[^a-zA-Z0-9\s]", ""))
    .withColumn("cleaned", regexp_replace(col("cleaned"), r"\s+", " "))
    .withColumn("body_wordCount", size(split(col("cleaned"), " ")))
)
# Preview the engineered submission columns.  (Residue removed.)
sub_cleaned.show()
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
|subreddit| author| author_flair_text| created_utc| title| selftext|num_comments|num_crossposts|over_18|score|stickied| id|created_date|created_hour|created_week|created_month|created_year| cleaned_title|title_wordCount| cleaned_selftext|selftext_wordCount|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
| pokemon| OneWhoGetsBread| null|2023-02-22 21:08:56|the pokemontogeth...|so several days a...| 0| 0| false| 6| false|119d8jo| 2023-02-22| 21| 4| 2| 2023|the pokemontogeth...| 3|so several days a...| 241|
| pokemon|blackjackgabbiani| null|2023-02-22 21:16:58|who's a non-villa...|for me, tyme insp...| 372| 0| false| 406| false|119dkrj| 2023-02-22| 21| 4| 2| 2023|whos a nonvillain...| 9|for me tyme inspi...| 122|
| pokemon|Totallynotttegegg| null|2023-02-22 21:17:58|i have a realization|​\n\n[...| 0| 0| false| 5| false|119dm5g| 2023-02-22| 21| 4| 2| 2023|i have a realization| 4|ampx200b brock is...| 6|
| pokemon| coolnessAlert39| null|2023-02-22 21:26:11|name any bug type...|ok now we’re doin...| 32| 0| false| 0| false|119dymy| 2023-02-22| 21| 4| 2| 2023|name any bug type...| 15|ok now were doing...| 81|
| pokemon| Ill-Ad3844|customise me! :02...|2023-02-21 05:28:13|my starters for e...|gen 1: charizard ...| 1| 0| false| 0| false|117vi5f| 2023-02-21| 5| 3| 2| 2023|my starters for e...| 5|gen 1 charizard g...| 50|
| pokemon| DenseRead9852| null|2023-02-21 05:33:56|i really fucking ...|i feel like it's ...| 13| 0| false| 0| false|117vluc| 2023-02-21| 5| 3| 2| 2023|i really fucking ...| 24|i feel like its f...| 65|
| pokemon| Shiny_Hypno| null|2023-02-21 06:01:24|one of the follow...|i came up with th...| 0| 0| false| 3| false|117w3da| 2023-02-21| 6| 3| 2| 2023|one of the follow...| 16|i came up with th...| 91|
| pokemon| EmiKoizuwumi| null|2023-02-21 06:34:43|emi's daily pokem...|[art by me!](http...| 0| 0| false| 5| false|117wo87| 2023-02-21| 6| 3| 2| 2023|emis daily pokemo...| 10|art by mehttpspre...| 530|
| pokemon| sirius1208| null|2023-02-21 05:07:59|first time ev tra...|i'm playing throu...| 5| 0| false| 2| false|117v4t1| 2023-02-21| 5| 3| 2| 2023|first time ev tra...| 4|im playing throug...| 84|
| pokemon| Mr-Green129| null|2023-02-21 19:35:05|monotype run for ...|i have had shield...| 2| 0| false| 1| false|118d5sj| 2023-02-21| 19| 3| 2| 2023|monotype run for ...| 5|i have had shield...| 51|
| pokemon| Sgrinfio| null|2021-02-23 08:03:16|what's the differ...|i always thought ...| 8| 0| false| 5| false| lqcz1n| 2021-02-23| 8| 3| 2| 2021|whats the differe...| 7|i always thought ...| 96|
| pokemon| superharry24| null|2021-02-23 03:37:09|which anime serie...|[removed]\n\n[vie...| 0| 0| false| 1| false| lq86lq| 2021-02-23| 3| 3| 2| 2021|which anime serie...| 8|removed view poll...| 3|
| pokemon| superharry24| null|2021-02-23 03:39:29|which anime serie...|[removed]\n\n[vie...| 0| 0| false| 1| false| lq885n| 2021-02-23| 3| 3| 2| 2021|which anime serie...| 8|removed view poll...| 3|
| pokemon| superharry24| null|2021-02-23 03:43:01|which anime serie...|[removed]\n\n[vie...| 0| 0| false| 1| false| lq8all| 2021-02-23| 3| 3| 2| 2021|which anime serie...| 8|removed view poll...| 3|
| pokemon| superharry24| null|2021-02-23 03:44:19|favorite series p...|[removed]\n\n[vie...| 0| 0| false| 1| false| lq8bfn| 2021-02-23| 3| 3| 2| 2021|favorite series p...| 4|removed view poll...| 3|
| pokemon| superharry24| null|2021-02-23 03:48:55|what is your favo...|just wondering wh...| 5| 0| false| 5| false| lq8ek4| 2021-02-23| 3| 3| 2| 2021|what is your favo...| 7|just wondering wh...| 73|
| pokemon| superharry24| null|2021-02-23 03:49:48|what is your favo...|just wondering wh...| 4| 0| false| 3| false| lq8f5t| 2021-02-23| 3| 3| 2| 2021|what is your favo...| 7|just wondering wh...| 73|
| pokemon| PCN24454| null|2021-02-23 04:21:44|what are your big...|what's something ...| 4| 0| false| 3| false| lq91fh| 2021-02-23| 4| 3| 2| 2021|what are your big...| 8|whats something t...| 133|
| pokemon| Kato756| null|2021-02-23 02:57:25|did you guys see ...|so i guess we wil...| 4| 0| false| 2| false| lq7egb| 2021-02-23| 2| 3| 2| 2021|did you guys see ...| 20|so i guess we wil...| 54|
| pokemon| SnowPhoenix9999| ...|2021-02-23 05:00:19|[rebuild tuesday]...|hello everyone!\n...| 8| 0| false| 1| false| lq7iq8| 2021-02-23| 5| 3| 2| 2021|rebuild tuesday r...| 3|hello everyone we...| 364|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
only showing top 20 rows
# Preview the engineered comment columns.  (Residue removed.)
com_cleaned.show()
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|subreddit| author| author_flair_text| created_utc| body|controversiality|score| parent_id|stickied| link_id| id|created_date|created_hour|created_week|created_month|created_year| cleaned|body_wordCount|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
| pokemon| KatonRyu| null|2021-06-23 08:33:18|i still play like...| 0| 1| t3_o5h6hg| false|t3_o5h6hg|h2qxnm1| 2021-06-23| 8| 4| 6| 2021|i still play like...| 28|
| pokemon| purejackbaby| null|2021-06-23 08:34:09|could you like, g...| 0| 1| t3_o63atm| false|t3_o63atm|h2qxpjq| 2021-06-23| 8| 4| 6| 2021|could you like go...| 28|
| pokemon|Nothingyoudoisawaste| null|2021-06-23 08:34:16|it felt forced - ...| 0| 2|t1_gzxgl8w| false|t3_nnyg0q|h2qxpu5| 2021-06-23| 8| 4| 6| 2021|it felt forced i ...| 161|
| pokemon| PuxOf_| null|2021-06-23 08:34:19|doing challenge m...| 0| 1| t3_o5j81l| false|t3_o5j81l|h2qxpxh| 2021-06-23| 8| 4| 6| 2021|doing challenge m...| 9|
| pokemon| ProbablyPuma| I am Rain Boi|2021-06-23 08:37:47|yall its supposed...| 0| 1| t3_o66cwy| false|t3_o66cwy|h2qxxrd| 2021-06-23| 8| 4| 6| 2021|yall its supposed...| 7|
| pokemon| Fishsticks03|:gm: Avenge the F...|2021-06-23 08:38:24| demon prawn| 0| 1| t3_o5myo5| false|t3_o5myo5|h2qxz3e| 2021-06-23| 8| 4| 6| 2021| demon prawn| 2|
| pokemon| DeadpoolCroatia| null|2021-06-23 08:40:30|miltank was never...| 0| 1| t3_o5j81l| false|t3_o5j81l|h2qy3pv| 2021-06-23| 8| 4| 6| 2021|miltank was never...| 21|
| pokemon| lofi_rico| null|2021-06-23 08:40:43|i can't believe n...| 0| 1| t3_o63atm| false|t3_o63atm|h2qy479| 2021-06-23| 8| 4| 6| 2021|i cant believe ni...| 10|
| pokemon|Aggravating-Public15| null|2021-06-23 08:41:25|i guess that’s tr...| 0| 1|t1_h2pynet| false|t3_o589jq|h2qy5nj| 2021-06-23| 8| 4| 6| 2021|i guess thats tru...| 37|
| pokemon| ElGatoGuerrero72| null|2021-06-23 08:42:15|i feel like steph...| 0| 9| t3_o65117| false|t3_o65117|h2qy7d7| 2021-06-23| 8| 4| 6| 2021|i feel like steph...| 18|
| pokemon| Long__Jump| null|2021-06-23 08:42:45|shiny bibarel as ...| 0| 4| t3_o5wra5| false|t3_o5wra5|h2qy8d4| 2021-06-23| 8| 4| 6| 2021|shiny bibarel as ...| 13|
| pokemon| Aryallie_18| null|2021-06-23 08:43:25|these are adorabl...| 0| 1| t3_o63atm| false|t3_o63atm|h2qy9tl| 2021-06-23| 8| 4| 6| 2021|these are adorabl...| 9|
| pokemon| Jankufood| null|2021-06-23 08:44:46|if the original p...| 0| 1| t3_o63atm| false|t3_o63atm|h2qycm4| 2021-06-23| 8| 4| 6| 2021|if the original p...| 8|
| pokemon| Snoo_58305| null|2021-06-23 08:44:47|that’s really ama...| 0| 1| t3_o63atm| false|t3_o63atm|h2qycnm| 2021-06-23| 8| 4| 6| 2021|thats really amaz...| 55|
| pokemon| Bringbackdigimon| null|2021-06-23 08:44:51|don’t give gamefr...| 0| 1| t3_o63atm| false|t3_o63atm|h2qycs5| 2021-06-23| 8| 4| 6| 2021|dont give gamefre...| 4|
| pokemon|Aggravating-Public15| null|2021-06-23 08:44:55|i loved gen 5 lol...| 0| 4| t3_o67fvr| false|t3_o67fvr|h2qycy3| 2021-06-23| 8| 4| 6| 2021|i loved gen 5 lol...| 42|
| pokemon| MoonLightScreen|WHY WASN'T I IN MOON|2021-06-23 08:45:49|well, naturally. ...| 0| 3| t3_o64dri| false|t3_o64dri|h2qyeuv| 2021-06-23| 8| 4| 6| 2021|well naturally wu...| 20|
| pokemon| GodOfYoshis| null|2021-06-23 08:47:17|i mean it's the i...| 0| 1|t1_h2q71pq| false|t3_o5pukl|h2qyhx3| 2021-06-23| 8| 4| 6| 2021|i mean its the in...| 20|
| pokemon| GrandWar4675| null|2021-06-23 08:47:53| pokimane| 0| 15|t1_h2qq5rm| false|t3_o63atm|h2qyj7x| 2021-06-23| 8| 4| 6| 2021| pokimane| 1|
| pokemon| bosswilliz| null|2021-06-23 08:47:54| i want them all!| 0| 1| t3_o63atm| false|t3_o63atm|h2qyj8t| 2021-06-23| 8| 4| 6| 2021| i want them all| 4|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
only showing top 20 rows
# AzureML workspace default blob store used to persist the cleaned data.
datastore = "azureml://datastores/workspaceblobstore/paths"
# https://<STORAGE-ACCOUNT>.blob.core.windows.net/<CONTAINER-NAME>
Storage_URI = "https://group09astorage08f5ea16c.blob.core.windows.net/azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"
workspace_default_storage_account = "group09astorage08f5ea16c"
workspace_default_container = "azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"
# wasbs:// base URL for the workspace container.  (StatementMeta residue
# removed from the closing line.)
workspace_wasbs_base_url = (
    f"wasbs://{workspace_default_container}"
    f"@{workspace_default_storage_account}.blob.core.windows.net/"
)
# Persist the cleaned frames to the workspace blob store, then reload.
# NOTE(review): the export fused several notebook cells onto single lines
# here (write + datastore assignment on one line, read + reassignment on
# another); statements are un-fused below with behavior preserved.
# workspace_wasbs_base_url already ends in "/", so these paths contain a
# double slash — harmless to blob storage, kept as in the original.
sub_cleaned.write.parquet(f"{workspace_wasbs_base_url}/pokemon_sub_cleaned.parquet")
com_cleaned.write.parquet(f"{workspace_wasbs_base_url}/pokemon_com_cleaned.parquet")

datastore = "azureml://datastores/workspaceblobstore/paths"
submissions = spark.read.parquet(f"{datastore}/pokemon_sub_cleaned.parquet")
comments = spark.read.parquet(f"{datastore}/pokemon_com_cleaned.parquet")

# NOTE(review): the azureml:// reads above are immediately overwritten —
# the downstream analysis actually runs on the in-memory cleaned frames.
submissions = sub_cleaned
comments = com_cleaned
# Analysis / plotting imports.  (StatementMeta residue removed from the
# final import line; third-party and pyspark imports grouped.)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

import pyspark.sql.types as T

# Render Plotly figures inline in the notebook.
pio.renderers.default = "plotly_mimetype+notebook_connected"
# Daily stock prices (columns NTDOY, SONY, TYO, Date per the displayed
# output below; 2021-01 through 2023-03).  (StatementMeta residue
# removed.)
# NOTE(review): the path is relative ("Users/...") — presumably it was
# meant to be absolute ("/Users/...") or repo-relative; confirm it
# resolves from the notebook's working directory.
stocks = pd.read_csv("Users/hw487/fall-2023-reddit-project-team-09/data/csv/stocks.csv")
stocks
| Unnamed: 0 | NTDOY | SONY | TYO | Date | |
|---|---|---|---|---|---|
| 0 | 0 | 15.740000 | 100.070000 | 7.735437 | 2021-01-04 |
| 1 | 1 | 16.176001 | 103.110001 | 7.793598 | 2021-01-05 |
| 2 | 2 | 15.738000 | 101.080002 | 7.968082 | 2021-01-06 |
| 3 | 3 | 15.802000 | 102.000000 | 8.026241 | 2021-01-07 |
| 4 | 4 | 16.010000 | 103.989998 | 8.113484 | 2021-01-08 |
| ... | ... | ... | ... | ... | ... |
| 560 | 560 | 9.650000 | 86.639999 | 11.965896 | 2023-03-27 |
| 561 | 561 | 9.610000 | 85.820000 | 11.985530 | 2023-03-28 |
| 562 | 562 | 9.770000 | 87.870003 | 12.044427 | 2023-03-29 |
| 563 | 563 | 9.630000 | 89.309998 | 12.054242 | 2023-03-30 |
| 564 | 564 | 9.690000 | 90.650002 | 11.818654 | 2023-03-31 |
565 rows × 5 columns
# Date helpers for the calendar-based aggregations below.
# (StatementMeta residue removed from the last line.)
from pyspark.sql.functions import to_date
from pyspark.sql.functions import year, month, dayofweek
from pyspark.sql.functions import count, avg
# Convert the pandas stocks frame to Spark and derive matching calendar
# columns on stocks, submissions and comments so they can be aggregated
# and aligned by period.  (StatementMeta residue removed.)
stocks_df = spark.createDataFrame(stocks)
# Parse the 'Date' string column into a proper date type.
stocks_df = stocks_df.withColumn("Date", to_date("Date", "yyyy-MM-dd"))
stocks_df = (
    stocks_df
    .withColumn("Year", year("Date"))
    .withColumn("Month", month("Date"))
    .withColumn("DayOfWeek", dayofweek("Date"))
)
submissions = (
    submissions
    .withColumn("Year", year("created_date"))
    .withColumn("Month", month("created_date"))
    .withColumn("DayOfWeek", dayofweek("created_date"))
)
comments = (
    comments
    .withColumn("Year", year("created_date"))
    .withColumn("Month", month("created_date"))
    .withColumn("DayOfWeek", dayofweek("created_date"))
)
from pyspark.sql.functions import weekofyear

# Week-of-year column on all three frames for weekly alignment.
# (StatementMeta residue removed.)
stocks_df = stocks_df.withColumn("WeekOfYear", weekofyear("Date"))
submissions = submissions.withColumn("WeekOfYear", weekofyear("created_date"))
comments = comments.withColumn("WeekOfYear", weekofyear("created_date"))
from pyspark.sql.functions import sum as sum_agg  # avoid shadowing builtin sum()
from pyspark.sql.functions import mean, max as _max, percentile_approx

# Weekly submission activity per (WeekOfYear, Year): post volume, score,
# comment/crosspost totals and distribution stats, and average text
# lengths — collected to pandas for plotting.  (StatementMeta residue
# removed from the closing line.)
submissions_agg_weekly = submissions.groupBy("WeekOfYear", "Year").agg(
    count("*").alias("total_submissions"),
    mean("score").alias("avg_submission_score"),
    sum_agg("num_comments").alias("total_submission_comments"),
    mean("num_comments").alias("avg_num_comments"),
    percentile_approx("num_comments", 0.5).alias("median_num_comments"),
    _max("num_comments").alias("max_num_comments"),
    sum_agg("num_crossposts").alias("total_submission_crossposts"),
    mean("num_crossposts").alias("avg_num_crossposts"),
    percentile_approx("num_crossposts", 0.5).alias("median_num_crossposts"),
    _max("num_crossposts").alias("max_num_crossposts"),
    mean("title_wordCount").alias("avg_title_wordCount"),
    mean("selftext_wordCount").alias("avg_selftext_wordCount"),
).toPandas()
submissions_agg_weeklyStatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 39, Finished, Available)
| WeekOfYear | Year | total_submissions | avg_submission_score | total_submission_comments | avg_num_comments | median_num_comments | max_num_comments | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 2023 | 393 | 50.684478 | 14381 | 36.592875 | 5 | 1531 | 8 | 0.020356 | 0 | 3 | 8.582697 | 133.150127 |
| 1 | 8 | 2021 | 2259 | 99.024347 | 55402 | 24.525011 | 7 | 6134 | 21 | 0.009296 | 0 | 2 | 9.321824 | 147.490040 |
| 2 | 9 | 2023 | 692 | 40.748555 | 14680 | 21.213873 | 5 | 1063 | 5 | 0.007225 | 0 | 1 | 8.774566 | 136.365607 |
| 3 | 48 | 2022 | 1690 | 133.173373 | 59718 | 35.336095 | 8 | 3096 | 12 | 0.007101 | 0 | 3 | 8.676923 | 141.889941 |
| 4 | 13 | 2023 | 375 | 108.197333 | 7154 | 19.077333 | 5 | 912 | 7 | 0.018667 | 0 | 2 | 8.645333 | 131.466667 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 114 | 32 | 2021 | 505 | 93.233663 | 16866 | 33.398020 | 6 | 2323 | 5 | 0.009901 | 0 | 1 | 9.017822 | 127.942574 |
| 115 | 46 | 2021 | 1268 | 93.505521 | 43890 | 34.613565 | 7 | 5328 | 21 | 0.016562 | 0 | 3 | 8.485804 | 103.922713 |
| 116 | 45 | 2021 | 632 | 71.420886 | 26424 | 41.810127 | 7 | 2963 | 6 | 0.009494 | 0 | 1 | 9.121835 | 146.132911 |
| 117 | 42 | 2021 | 431 | 73.563805 | 18299 | 42.457077 | 7 | 6973 | 9 | 0.020882 | 0 | 3 | 8.470998 | 133.287703 |
| 118 | 44 | 2021 | 417 | 87.009592 | 18070 | 43.333333 | 7 | 1707 | 4 | 0.009592 | 0 | 2 | 8.666667 | 144.973621 |
119 rows × 14 columns
# Cast the grouping keys to plain ints and order the table chronologically.
submissions_agg_weekly['Year'] = submissions_agg_weekly['Year'].astype(int)
submissions_agg_weekly['WeekOfYear'] = submissions_agg_weekly['WeekOfYear'].astype(int)
submissions_agg_weekly = submissions_agg_weekly.sort_values(by=['Year', 'WeekOfYear'])
submissions_agg_weekly
| WeekOfYear | Year | total_submissions | avg_submission_score | total_submission_comments | avg_num_comments | median_num_comments | max_num_comments | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 44 | 1 | 2021 | 603 | 78.381426 | 9138 | 15.154229 | 4 | 1247 | 4 | 0.006633 | 0 | 2 | 8.334992 | 157.920398 |
| 71 | 2 | 2021 | 567 | 62.342152 | 9780 | 17.248677 | 6 | 958 | 8 | 0.014109 | 0 | 2 | 8.871252 | 155.313933 |
| 38 | 3 | 2021 | 616 | 56.082792 | 10879 | 17.660714 | 5 | 1022 | 5 | 0.008117 | 0 | 1 | 8.659091 | 159.055195 |
| 22 | 4 | 2021 | 619 | 67.316640 | 10240 | 16.542811 | 4 | 1215 | 8 | 0.012924 | 0 | 1 | 8.568659 | 154.453958 |
| 48 | 5 | 2021 | 627 | 61.556619 | 9526 | 15.192982 | 5 | 1024 | 10 | 0.015949 | 0 | 4 | 8.564593 | 150.409888 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61 | 10 | 2023 | 434 | 51.755760 | 12594 | 29.018433 | 6 | 2122 | 3 | 0.006912 | 0 | 1 | 8.880184 | 142.140553 |
| 5 | 11 | 2023 | 453 | 118.214128 | 15130 | 33.399558 | 6 | 2330 | 10 | 0.022075 | 0 | 5 | 8.456954 | 140.823400 |
| 28 | 12 | 2023 | 539 | 67.541744 | 10738 | 19.922078 | 5 | 860 | 41 | 0.076067 | 0 | 24 | 8.077922 | 136.018553 |
| 4 | 13 | 2023 | 375 | 108.197333 | 7154 | 19.077333 | 5 | 912 | 7 | 0.018667 | 0 | 2 | 8.645333 | 131.466667 |
| 37 | 52 | 2023 | 118 | 268.491525 | 7055 | 59.788136 | 8 | 1380 | 1 | 0.008475 | 0 | 1 | 8.593220 | 155.067797 |
119 rows × 14 columns
# Persist the weekly submission aggregates.
submissions_agg_weekly.to_csv("submissions_agg_weekly.csv", index=False)

# Aggregate submissions per calendar day with the same statistics as the
# weekly roll-up above.
submissions_agg = submissions.groupBy("created_date").agg(
    count("*").alias("total_submissions"),
    mean("score").alias("avg_submission_score"),
    sum_agg("num_comments").alias("total_submission_comments"),
    mean("num_comments").alias("avg_num_comments"),
    percentile_approx("num_comments", 0.5).alias("median_num_comments"),
    _max("num_comments").alias("max_num_comments"),
    sum_agg("num_crossposts").alias("total_submission_crossposts"),
    mean("num_crossposts").alias("avg_num_crossposts"),
    percentile_approx("num_crossposts", 0.5).alias("median_num_crossposts"),
    _max("num_crossposts").alias("max_num_crossposts"),
    mean("title_wordCount").alias("avg_title_wordCount"),
    mean("selftext_wordCount").alias("avg_selftext_wordCount")
).toPandas()
submissions_agg
| created_date | total_submissions | avg_submission_score | total_submission_comments | avg_num_comments | median_num_comments | max_num_comments | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-01-11 | 85 | 6.929412 | 1577 | 18.552941 | 4 | 958 | 0 | 0.000000 | 0 | 0 | 8.717647 | 142.870588 |
| 1 | 2021-01-12 | 63 | 3.714286 | 365 | 5.793651 | 5 | 26 | 1 | 0.015873 | 0 | 1 | 7.619048 | 134.174603 |
| 2 | 2021-01-15 | 107 | 7.813084 | 1128 | 10.542056 | 5 | 84 | 0 | 0.000000 | 0 | 0 | 9.672897 | 131.700935 |
| 3 | 2021-01-20 | 81 | 3.283951 | 584 | 7.209877 | 4 | 51 | 0 | 0.000000 | 0 | 0 | 9.790123 | 132.222222 |
| 4 | 2021-01-24 | 98 | 178.377551 | 3482 | 35.530612 | 7 | 841 | 1 | 0.010204 | 0 | 1 | 8.530612 | 152.877551 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 815 | 2023-03-15 | 62 | 487.145161 | 2473 | 39.887097 | 7 | 1783 | 6 | 0.096774 | 0 | 5 | 8.048387 | 99.983871 |
| 816 | 2023-03-18 | 71 | 110.169014 | 5432 | 76.507042 | 8 | 2330 | 1 | 0.014085 | 0 | 1 | 8.732394 | 128.929577 |
| 817 | 2023-03-24 | 66 | 89.015152 | 911 | 13.803030 | 5 | 272 | 24 | 0.363636 | 0 | 24 | 7.257576 | 126.484848 |
| 818 | 2023-03-26 | 95 | 87.873684 | 3653 | 38.452632 | 8 | 860 | 3 | 0.031579 | 0 | 1 | 8.747368 | 129.842105 |
| 819 | 2023-03-27 | 112 | 16.241071 | 1740 | 15.535714 | 4 | 789 | 1 | 0.008929 | 0 | 1 | 8.732143 | 110.633929 |
820 rows × 13 columns
# Parse the date strings and order the daily table chronologically.
submissions_agg['created_date'] = pd.to_datetime(submissions_agg['created_date'])
submissions_agg = submissions_agg.sort_values(by='created_date')
submissions_agg
| created_date | total_submissions | avg_submission_score | total_submission_comments | avg_num_comments | median_num_comments | max_num_comments | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 542 | 2021-01-01 | 52 | 12.576923 | 484 | 9.307692 | 4 | 69 | 0 | 0.000000 | 0 | 0 | 8.711538 | 148.692308 |
| 543 | 2021-01-02 | 109 | 196.844037 | 2714 | 24.899083 | 5 | 453 | 6 | 0.055046 | 0 | 3 | 8.256881 | 135.376147 |
| 357 | 2021-01-03 | 72 | 125.263889 | 1572 | 21.833333 | 8 | 483 | 2 | 0.027778 | 0 | 2 | 10.083333 | 119.805556 |
| 544 | 2021-01-04 | 105 | 65.438095 | 2332 | 22.209524 | 5 | 1043 | 0 | 0.000000 | 0 | 0 | 8.980952 | 140.961905 |
| 545 | 2021-01-05 | 75 | 3.853333 | 414 | 5.520000 | 4 | 29 | 1 | 0.013333 | 0 | 1 | 7.253333 | 177.533333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 819 | 2023-03-27 | 112 | 16.241071 | 1740 | 15.535714 | 4 | 789 | 1 | 0.008929 | 0 | 1 | 8.732143 | 110.633929 |
| 540 | 2023-03-28 | 78 | 31.102564 | 1095 | 14.038462 | 4 | 559 | 0 | 0.000000 | 0 | 0 | 9.653846 | 176.423077 |
| 179 | 2023-03-29 | 61 | 190.655738 | 1519 | 24.901639 | 6 | 721 | 1 | 0.016393 | 0 | 1 | 7.983607 | 110.180328 |
| 541 | 2023-03-30 | 64 | 54.234375 | 829 | 12.953125 | 5 | 182 | 3 | 0.046875 | 0 | 2 | 7.953125 | 116.828125 |
| 356 | 2023-03-31 | 60 | 353.800000 | 1971 | 32.850000 | 7 | 912 | 2 | 0.033333 | 0 | 1 | 8.583333 | 149.166667 |
820 rows × 13 columns
# Persist the daily submission aggregates.
submissions_agg.to_csv("submissions_agg_daily.csv", index=False)

# Aggregate comments per (WeekOfYear, Year): volume, score, body length.
comments_agg_weekly = comments.groupBy("WeekOfYear", "Year").agg(
    count("*").alias("total_comments"),
    mean("score").alias("avg_comment_score"),
    mean("body_wordCount").alias("avg_body_wordCount")
).toPandas()
# Cast keys, sort chronologically, and persist.
comments_agg_weekly['Year'] = comments_agg_weekly['Year'].astype(int)
comments_agg_weekly['WeekOfYear'] = comments_agg_weekly['WeekOfYear'].astype(int)
comments_agg_weekly = comments_agg_weekly.sort_values(by=['Year', 'WeekOfYear'])
comments_agg_weekly.to_csv("comments_agg_weekly.csv", index=False)

# Aggregate comments per calendar day with the same statistics.
comments_agg = comments.groupBy("created_date").agg(
    count("*").alias("total_comments"),
    mean("score").alias("avg_comment_score"),
    mean("body_wordCount").alias("avg_body_wordCount")
).toPandas()
comments_agg
| created_date | total_comments | avg_comment_score | avg_body_wordCount | |
|---|---|---|---|---|
| 0 | 2023-03-29 | 3925 | 7.854777 | 23.865478 |
| 1 | 2023-03-21 | 5489 | 11.155766 | 26.962653 |
| 2 | 2023-03-20 | 4703 | 10.543270 | 28.540931 |
| 3 | 2021-03-30 | 4261 | 6.230462 | 23.666041 |
| 4 | 2023-02-23 | 5019 | 9.108189 | 24.217972 |
| ... | ... | ... | ... | ... |
| 814 | 2021-11-07 | 6859 | 7.212422 | 29.889634 |
| 815 | 2021-09-26 | 6537 | 8.915405 | 27.742389 |
| 816 | 2021-01-29 | 3966 | 7.781140 | 21.598084 |
| 817 | 2021-10-02 | 6293 | 10.830288 | 23.274432 |
| 818 | 2021-10-30 | 3843 | 12.745511 | 25.692428 |
819 rows × 4 columns
# Parse the date strings and order the daily comment table chronologically.
comments_agg['created_date'] = pd.to_datetime(comments_agg['created_date'])
comments_agg = comments_agg.sort_values(by='created_date')
comments_agg
| created_date | total_comments | avg_comment_score | avg_body_wordCount | |
|---|---|---|---|---|
| 714 | 2021-01-01 | 3746 | 7.749867 | 19.772824 |
| 686 | 2021-01-02 | 2858 | 10.153254 | 26.478307 |
| 338 | 2021-01-03 | 2827 | 5.977361 | 37.826318 |
| 615 | 2021-01-04 | 4932 | 7.981144 | 25.933496 |
| 632 | 2021-01-05 | 5204 | 9.958301 | 21.860300 |
| ... | ... | ... | ... | ... |
| 581 | 2023-03-27 | 5130 | 7.277778 | 28.739376 |
| 437 | 2023-03-28 | 6712 | 8.297527 | 28.252980 |
| 0 | 2023-03-29 | 3925 | 7.854777 | 23.865478 |
| 295 | 2023-03-30 | 3623 | 8.449903 | 20.830803 |
| 159 | 2023-03-31 | 4575 | 10.321967 | 24.299891 |
819 rows × 4 columns
# Persist the daily comment aggregates.
comments_agg.to_csv("comments_agg_daily.csv", index=False)

# Bring the (small) stocks table back to Pandas for merging.
stocks_df = stocks_df.toPandas()
# Ensure 'Date' is datetime and the rows are chronological.
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'])
stocks_df = stocks_df.sort_values(by='Date')

# Left-join the daily Reddit aggregates onto the trading-day stock rows;
# non-trading days in the Reddit tables are dropped, trading days with no
# Reddit activity get NaNs (filled below).
combined_df = pd.merge(stocks_df, submissions_agg, left_on='Date', right_on='created_date', how='left')
combined_df = pd.merge(combined_df, comments_agg, left_on='Date', right_on='created_date', how='left')
combined_df
| Unnamed: 0 | NTDOY | SONY | TYO | Date | Year | Month | DayOfWeek | WeekOfYear | created_date_x | ... | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | created_date_y | total_comments | avg_comment_score | avg_body_wordCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 15.740000 | 100.070000 | 7.735437 | 2021-01-04 | 2021 | 1 | 2 | 1 | 2021-01-04 | ... | 0 | 0.000000 | 0 | 0 | 8.980952 | 140.961905 | 2021-01-04 | 4932.0 | 7.981144 | 25.933496 |
| 1 | 1 | 16.176001 | 103.110001 | 7.793598 | 2021-01-05 | 2021 | 1 | 3 | 1 | 2021-01-05 | ... | 1 | 0.013333 | 0 | 1 | 7.253333 | 177.533333 | 2021-01-05 | 5204.0 | 9.958301 | 21.860300 |
| 2 | 2 | 15.738000 | 101.080002 | 7.968082 | 2021-01-06 | 2021 | 1 | 4 | 1 | 2021-01-06 | ... | 1 | 0.012987 | 0 | 1 | 8.571429 | 153.428571 | 2021-01-06 | 4385.0 | 6.350741 | 21.136602 |
| 3 | 3 | 15.802000 | 102.000000 | 8.026241 | 2021-01-07 | 2021 | 1 | 5 | 1 | 2021-01-07 | ... | 0 | 0.000000 | 0 | 0 | 7.952381 | 147.063492 | 2021-01-07 | 4923.0 | 8.251270 | 18.427382 |
| 4 | 4 | 16.010000 | 103.989998 | 8.113484 | 2021-01-08 | 2021 | 1 | 6 | 1 | 2021-01-08 | ... | 2 | 0.022222 | 0 | 2 | 8.211111 | 147.822222 | 2021-01-08 | 5072.0 | 8.819598 | 21.948935 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 560 | 560 | 9.650000 | 86.639999 | 11.965896 | 2023-03-27 | 2023 | 3 | 2 | 13 | 2023-03-27 | ... | 1 | 0.008929 | 0 | 1 | 8.732143 | 110.633929 | 2023-03-27 | 5130.0 | 7.277778 | 28.739376 |
| 561 | 561 | 9.610000 | 85.820000 | 11.985530 | 2023-03-28 | 2023 | 3 | 3 | 13 | 2023-03-28 | ... | 0 | 0.000000 | 0 | 0 | 9.653846 | 176.423077 | 2023-03-28 | 6712.0 | 8.297527 | 28.252980 |
| 562 | 562 | 9.770000 | 87.870003 | 12.044427 | 2023-03-29 | 2023 | 3 | 4 | 13 | 2023-03-29 | ... | 1 | 0.016393 | 0 | 1 | 7.983607 | 110.180328 | 2023-03-29 | 3925.0 | 7.854777 | 23.865478 |
| 563 | 563 | 9.630000 | 89.309998 | 12.054242 | 2023-03-30 | 2023 | 3 | 5 | 13 | 2023-03-30 | ... | 3 | 0.046875 | 0 | 2 | 7.953125 | 116.828125 | 2023-03-30 | 3623.0 | 8.449903 | 20.830803 |
| 564 | 564 | 9.690000 | 90.650002 | 11.818654 | 2023-03-31 | 2023 | 3 | 6 | 13 | 2023-03-31 | ... | 2 | 0.033333 | 0 | 1 | 8.583333 | 149.166667 | 2023-03-31 | 4575.0 | 10.321967 | 24.299891 |
565 rows × 26 columns
# Drop the duplicated join-key columns left over from the two merges.
combined_df = combined_df.drop(columns=['created_date_x', 'created_date_y'])
# Zero-fill trading days with no Reddit activity on the subreddit.
combined_df = combined_df.fillna({
    'total_submissions': 0, 'avg_submission_score': 0, 'total_submission_comments': 0,
    'total_submission_crossposts': 0, 'avg_num_comments': 0, 'median_num_comments': 0,
    'max_num_comments': 0, 'avg_num_crossposts': 0, 'median_num_crossposts': 0,
    'max_num_crossposts': 0, 'avg_title_wordCount': 0, 'avg_selftext_wordCount': 0,
    'total_comments': 0, 'avg_comment_score': 0, 'avg_body_wordCount': 0
})
combined_df
| Unnamed: 0 | NTDOY | SONY | TYO | Date | Year | Month | DayOfWeek | WeekOfYear | total_submissions | ... | max_num_comments | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | total_comments | avg_comment_score | avg_body_wordCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 15.740000 | 100.070000 | 7.735437 | 2021-01-04 | 2021 | 1 | 2 | 1 | 105 | ... | 1043 | 0 | 0.000000 | 0 | 0 | 8.980952 | 140.961905 | 4932.0 | 7.981144 | 25.933496 |
| 1 | 1 | 16.176001 | 103.110001 | 7.793598 | 2021-01-05 | 2021 | 1 | 3 | 1 | 75 | ... | 29 | 1 | 0.013333 | 0 | 1 | 7.253333 | 177.533333 | 5204.0 | 9.958301 | 21.860300 |
| 2 | 2 | 15.738000 | 101.080002 | 7.968082 | 2021-01-06 | 2021 | 1 | 4 | 1 | 77 | ... | 23 | 1 | 0.012987 | 0 | 1 | 8.571429 | 153.428571 | 4385.0 | 6.350741 | 21.136602 |
| 3 | 3 | 15.802000 | 102.000000 | 8.026241 | 2021-01-07 | 2021 | 1 | 5 | 1 | 63 | ... | 22 | 0 | 0.000000 | 0 | 0 | 7.952381 | 147.063492 | 4923.0 | 8.251270 | 18.427382 |
| 4 | 4 | 16.010000 | 103.989998 | 8.113484 | 2021-01-08 | 2021 | 1 | 6 | 1 | 90 | ... | 66 | 2 | 0.022222 | 0 | 2 | 8.211111 | 147.822222 | 5072.0 | 8.819598 | 21.948935 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 560 | 560 | 9.650000 | 86.639999 | 11.965896 | 2023-03-27 | 2023 | 3 | 2 | 13 | 112 | ... | 789 | 1 | 0.008929 | 0 | 1 | 8.732143 | 110.633929 | 5130.0 | 7.277778 | 28.739376 |
| 561 | 561 | 9.610000 | 85.820000 | 11.985530 | 2023-03-28 | 2023 | 3 | 3 | 13 | 78 | ... | 559 | 0 | 0.000000 | 0 | 0 | 9.653846 | 176.423077 | 6712.0 | 8.297527 | 28.252980 |
| 562 | 562 | 9.770000 | 87.870003 | 12.044427 | 2023-03-29 | 2023 | 3 | 4 | 13 | 61 | ... | 721 | 1 | 0.016393 | 0 | 1 | 7.983607 | 110.180328 | 3925.0 | 7.854777 | 23.865478 |
| 563 | 563 | 9.630000 | 89.309998 | 12.054242 | 2023-03-30 | 2023 | 3 | 5 | 13 | 64 | ... | 182 | 3 | 0.046875 | 0 | 2 | 7.953125 | 116.828125 | 3623.0 | 8.449903 | 20.830803 |
| 564 | 564 | 9.690000 | 90.650002 | 11.818654 | 2023-03-31 | 2023 | 3 | 6 | 13 | 60 | ... | 912 | 2 | 0.033333 | 0 | 1 | 8.583333 | 149.166667 | 4575.0 | 10.321967 | 24.299891 |
565 rows × 24 columns
# Merge the weekly submission and comment aggregates with the stocks
# table on the (Year, WeekOfYear) pair.
combined_df_weekly = pd.merge(stocks_df, submissions_agg_weekly, on=['Year', 'WeekOfYear'], how='left')
combined_df_weekly = pd.merge(combined_df_weekly, comments_agg_weekly, on=['Year', 'WeekOfYear'], how='left')
# Zero-fill weeks with no Reddit activity.
combined_df_weekly.fillna({
    'total_submissions': 0, 'avg_submission_score': 0, 'total_submission_comments': 0,
    'total_submission_crossposts': 0, 'avg_num_comments': 0, 'median_num_comments': 0,
    'max_num_comments': 0, 'avg_num_crossposts': 0, 'median_num_crossposts': 0,
    'max_num_crossposts': 0, 'avg_title_wordCount': 0, 'avg_selftext_wordCount': 0,
    'total_comments': 0, 'avg_comment_score': 0, 'avg_body_wordCount': 0
}, inplace=True)

import pandas as pd  # no-op here: pandas is already in scope from earlier cells

# Start of each row's calendar week. pandas dt.dayofweek has Monday == 0,
# so this snaps every date back to its Monday (not Sunday, as the original
# comment claimed).
combined_df['WeekStartDate'] = combined_df['Date'] - pd.to_timedelta(combined_df['Date'].dt.dayofweek, unit='d')
combined_df
| Unnamed: 0 | NTDOY | SONY | TYO | Date | Year | Month | DayOfWeek | WeekOfYear | total_submissions | ... | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | total_comments | avg_comment_score | avg_body_wordCount | WeekStartDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 15.740000 | 100.070000 | 7.735437 | 2021-01-04 | 2021 | 1 | 2 | 1 | 105 | ... | 0 | 0.000000 | 0 | 0 | 8.980952 | 140.961905 | 4932.0 | 7.981144 | 25.933496 | 2021-01-04 |
| 1 | 1 | 16.176001 | 103.110001 | 7.793598 | 2021-01-05 | 2021 | 1 | 3 | 1 | 75 | ... | 1 | 0.013333 | 0 | 1 | 7.253333 | 177.533333 | 5204.0 | 9.958301 | 21.860300 | 2021-01-04 |
| 2 | 2 | 15.738000 | 101.080002 | 7.968082 | 2021-01-06 | 2021 | 1 | 4 | 1 | 77 | ... | 1 | 0.012987 | 0 | 1 | 8.571429 | 153.428571 | 4385.0 | 6.350741 | 21.136602 | 2021-01-04 |
| 3 | 3 | 15.802000 | 102.000000 | 8.026241 | 2021-01-07 | 2021 | 1 | 5 | 1 | 63 | ... | 0 | 0.000000 | 0 | 0 | 7.952381 | 147.063492 | 4923.0 | 8.251270 | 18.427382 | 2021-01-04 |
| 4 | 4 | 16.010000 | 103.989998 | 8.113484 | 2021-01-08 | 2021 | 1 | 6 | 1 | 90 | ... | 2 | 0.022222 | 0 | 2 | 8.211111 | 147.822222 | 5072.0 | 8.819598 | 21.948935 | 2021-01-04 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 560 | 560 | 9.650000 | 86.639999 | 11.965896 | 2023-03-27 | 2023 | 3 | 2 | 13 | 112 | ... | 1 | 0.008929 | 0 | 1 | 8.732143 | 110.633929 | 5130.0 | 7.277778 | 28.739376 | 2023-03-27 |
| 561 | 561 | 9.610000 | 85.820000 | 11.985530 | 2023-03-28 | 2023 | 3 | 3 | 13 | 78 | ... | 0 | 0.000000 | 0 | 0 | 9.653846 | 176.423077 | 6712.0 | 8.297527 | 28.252980 | 2023-03-27 |
| 562 | 562 | 9.770000 | 87.870003 | 12.044427 | 2023-03-29 | 2023 | 3 | 4 | 13 | 61 | ... | 1 | 0.016393 | 0 | 1 | 7.983607 | 110.180328 | 3925.0 | 7.854777 | 23.865478 | 2023-03-27 |
| 563 | 563 | 9.630000 | 89.309998 | 12.054242 | 2023-03-30 | 2023 | 3 | 5 | 13 | 64 | ... | 3 | 0.046875 | 0 | 2 | 7.953125 | 116.828125 | 3623.0 | 8.449903 | 20.830803 | 2023-03-27 |
| 564 | 564 | 9.690000 | 90.650002 | 11.818654 | 2023-03-31 | 2023 | 3 | 6 | 13 | 60 | ... | 2 | 0.033333 | 0 | 1 | 8.583333 | 149.166667 | 4575.0 | 10.321967 | 24.299891 | 2023-03-27 |
565 rows × 25 columns
# Drop the artifact index column carried over from the stocks CSV.
combined_df = combined_df.drop(columns=['Unnamed: 0'])
combined_df
| NTDOY | SONY | TYO | Date | Year | Month | DayOfWeek | WeekOfYear | total_submissions | avg_submission_score | ... | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | total_comments | avg_comment_score | avg_body_wordCount | WeekStartDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15.740000 | 100.070000 | 7.735437 | 2021-01-04 | 2021 | 1 | 2 | 1 | 105 | 65.438095 | ... | 0 | 0.000000 | 0 | 0 | 8.980952 | 140.961905 | 4932.0 | 7.981144 | 25.933496 | 2021-01-04 |
| 1 | 16.176001 | 103.110001 | 7.793598 | 2021-01-05 | 2021 | 1 | 3 | 1 | 75 | 3.853333 | ... | 1 | 0.013333 | 0 | 1 | 7.253333 | 177.533333 | 5204.0 | 9.958301 | 21.860300 | 2021-01-04 |
| 2 | 15.738000 | 101.080002 | 7.968082 | 2021-01-06 | 2021 | 1 | 4 | 1 | 77 | 3.090909 | ... | 1 | 0.012987 | 0 | 1 | 8.571429 | 153.428571 | 4385.0 | 6.350741 | 21.136602 | 2021-01-04 |
| 3 | 15.802000 | 102.000000 | 8.026241 | 2021-01-07 | 2021 | 1 | 5 | 1 | 63 | 3.238095 | ... | 0 | 0.000000 | 0 | 0 | 7.952381 | 147.063492 | 4923.0 | 8.251270 | 18.427382 | 2021-01-04 |
| 4 | 16.010000 | 103.989998 | 8.113484 | 2021-01-08 | 2021 | 1 | 6 | 1 | 90 | 19.055556 | ... | 2 | 0.022222 | 0 | 2 | 8.211111 | 147.822222 | 5072.0 | 8.819598 | 21.948935 | 2021-01-04 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 560 | 9.650000 | 86.639999 | 11.965896 | 2023-03-27 | 2023 | 3 | 2 | 13 | 112 | 16.241071 | ... | 1 | 0.008929 | 0 | 1 | 8.732143 | 110.633929 | 5130.0 | 7.277778 | 28.739376 | 2023-03-27 |
| 561 | 9.610000 | 85.820000 | 11.985530 | 2023-03-28 | 2023 | 3 | 3 | 13 | 78 | 31.102564 | ... | 0 | 0.000000 | 0 | 0 | 9.653846 | 176.423077 | 6712.0 | 8.297527 | 28.252980 | 2023-03-27 |
| 562 | 9.770000 | 87.870003 | 12.044427 | 2023-03-29 | 2023 | 3 | 4 | 13 | 61 | 190.655738 | ... | 1 | 0.016393 | 0 | 1 | 7.983607 | 110.180328 | 3925.0 | 7.854777 | 23.865478 | 2023-03-27 |
| 563 | 9.630000 | 89.309998 | 12.054242 | 2023-03-30 | 2023 | 3 | 5 | 13 | 64 | 54.234375 | ... | 3 | 0.046875 | 0 | 2 | 7.953125 | 116.828125 | 3623.0 | 8.449903 | 20.830803 | 2023-03-27 |
| 564 | 9.690000 | 90.650002 | 11.818654 | 2023-03-31 | 2023 | 3 | 6 | 13 | 60 | 353.800000 | ... | 2 | 0.033333 | 0 | 1 | 8.583333 | 149.166667 | 4575.0 | 10.321967 | 24.299891 | 2023-03-27 |
565 rows × 24 columns
combined_df.to_csv("combined_ml_daily.csv", index=False)StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 100, Finished, Available)
# Roll the daily table up to calendar weeks: sum the activity counts,
# average the stock prices over each week's trading days.
agg_columns = {
    'total_submissions': 'sum',
    'total_comments': 'sum',
    'NTDOY': 'mean',
    'SONY': 'mean',
    'TYO': 'mean',
}
weekly_df = combined_df.groupby([pd.Grouper(key='WeekStartDate', freq='W')]).agg(agg_columns).reset_index()
weekly_df
| WeekStartDate | total_submissions | total_comments | NTDOY | SONY | TYO | |
|---|---|---|---|---|---|---|
| 0 | 2021-01-10 | 410 | 24516.0 | 15.8932 | 102.050000 | 7.927368 |
| 1 | 2021-01-17 | 402 | 28040.0 | 15.8692 | 103.445999 | 8.154197 |
| 2 | 2021-01-24 | 309 | 20095.0 | 15.6015 | 102.195002 | 8.118330 |
| 3 | 2021-01-31 | 408 | 22147.0 | 14.9092 | 97.870000 | 7.950633 |
| 4 | 2021-02-07 | 435 | 22889.0 | 15.0080 | 107.333998 | 8.107668 |
| ... | ... | ... | ... | ... | ... | ... |
| 112 | 2023-03-05 | 512 | 42123.0 | 9.3940 | 83.927998 | 13.407248 |
| 113 | 2023-03-12 | 283 | 25753.0 | 9.4020 | 86.634000 | 13.191847 |
| 114 | 2023-03-19 | 294 | 24599.0 | 9.4860 | 85.529999 | 12.037221 |
| 115 | 2023-03-26 | 376 | 23007.0 | 9.5700 | 88.206000 | 11.796666 |
| 116 | 2023-04-02 | 375 | 23965.0 | 9.6700 | 88.058000 | 11.973750 |
117 rows × 6 columns
weekly_df.to_csv("combined_ml_pokemon_weekly.csv", index=False)StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 101, Finished, Available)
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Two stacked subplots, each with a secondary y-axis for the stock prices:
# row 1 = weekly submissions vs NTDOY/TYO, row 2 = weekly comments vs NTDOY/TYO.
fig = make_subplots(rows=2, cols=1, specs=[[{"secondary_y": True}], [{"secondary_y": True}]])
# Row 1: submissions (primary axis) against stock prices (secondary axis).
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['total_submissions'], marker_color='#d13a47', opacity=.65, name="Submissions"), row=1, col=1, secondary_y=False)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['NTDOY'], marker_color='#f7c200', opacity=.65, name="NTDOY"), row=1, col=1, secondary_y=True)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['TYO'], marker_color='#42a63c', opacity=.65, name="TYO"), row=1, col=1, secondary_y=True)
# Row 2: comments against the same stock prices (legend entries hidden to
# avoid duplicates).
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['total_comments'], marker_color='#42a1b9', opacity=.65, name="Comments"), row=2, col=1, secondary_y=False)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['NTDOY'], marker_color='#f7c200', opacity=.65, name="NTDOY", showlegend=False), row=2, col=1, secondary_y=True)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['TYO'], marker_color='#42a63c', opacity=.65, name="TYO", showlegend=False), row=2, col=1, secondary_y=True)
# Axis labels.
fig.update_yaxes(title_text="Submissions", secondary_y=False, row=1, col=1)
fig.update_yaxes(title_text="Mean Stock Prices", secondary_y=True, row=1, col=1)
fig.update_yaxes(title_text="Comments", secondary_y=False, row=2, col=1)
fig.update_yaxes(title_text="Mean Stock Prices", secondary_y=True, row=2, col=1)
# Figure-wide layout.
fig.update_layout(
    title='The number of comments and submissions in subreddit = pokemon with stock prices for each week',
    # xaxis={'title': 'Date by week'},
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()
# Summary statistics for the weekly and daily combined tables.
print(combined_df_weekly.describe())
combined_df.describe()
| NTDOY | SONY | TYO | Year | Month | DayOfWeek | WeekOfYear | total_submissions | avg_submission_score | total_submission_comments | ... | max_num_comments | total_submission_crossposts | avg_num_crossposts | median_num_crossposts | max_num_crossposts | avg_title_wordCount | avg_selftext_wordCount | total_comments | avg_comment_score | avg_body_wordCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | ... | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 | 565.000000 |
| mean | 12.071745 | 96.598000 | 10.049639 | 2021.663717 | 6.070796 | 4.038938 | 24.392920 | 92.021239 | 31.223070 | 1759.134513 | ... | 494.194690 | 2.132743 | 0.027103 | 0.003540 | 0.902655 | 8.638694 | 135.312197 | 6839.185841 | 9.737450 | 24.158624 |
| std | 1.817426 | 15.218937 | 1.870428 | 0.665921 | 3.542840 | 1.393465 | 15.428097 | 64.887297 | 61.358436 | 2617.569597 | ... | 813.161999 | 6.664057 | 0.090783 | 0.059444 | 1.853522 | 0.740156 | 29.318434 | 3211.173967 | 2.846059 | 4.692633 |
| min | 9.310000 | 63.169998 | 7.735437 | 2021.000000 | 1.000000 | 2.000000 | 1.000000 | 32.000000 | 1.920000 | 216.000000 | ... | 18.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.333333 | 59.706897 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 10.644000 | 85.230003 | 8.443065 | 2021.000000 | 3.000000 | 3.000000 | 10.000000 | 60.000000 | 3.660377 | 554.000000 | ... | 54.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.146667 | 116.204545 | 4844.000000 | 7.762292 | 21.451534 |
| 50% | 11.550000 | 98.250000 | 9.063451 | 2022.000000 | 6.000000 | 4.000000 | 23.000000 | 76.000000 | 6.578947 | 859.000000 | ... | 132.000000 | 1.000000 | 0.007092 | 0.000000 | 1.000000 | 8.625000 | 132.426966 | 5854.000000 | 9.380717 | 23.364365 |
| 75% | 13.578000 | 107.080002 | 11.691803 | 2022.000000 | 9.000000 | 5.000000 | 38.000000 | 98.000000 | 27.741935 | 1648.000000 | ... | 695.000000 | 2.000000 | 0.020000 | 0.000000 | 1.000000 | 9.080357 | 148.824742 | 7965.000000 | 11.155766 | 26.304678 |
| max | 16.320000 | 128.589996 | 14.404585 | 2023.000000 | 12.000000 | 6.000000 | 52.000000 | 897.000000 | 743.078947 | 31995.000000 | ... | 7470.000000 | 72.000000 | 1.096154 | 1.000000 | 29.000000 | 11.112903 | 409.982906 | 33657.000000 | 23.439061 | 40.911036 |
8 rows × 22 columns
# Persist the summary-statistics table. Keep the index: describe() puts
# the statistic names (count/mean/std/min/quartiles/max) in the index,
# and index=False would silently drop them from the CSV.
combined_statistics_table = combined_df.describe()
combined_statistics_table.to_csv("combined_statistics_table.csv", index=True)
import plotly.figure_factory as ff
import numpy as np

# Correlation matrix over the numeric columns only: combined_df also
# carries datetime columns ('Date', 'WeekStartDate'), which
# DataFrame.corr() rejects on pandas >= 2.0 unless numeric_only=True
# (older pandas silently excluded them, so this preserves the old result).
corr_matrix = combined_df.corr(numeric_only=True)
# Mask the upper triangle so each pair is shown only once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
corr_matrix_masked = corr_matrix.mask(mask)
# Annotated heatmap of the remaining lower triangle.
fig = ff.create_annotated_heatmap(
    z=corr_matrix_masked.to_numpy(),
    x=corr_matrix_masked.columns.tolist(),
    y=corr_matrix_masked.columns.tolist(),
    colorscale='Sunset',
    showscale=True,
    annotation_text=corr_matrix_masked.round(2).to_numpy()
)
# Label every tick with its column name.
fig.update_layout(
    title='Correlation Heatmap',
    xaxis=dict(tickmode='array', tickvals=np.arange(len(corr_matrix.columns)), ticktext=corr_matrix.columns),
    yaxis=dict(tickmode='array', tickvals=np.arange(len(corr_matrix.columns)), ticktext=corr_matrix.columns),
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Static (matplotlib) rendering of the correlation matrix: lower triangle
# only, annotated with two-decimal coefficients.
corr = combined_df.corr()
hide_upper = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(20, 16))
sns.heatmap(
    corr,
    mask=hide_upper,
    cmap=sns.cubehelix_palette(as_cmap=True),
    annot=True,
    fmt=".2f",
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

# Rotate x labels for readability; keep y labels horizontal.
plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Predict 'NTDOY' stock prices from aggregate r/pokemon activity features.
features = [
    'total_submissions',
    'avg_submission_score',
    'total_comments',
    'avg_comment_score',
    'avg_num_crossposts',
    'total_submission_crossposts',
]
X = combined_df[features]
y = combined_df['NTDOY']

# Put all features on a common scale before fitting the linear model.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Chronological split (shuffle=False): the data is a time series, so the
# test set is the most recent 20% of observations.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, shuffle=False
)

# fit() returns the estimator itself, so fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
LinearRegression()
# Evaluate the fitted linear model on the held-out (most recent) window.
y_pred = model.predict(X_test)

# Root-mean-squared error of the test predictions.
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

# Coefficient of determination on the test set.
r_squared = model.score(X_test, y_test)
print(f'R-squared: {r_squared}')

# Akaike information criterion: n*log(RSS/n) + 2k, where k counts the
# intercept alongside the features.
n = len(X_test)
k = X.shape[1] + 1
residual_sum_sq = np.sum((y_test - y_pred) ** 2)
aic = n * np.log(residual_sum_sq / n) + 2 * k
print(f'AIC: {aic}')
RMSE: 2.2354823027643596
R-squared: -25.95855276387089
AIC: 195.80728204640792
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Re-derive the same test-set metrics, this time using r2_score directly
# instead of model.score().
y_pred = model.predict(X_test)

print(f'R-squared: {r2_score(y_test, y_pred)}')

# AIC with k = number of features + 1 for the intercept.
n_obs = X_test.shape[0]
n_params = X_train.shape[1] + 1
rss = np.sum((y_test - y_pred) ** 2)
aic = n_obs * np.log(rss / n_obs) + 2 * n_params
print(f'AIC: {aic}')

# RMSE via squared=False (i.e. root of the MSE).
root_mse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {root_mse}')
R-squared: -25.95855276387089
AIC: 195.80728204640792
RMSE: 2.2354823027643596
from statsmodels.tsa.stattools import adfuller


def check_stationarity(series):
    """Run the Augmented Dickey-Fuller test on *series* and print the results.

    Prints the ADF test statistic, the p-value, and the critical values at
    each significance level. A p-value above the chosen threshold (e.g. 0.05)
    means the unit-root null hypothesis cannot be rejected, i.e. the series
    is likely non-stationary.
    """
    result = adfuller(series)
    # f-strings replace the dated %-formatting; output is byte-identical
    # ({:f} and %f, {:.3f} and %.3f format the same way).
    print(f'ADF Statistic: {result[0]:f}')
    print(f'p-value: {result[1]:f}')
    print('Critical Values:')
    # result[4] is the dict of critical values keyed by significance level.
    for key, value in result[4].items():
        print(f'\t{key}: {value:.3f}')


# Check if 'NTDOY' stock prices are stationary
check_stationarity(combined_df['NTDOY'].dropna())
ADF Statistic: -1.868510
p-value: 0.347017
Critical Values:
1%: -3.442
5%: -2.867
10%: -2.570
from statsmodels.tsa.stattools import adfuller

# Standalone ADF test on the raw NTDOY series (statistic and p-value only,
# without the critical-value table).
adf_stat, p_value = adfuller(combined_df['NTDOY'].dropna())[:2]
print('ADF Statistic:', adf_stat)
print('p-value:', p_value)
ADF Statistic: -1.868510294901256
p-value: 0.34701672354064506
combined_df['NTDOY_diff'] = combined_df['NTDOY'].diff().dropna()StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 147, Finished, Available)
from statsmodels.tsa.arima.model import ARIMA

# 'NTDOY_diff' is already the first difference of the price series, so the
# model must not difference again: the previous order=(1, 1, 1) fit on the
# differenced data double-differenced the prices, and its output showed
# ma.L1 = -0.9999 (a non-invertible MA root) -- the classic symptom of
# over-differencing. Fit ARMA(1, 1) (ARIMA with d=0) on the differenced
# series instead; its forecasts are forecasts of the *differenced* series,
# which the later cumsum step integrates back to price levels.

# Drop the leading NaN created by differencing.
df_for_arima = combined_df[['NTDOY_diff']].dropna()

# Fit ARIMA model with order (p=1, d=0, q=1)
model = ARIMA(df_for_arima, order=(1, 0, 1))
model_fit = model.fit()

# Summary of the model
print(model_fit.summary())
/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:578: ValueWarning:
An unsupported index was provided and will be ignored when e.g. forecasting.
/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:578: ValueWarning:
An unsupported index was provided and will be ignored when e.g. forecasting.
/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:578: ValueWarning:
An unsupported index was provided and will be ignored when e.g. forecasting.
SARIMAX Results
==============================================================================
Dep. Variable: NTDOY_diff No. Observations: 564
Model: ARIMA(1, 1, 1) Log Likelihood 87.867
Date: Mon, 27 Nov 2023 AIC -169.735
Time: 23:38:28 BIC -156.735
Sample: 0 HQIC -164.660
- 564
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.0998 0.037 -2.704 0.007 -0.172 -0.027
ma.L1 -0.9999 0.469 -2.131 0.033 -1.920 -0.080
sigma2 0.0424 0.020 2.126 0.033 0.003 0.081
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 70.51
Prob(Q): 0.99 Prob(JB): 0.00
Heteroskedasticity (H): 0.34 Skew: -0.06
Prob(H) (two-sided): 0.00 Kurtosis: 4.73
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
# Forecast the next five periods of the (differenced) series,
# out-of-sample from the end of the training data.
horizon = 5
forecast = model_fit.forecast(steps=horizon)
forecast
/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:376: ValueWarning:
No supported index is available. Prediction results will be given with an integer index beginning at `start`.
564 -0.017879
565 -0.010103
566 -0.010879
567 -0.010802
568 -0.010810
Name: predicted_mean, dtype: float64
# Attach calendar dates to the forecast: one day per forecast step,
# starting the day after the last observed date in the combined dataframe.
forecast_values = list(forecast)
last_date = combined_df['Date'].max()
future_dates = pd.date_range(
    start=last_date + pd.Timedelta(days=1),
    periods=len(forecast_values),
)
forecast_with_dates = pd.Series(data=forecast_values, index=future_dates)
forecast_with_dates
2023-04-01 -0.017879
2023-04-02 -0.010103
2023-04-03 -0.010879
2023-04-04 -0.010802
2023-04-05 -0.010810
Freq: D, dtype: float64
# Undo the differencing: running-sum the forecasted day-over-day changes
# and anchor them to the last observed NTDOY price.
last_actual_value = combined_df['NTDOY'].iloc[-1]
reverted_forecast = forecast_with_dates.cumsum() + last_actual_value
# 'reverted_forecast' now holds forecasted NTDOY price *levels*.
print(reverted_forecast)
2023-04-01 9.672121
2023-04-02 9.662018
2023-04-03 9.651138
2023-04-04 9.640336
2023-04-05 9.629527
Freq: D, dtype: float64
from sklearn.metrics import mean_squared_error
from math import sqrt

# Score the reverted forecast against the actual NTDOY closing prices for
# the five forecast days (hand-entered ground truth).
y_true = pd.Series(
    data=[9.69, 9.69, 9.74, 10.18, 9.94],
    index=pd.date_range(start="2023-04-01", end="2023-04-05"),
)

# RMSE = sqrt of the mean squared error between truth and forecast.
mse = mean_squared_error(y_true, reverted_forecast)
rmse = sqrt(mse)
print(f'RMSE: {rmse}')
RMSE: 0.281648511155487