Code: ML-Topic 10

spark

SparkSession (hive): version v3.2.2.5.1-100879434, master yarn, app name "Azure ML Experiment"
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    sum as _sum, mean, stddev, max as _max, min as _min, count,
    percentile_approx, year, month, dayofmonth, ceil, col, dayofweek,
    hour, explode, date_format, lower, size, split, regexp_replace,
    isnan, when,
)

Read full Reddit data

blob_account_name = "marckvnonprodblob"
blob_container_name = "bigdata"
# read-only SAS token (time-limited)
blob_sas_token = "?sv=2021-10-04&st=2023-10-04T01%3A42%3A59Z&se=2024-01-02T02%3A42%3A00Z&sr=c&sp=rlf&sig=w3CH9MbCOpwO7DtHlrahc7AlRPxSZZb8MOgS6TaXLzI%3D"

wasbs_base_url = (
    f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/"
)
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)
comments_path = "reddit-parquet/comments/"
submissions_path = "reddit-parquet/submissions/"
comments_df = spark.read.parquet(f"{wasbs_base_url}{comments_path}")
submissions_df = spark.read.parquet(f"{wasbs_base_url}{submissions_path}")
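Optionally, the reads can be sanity-checked before filtering; printSchema() touches only file metadata, so it stays cheap even on the full dataset:

# Cheap sanity check: confirm the parquet reads resolved and inspect the column types.
submissions_df.printSchema()
comments_df.printSchema()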
submissions = submissions_df.filter(submissions_df.subreddit == "pokemon")
comments = comments_df.filter(comments_df.subreddit == "pokemon")
sub = submissions.select("subreddit", "author", "author_flair_text", "created_utc", "title", "selftext", "num_comments", "num_crossposts", "over_18", "score", "stickied", "id")
sub.show()
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
|subreddit|           author|   author_flair_text|        created_utc|               title|            selftext|num_comments|num_crossposts|over_18|score|stickied|     id|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
|  pokemon|  OneWhoGetsBread|                null|2023-02-22 21:08:56|the PokemonTogeth...|So several days a...|           0|             0|  false|    6|   false|119d8jo|
|  pokemon|          lumixod|                null|2023-02-22 21:13:47|Wholesome friendship|                    |          69|             0|  false| 1947|   false|119dfv9|
|  pokemon|         KubfuKid|                null|2023-02-22 21:16:45|(OC) Diego the Gh...|                    |           2|             0|  false|   18|   false|119dkh9|
|  pokemon|blackjackgabbiani|                null|2023-02-22 21:16:58|Who's a non-villa...|For me, Tyme insp...|         372|             0|  false|  406|   false|119dkrj|
|  pokemon|        [deleted]|                null|2023-02-22 21:17:16|Zard Hunting in B...|           [removed]|           0|             0|  false|    1|   false|119dl6k|
|  pokemon|Totallynotttegegg|                null|2023-02-22 21:17:58|i have a realization|​\n\n[...|           0|             0|  false|    5|   false|119dm5g|
|  pokemon|          MeiLei-|:906::722::495::2...|2023-02-22 21:18:19|bored in class. h...|                    |           6|             0|  false|    0|   false|119dmnh|
|  pokemon|        [deleted]|                null|2023-02-22 21:19:53|Original 151, cau...|           [deleted]|          15|             0|  false|  153|   false|119dp0i|
|  pokemon|       gravemind9|                null|2023-02-22 21:25:09|not a single thou...|                    |           3|             0|  false|    6|   false|119dx5q|
|  pokemon|  coolnessAlert39|                null|2023-02-22 21:26:11|Name any Bug type...|Ok now we’re doin...|          32|             0|  false|    0|   false|119dymy|
|  pokemon|    War_Daddy_992|                null|2023-02-22 21:26:21|Team Magma create...|                    |          42|             0|  false|  262|   false|119dyv4|
|  pokemon|         Redd_ddd|                null|2023-02-22 21:28:13|I gave Ghetsis a ...|                    |          28|             0|  false|   88|   false|119e1e5|
|  pokemon|       Ill-Ad3844|customise me! :02...|2023-02-21 05:28:13|My starters for e...|Gen 1: Charizard ...|           1|             0|  false|    0|   false|117vi5f|
|  pokemon|    DenseRead9852|                null|2023-02-21 05:33:56|I really fucking ...|I feel like it's ...|          13|             0|  false|    0|   false|117vluc|
|  pokemon|        [deleted]|                null|2023-02-21 05:36:05|Opinion on Pokemo...|           [removed]|           0|             0|  false|    1|   false|117vn8n|
|  pokemon|       UncleQnobi|                null|2023-02-21 05:43:01|     Team Gyarados🥶|                    |           0|             0|  false|    1|   false|117vrkl|
|  pokemon|  eyemcantoeknees|                null|2023-02-21 05:45:19|I made Tinkaton o...|                    |          10|             0|  false|  192|   false|117vszo|
|  pokemon|        [deleted]|                null|2023-02-21 05:48:49|Tough question;If...|           [removed]|           0|             0|  false|    1|   false|117vv70|
|  pokemon|        [deleted]|                null|2023-02-21 05:49:41|        Vaporeon...?|           [deleted]|           1|             0|  false|    0|   false|117vvqi|
|  pokemon|         Attakrus|                null|2023-02-21 05:51:46|Tough question;If...|           [removed]|          29|             0|  false|    1|   false|117vx29|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
only showing top 20 rows
com = comments.select("subreddit", "author", "author_flair_text", "created_utc", "body", "controversiality", "score",  "parent_id", "stickied", "link_id", "id")
com.show()
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|subreddit|              author|   author_flair_text|        created_utc|                body|controversiality|score| parent_id|stickied|  link_id|     id|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|  pokemon|            KatonRyu|                null|2021-06-23 08:33:18|I still play like...|               0|    1| t3_o5h6hg|   false|t3_o5h6hg|h2qxnm1|
|  pokemon|        purejackbaby|                null|2021-06-23 08:34:09|Could you like, g...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qxpjq|
|  pokemon|Nothingyoudoisawaste|                null|2021-06-23 08:34:16|It felt forced - ...|               0|    2|t1_gzxgl8w|   false|t3_nnyg0q|h2qxpu5|
|  pokemon|              PuxOf_|                null|2021-06-23 08:34:19|doing challenge m...|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qxpxh|
|  pokemon|           [deleted]|                null|2021-06-23 08:34:34|           [removed]|               0|    1| t3_o67wf5|   false|t3_o67wf5|h2qxqik|
|  pokemon|        ProbablyPuma|       I am Rain Boi|2021-06-23 08:37:47|Yall its supposed...|               0|    1| t3_o66cwy|   false|t3_o66cwy|h2qxxrd|
|  pokemon|           [deleted]|                null|2021-06-23 08:38:13|           [removed]|               0|    1|t1_h2pogtp|   false|t3_o609sa|h2qxyp7|
|  pokemon|        Fishsticks03|:gm: Avenge the F...|2021-06-23 08:38:24|         demon prawn|               0|    1| t3_o5myo5|   false|t3_o5myo5|h2qxz3e|
|  pokemon|     DeadpoolCroatia|                null|2021-06-23 08:40:30|Miltank was never...|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qy3pv|
|  pokemon|           lofi_rico|                null|2021-06-23 08:40:43|I can't believe N...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qy479|
|  pokemon|Aggravating-Public15|                null|2021-06-23 08:41:25|I guess that’s tr...|               0|    1|t1_h2pynet|   false|t3_o589jq|h2qy5nj|
|  pokemon|    ElGatoGuerrero72|                null|2021-06-23 08:42:15|I feel like Steph...|               0|    9| t3_o65117|   false|t3_o65117|h2qy7d7|
|  pokemon|          Long__Jump|                null|2021-06-23 08:42:45|Shiny Bibarel as ...|               0|    4| t3_o5wra5|   false|t3_o5wra5|h2qy8d4|
|  pokemon|         Aryallie_18|                null|2021-06-23 08:43:25|These are adorabl...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qy9tl|
|  pokemon|           Jankufood|                null|2021-06-23 08:44:46|If the original P...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycm4|
|  pokemon|          Snoo_58305|                null|2021-06-23 08:44:47|That’s really ama...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycnm|
|  pokemon|    Bringbackdigimon|                null|2021-06-23 08:44:51|Don’t give gamefr...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycs5|
|  pokemon|Aggravating-Public15|                null|2021-06-23 08:44:55|I loved gen 5 lol...|               0|    4| t3_o67fvr|   false|t3_o67fvr|h2qycy3|
|  pokemon|           [deleted]|                null|2021-06-23 08:45:21|           [removed]|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qydtu|
|  pokemon|     MoonLightScreen|WHY WASN'T I IN MOON|2021-06-23 08:45:49|Well, naturally. ...|               0|    3| t3_o64dri|   false|t3_o64dri|h2qyeuv|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
only showing top 20 rows

Data Cleaning

  1. Filter out corrupted records
  2. Clean the text data
  3. Create new columns (date, year, hour, day of week, month, cleaned text, word count)
    • Compute a word count on the cleaned text for each record
sub = sub.filter(
    (col('title') != '') &
    (col('title') != '[deleted]') &
    (col('title') != '[removed]') &
    (col('selftext') != '') &
    (col('selftext') != '[deleted]') &
    (col('selftext') != '[removed]') &
    (col('author') != '[deleted]') &
    (col('author') != '[removed]')
)
com = com.filter(
    (col('body') != '') &
    (col('body') != '[deleted]') &
    (col('body') != '[removed]') &
    (col('author') != '[deleted]') &
    (col('author') != '[removed]')
)
sub_cleaned = (
    sub
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd")) # create date column
    .withColumn("created_hour", hour("created_utc")) # create hour column
    .withColumn("created_week", dayofweek("created_utc")) # create day of the week column
    .withColumn("created_month", month("created_utc")) # create month of the year column
    .withColumn("created_year", year("created_utc")) # create the year column
    .withColumn("title", lower(col('title'))) # text cleaning: lowercase
    .withColumn("selftext", lower(col('selftext'))) # text cleaning: lowercase
    .withColumn("cleaned_title", regexp_replace(col('title'), r'[^a-zA-Z0-9\s]', '')) # text cleaning: only contain words or number
    .withColumn("cleaned_title", regexp_replace(col('cleaned_title'), r'\s+', ' ')) # text cleaning: remove extra space in text
    .withColumn('title_wordCount', size(split(col('cleaned_title'), ' '))) # word count
    .withColumn("cleaned_selftext", regexp_replace(col('selftext'), r'[^a-zA-Z0-9\s]', '')) # text cleaning: only contain words or number
    .withColumn("cleaned_selftext", regexp_replace(col('cleaned_selftext'), r'\s+', ' ')) # text cleaning: remove extra space in text
    .withColumn('selftext_wordCount', size(split(col('cleaned_selftext'), ' '))) # word count
)
com_cleaned = (
    com
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd")) # create date column
    .withColumn("created_hour", hour("created_utc")) # create hour column
    .withColumn("created_week", dayofweek("created_utc")) # create day of the week column
    .withColumn("created_month", month("created_utc")) # create month of the year column
    .withColumn("created_year", year("created_utc")) # create the year column
    .withColumn("body", lower(col('body'))) # text cleaning: lowercase
    .withColumn("cleaned", regexp_replace(col('body'), r'[^a-zA-Z0-9\s]', '')) # text cleaning: only contain words or number
    .withColumn("cleaned", regexp_replace(col('cleaned'), r'\s+', ' ')) # text cleaning: remove extra space in text
    .withColumn('body_wordCount', size(split(col('cleaned'), ' '))) # word count
)
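One caveat on the word counts above: split on an empty string yields a single empty token, so a record whose cleaned text ends up empty (for example, text that was all punctuation) is counted as one word. A minimal hedged guard, reusing the column names above:

# Hedged sketch: report 0 words for empty cleaned text, otherwise count tokens.
com_cleaned = com_cleaned.withColumn(
    "body_wordCount",
    when(col("cleaned") == "", 0).otherwise(size(split(col("cleaned"), " ")))
)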
sub_cleaned.show()
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
|subreddit|           author|   author_flair_text|        created_utc|               title|            selftext|num_comments|num_crossposts|over_18|score|stickied|     id|created_date|created_hour|created_week|created_month|created_year|       cleaned_title|title_wordCount|    cleaned_selftext|selftext_wordCount|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
|  pokemon|  OneWhoGetsBread|                null|2023-02-22 21:08:56|the pokemontogeth...|so several days a...|           0|             0|  false|    6|   false|119d8jo|  2023-02-22|          21|           4|            2|        2023|the pokemontogeth...|              3|so several days a...|               241|
|  pokemon|blackjackgabbiani|                null|2023-02-22 21:16:58|who's a non-villa...|for me, tyme insp...|         372|             0|  false|  406|   false|119dkrj|  2023-02-22|          21|           4|            2|        2023|whos a nonvillain...|              9|for me tyme inspi...|               122|
|  pokemon|Totallynotttegegg|                null|2023-02-22 21:17:58|i have a realization|​\n\n[...|           0|             0|  false|    5|   false|119dm5g|  2023-02-22|          21|           4|            2|        2023|i have a realization|              4|ampx200b brock is...|                 6|
|  pokemon|  coolnessAlert39|                null|2023-02-22 21:26:11|name any bug type...|ok now we’re doin...|          32|             0|  false|    0|   false|119dymy|  2023-02-22|          21|           4|            2|        2023|name any bug type...|             15|ok now were doing...|                81|
|  pokemon|       Ill-Ad3844|customise me! :02...|2023-02-21 05:28:13|my starters for e...|gen 1: charizard ...|           1|             0|  false|    0|   false|117vi5f|  2023-02-21|           5|           3|            2|        2023|my starters for e...|              5|gen 1 charizard g...|                50|
|  pokemon|    DenseRead9852|                null|2023-02-21 05:33:56|i really fucking ...|i feel like it's ...|          13|             0|  false|    0|   false|117vluc|  2023-02-21|           5|           3|            2|        2023|i really fucking ...|             24|i feel like its f...|                65|
|  pokemon|      Shiny_Hypno|                null|2023-02-21 06:01:24|one of the follow...|i came up with th...|           0|             0|  false|    3|   false|117w3da|  2023-02-21|           6|           3|            2|        2023|one of the follow...|             16|i came up with th...|                91|
|  pokemon|     EmiKoizuwumi|                null|2023-02-21 06:34:43|emi's daily pokem...|[art by me!](http...|           0|             0|  false|    5|   false|117wo87|  2023-02-21|           6|           3|            2|        2023|emis daily pokemo...|             10|art by mehttpspre...|               530|
|  pokemon|       sirius1208|                null|2023-02-21 05:07:59|first time ev tra...|i'm playing throu...|           5|             0|  false|    2|   false|117v4t1|  2023-02-21|           5|           3|            2|        2023|first time ev tra...|              4|im playing throug...|                84|
|  pokemon|      Mr-Green129|                null|2023-02-21 19:35:05|monotype run for ...|i have had shield...|           2|             0|  false|    1|   false|118d5sj|  2023-02-21|          19|           3|            2|        2023|monotype run for ...|              5|i have had shield...|                51|
|  pokemon|         Sgrinfio|                null|2021-02-23 08:03:16|what's the differ...|i always thought ...|           8|             0|  false|    5|   false| lqcz1n|  2021-02-23|           8|           3|            2|        2021|whats the differe...|              7|i always thought ...|                96|
|  pokemon|     superharry24|                null|2021-02-23 03:37:09|which anime serie...|[removed]\n\n[vie...|           0|             0|  false|    1|   false| lq86lq|  2021-02-23|           3|           3|            2|        2021|which anime serie...|              8|removed view poll...|                 3|
|  pokemon|     superharry24|                null|2021-02-23 03:39:29|which anime serie...|[removed]\n\n[vie...|           0|             0|  false|    1|   false| lq885n|  2021-02-23|           3|           3|            2|        2021|which anime serie...|              8|removed view poll...|                 3|
|  pokemon|     superharry24|                null|2021-02-23 03:43:01|which anime serie...|[removed]\n\n[vie...|           0|             0|  false|    1|   false| lq8all|  2021-02-23|           3|           3|            2|        2021|which anime serie...|              8|removed view poll...|                 3|
|  pokemon|     superharry24|                null|2021-02-23 03:44:19|favorite series p...|[removed]\n\n[vie...|           0|             0|  false|    1|   false| lq8bfn|  2021-02-23|           3|           3|            2|        2021|favorite series p...|              4|removed view poll...|                 3|
|  pokemon|     superharry24|                null|2021-02-23 03:48:55|what is your favo...|just wondering wh...|           5|             0|  false|    5|   false| lq8ek4|  2021-02-23|           3|           3|            2|        2021|what is your favo...|              7|just wondering wh...|                73|
|  pokemon|     superharry24|                null|2021-02-23 03:49:48|what is your favo...|just wondering wh...|           4|             0|  false|    3|   false| lq8f5t|  2021-02-23|           3|           3|            2|        2021|what is your favo...|              7|just wondering wh...|                73|
|  pokemon|         PCN24454|                null|2021-02-23 04:21:44|what are your big...|what's something ...|           4|             0|  false|    3|   false| lq91fh|  2021-02-23|           4|           3|            2|        2021|what are your big...|              8|whats something t...|               133|
|  pokemon|          Kato756|                null|2021-02-23 02:57:25|did you guys see ...|so i guess we wil...|           4|             0|  false|    2|   false| lq7egb|  2021-02-23|           2|           3|            2|        2021|did you guys see ...|             20|so i guess we wil...|                54|
|  pokemon|  SnowPhoenix9999|                 ...|2021-02-23 05:00:19|[rebuild tuesday]...|hello everyone!\n...|           8|             0|  false|    1|   false| lq7iq8|  2021-02-23|           5|           3|            2|        2021|rebuild tuesday r...|              3|hello everyone we...|               364|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
only showing top 20 rows
com_cleaned.show()
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|subreddit|              author|   author_flair_text|        created_utc|                body|controversiality|score| parent_id|stickied|  link_id|     id|created_date|created_hour|created_week|created_month|created_year|             cleaned|body_wordCount|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|  pokemon|            KatonRyu|                null|2021-06-23 08:33:18|i still play like...|               0|    1| t3_o5h6hg|   false|t3_o5h6hg|h2qxnm1|  2021-06-23|           8|           4|            6|        2021|i still play like...|            28|
|  pokemon|        purejackbaby|                null|2021-06-23 08:34:09|could you like, g...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qxpjq|  2021-06-23|           8|           4|            6|        2021|could you like go...|            28|
|  pokemon|Nothingyoudoisawaste|                null|2021-06-23 08:34:16|it felt forced - ...|               0|    2|t1_gzxgl8w|   false|t3_nnyg0q|h2qxpu5|  2021-06-23|           8|           4|            6|        2021|it felt forced i ...|           161|
|  pokemon|              PuxOf_|                null|2021-06-23 08:34:19|doing challenge m...|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qxpxh|  2021-06-23|           8|           4|            6|        2021|doing challenge m...|             9|
|  pokemon|        ProbablyPuma|       I am Rain Boi|2021-06-23 08:37:47|yall its supposed...|               0|    1| t3_o66cwy|   false|t3_o66cwy|h2qxxrd|  2021-06-23|           8|           4|            6|        2021|yall its supposed...|             7|
|  pokemon|        Fishsticks03|:gm: Avenge the F...|2021-06-23 08:38:24|         demon prawn|               0|    1| t3_o5myo5|   false|t3_o5myo5|h2qxz3e|  2021-06-23|           8|           4|            6|        2021|         demon prawn|             2|
|  pokemon|     DeadpoolCroatia|                null|2021-06-23 08:40:30|miltank was never...|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qy3pv|  2021-06-23|           8|           4|            6|        2021|miltank was never...|            21|
|  pokemon|           lofi_rico|                null|2021-06-23 08:40:43|i can't believe n...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qy479|  2021-06-23|           8|           4|            6|        2021|i cant believe ni...|            10|
|  pokemon|Aggravating-Public15|                null|2021-06-23 08:41:25|i guess that’s tr...|               0|    1|t1_h2pynet|   false|t3_o589jq|h2qy5nj|  2021-06-23|           8|           4|            6|        2021|i guess thats tru...|            37|
|  pokemon|    ElGatoGuerrero72|                null|2021-06-23 08:42:15|i feel like steph...|               0|    9| t3_o65117|   false|t3_o65117|h2qy7d7|  2021-06-23|           8|           4|            6|        2021|i feel like steph...|            18|
|  pokemon|          Long__Jump|                null|2021-06-23 08:42:45|shiny bibarel as ...|               0|    4| t3_o5wra5|   false|t3_o5wra5|h2qy8d4|  2021-06-23|           8|           4|            6|        2021|shiny bibarel as ...|            13|
|  pokemon|         Aryallie_18|                null|2021-06-23 08:43:25|these are adorabl...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qy9tl|  2021-06-23|           8|           4|            6|        2021|these are adorabl...|             9|
|  pokemon|           Jankufood|                null|2021-06-23 08:44:46|if the original p...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycm4|  2021-06-23|           8|           4|            6|        2021|if the original p...|             8|
|  pokemon|          Snoo_58305|                null|2021-06-23 08:44:47|that’s really ama...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycnm|  2021-06-23|           8|           4|            6|        2021|thats really amaz...|            55|
|  pokemon|    Bringbackdigimon|                null|2021-06-23 08:44:51|don’t give gamefr...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycs5|  2021-06-23|           8|           4|            6|        2021|dont give gamefre...|             4|
|  pokemon|Aggravating-Public15|                null|2021-06-23 08:44:55|i loved gen 5 lol...|               0|    4| t3_o67fvr|   false|t3_o67fvr|h2qycy3|  2021-06-23|           8|           4|            6|        2021|i loved gen 5 lol...|            42|
|  pokemon|     MoonLightScreen|WHY WASN'T I IN MOON|2021-06-23 08:45:49|well, naturally. ...|               0|    3| t3_o64dri|   false|t3_o64dri|h2qyeuv|  2021-06-23|           8|           4|            6|        2021|well naturally wu...|            20|
|  pokemon|         GodOfYoshis|                null|2021-06-23 08:47:17|i mean it's the i...|               0|    1|t1_h2q71pq|   false|t3_o5pukl|h2qyhx3|  2021-06-23|           8|           4|            6|        2021|i mean its the in...|            20|
|  pokemon|        GrandWar4675|                null|2021-06-23 08:47:53|            pokimane|               0|   15|t1_h2qq5rm|   false|t3_o63atm|h2qyj7x|  2021-06-23|           8|           4|            6|        2021|            pokimane|             1|
|  pokemon|          bosswilliz|                null|2021-06-23 08:47:54|    i want them all!|               0|    1| t3_o63atm|   false|t3_o63atm|h2qyj8t|  2021-06-23|           8|           4|            6|        2021|     i want them all|             4|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
only showing top 20 rows

Save cleaned data

datastore = 'azureml://datastores/workspaceblobstore/paths'

# https://<STORAGE-ACCOUNT>.blob.core.windows.net/<CONTAINER-NAME>
Storage_URI="https://group09astorage08f5ea16c.blob.core.windows.net/azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"

workspace_default_storage_account = "group09astorage08f5ea16c"
workspace_default_container = "azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"

workspace_wasbs_base_url = (
    f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/")
sub_cleaned.write.parquet(f"{workspace_wasbs_base_url}pokemon_sub_cleaned.parquet")  # base URL already ends with "/"
com_cleaned.write.parquet(f"{workspace_wasbs_base_url}pokemon_com_cleaned.parquet")
datastore = 'azureml://datastores/workspaceblobstore/paths'

# The saved copies can be read back through the AzureML datastore:
submissions = spark.read.parquet(f"{datastore}/pokemon_sub_cleaned.parquet")
comments = spark.read.parquet(f"{datastore}/pokemon_com_cleaned.parquet")
# For this session, keep working with the already-materialized DataFrames.
submissions = sub_cleaned
comments = com_cleaned

Data Integration and Preparation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
from plotly.subplots import make_subplots
import pyspark.sql.types as T
stocks = pd.read_csv("Users/hw487/fall-2023-reddit-project-team-09/data/csv/stocks.csv")
stocks
Unnamed: 0 NTDOY SONY TYO Date
0 0 15.740000 100.070000 7.735437 2021-01-04
1 1 16.176001 103.110001 7.793598 2021-01-05
2 2 15.738000 101.080002 7.968082 2021-01-06
3 3 15.802000 102.000000 8.026241 2021-01-07
4 4 16.010000 103.989998 8.113484 2021-01-08
... ... ... ... ... ...
560 560 9.650000 86.639999 11.965896 2023-03-27
561 561 9.610000 85.820000 11.985530 2023-03-28
562 562 9.770000 87.870003 12.044427 2023-03-29
563 563 9.630000 89.309998 12.054242 2023-03-30
564 564 9.690000 90.650002 11.818654 2023-03-31

565 rows × 5 columns

from pyspark.sql.functions import to_date, year, month, dayofweek, count, avg
# Convert Pandas DataFrame to Spark DataFrame
stocks_df = spark.createDataFrame(stocks)

# Convert 'Date' column to datetime format in stocks DataFrame
stocks_df = stocks_df.withColumn("Date", to_date("Date", "yyyy-MM-dd"))

Extract Meaningful Features

# For the stocks DataFrame
stocks_df = stocks_df.withColumn("Year", year("Date"))
stocks_df = stocks_df.withColumn("Month", month("Date"))
stocks_df = stocks_df.withColumn("DayOfWeek", dayofweek("Date"))

# Similarly for submissions and comments DataFrames
submissions = submissions.withColumn("Year", year("created_date"))
submissions = submissions.withColumn("Month", month("created_date"))
submissions = submissions.withColumn("DayOfWeek", dayofweek("created_date"))

comments = comments.withColumn("Year", year("created_date"))
comments = comments.withColumn("Month", month("created_date"))
comments = comments.withColumn("DayOfWeek", dayofweek("created_date"))
from pyspark.sql.functions import weekofyear

# Add week of year to the stocks DataFrame
stocks_df = stocks_df.withColumn("WeekOfYear", weekofyear("Date"))

# Add week of year to submissions and comments DataFrames
submissions = submissions.withColumn("WeekOfYear", weekofyear("created_date"))
comments = comments.withColumn("WeekOfYear", weekofyear("created_date"))
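Note that weekofyear follows the ISO-8601 week calendar, so dates near January 1 can fall into week 52 or 53 of the adjacent year while year() still reports the calendar year; this mismatch is what produces the (WeekOfYear = 52, Year = 2023) row in the weekly aggregate below. A hedged alternative (the WeekStart column and submissions_wk variable are illustrative names) keys each week on the Monday that starts it:

from pyspark.sql.functions import date_trunc, to_date

# Hedged sketch: label each row with the Monday starting its ISO week,
# avoiding mismatched (Year, WeekOfYear) pairs around the new year.
submissions_wk = submissions.withColumn(
    "WeekStart", to_date(date_trunc("week", "created_date"))
)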

Data Aggregation and Merging

from pyspark.sql.functions import sum as sum_agg  # Renaming to avoid confusion with Python's built-in sum()
from pyspark.sql.functions import mean, max as _max, percentile_approx

# Aggregate Submissions by Week
submissions_agg_weekly = submissions.groupBy("WeekOfYear", "Year").agg(
    count("*").alias("total_submissions"),
    mean("score").alias("avg_submission_score"),
    sum_agg("num_comments").alias("total_submission_comments"),
    mean("num_comments").alias("avg_num_comments"),
    percentile_approx("num_comments", 0.5).alias("median_num_comments"),
    _max("num_comments").alias("max_num_comments"),
    sum_agg("num_crossposts").alias("total_submission_crossposts"),
    mean("num_crossposts").alias("avg_num_crossposts"),
    percentile_approx("num_crossposts", 0.5).alias("median_num_crossposts"),
    _max("num_crossposts").alias("max_num_crossposts"),
    mean("title_wordCount").alias("avg_title_wordCount"),
    mean("selftext_wordCount").alias("avg_selftext_wordCount")
).toPandas()  # the weekly aggregate is small (one row per week), so collecting to the driver is safe
submissions_agg_weekly
WeekOfYear Year total_submissions avg_submission_score total_submission_comments avg_num_comments median_num_comments max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount
0 8 2023 393 50.684478 14381 36.592875 5 1531 8 0.020356 0 3 8.582697 133.150127
1 8 2021 2259 99.024347 55402 24.525011 7 6134 21 0.009296 0 2 9.321824 147.490040
2 9 2023 692 40.748555 14680 21.213873 5 1063 5 0.007225 0 1 8.774566 136.365607
3 48 2022 1690 133.173373 59718 35.336095 8 3096 12 0.007101 0 3 8.676923 141.889941
4 13 2023 375 108.197333 7154 19.077333 5 912 7 0.018667 0 2 8.645333 131.466667
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
114 32 2021 505 93.233663 16866 33.398020 6 2323 5 0.009901 0 1 9.017822 127.942574
115 46 2021 1268 93.505521 43890 34.613565 7 5328 21 0.016562 0 3 8.485804 103.922713
116 45 2021 632 71.420886 26424 41.810127 7 2963 6 0.009494 0 1 9.121835 146.132911
117 42 2021 431 73.563805 18299 42.457077 7 6973 9 0.020882 0 3 8.470998 133.287703
118 44 2021 417 87.009592 18070 43.333333 7 1707 4 0.009592 0 2 8.666667 144.973621

119 rows × 14 columns

submissions_agg_weekly['Year'] = submissions_agg_weekly['Year'].astype(int)
submissions_agg_weekly['WeekOfYear'] = submissions_agg_weekly['WeekOfYear'].astype(int)
submissions_agg_weekly = submissions_agg_weekly.sort_values(by=['Year', 'WeekOfYear'])
submissions_agg_weekly
WeekOfYear Year total_submissions avg_submission_score total_submission_comments avg_num_comments median_num_comments max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount
44 1 2021 603 78.381426 9138 15.154229 4 1247 4 0.006633 0 2 8.334992 157.920398
71 2 2021 567 62.342152 9780 17.248677 6 958 8 0.014109 0 2 8.871252 155.313933
38 3 2021 616 56.082792 10879 17.660714 5 1022 5 0.008117 0 1 8.659091 159.055195
22 4 2021 619 67.316640 10240 16.542811 4 1215 8 0.012924 0 1 8.568659 154.453958
48 5 2021 627 61.556619 9526 15.192982 5 1024 10 0.015949 0 4 8.564593 150.409888
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
61 10 2023 434 51.755760 12594 29.018433 6 2122 3 0.006912 0 1 8.880184 142.140553
5 11 2023 453 118.214128 15130 33.399558 6 2330 10 0.022075 0 5 8.456954 140.823400
28 12 2023 539 67.541744 10738 19.922078 5 860 41 0.076067 0 24 8.077922 136.018553
4 13 2023 375 108.197333 7154 19.077333 5 912 7 0.018667 0 2 8.645333 131.466667
37 52 2023 118 268.491525 7055 59.788136 8 1380 1 0.008475 0 1 8.593220 155.067797

119 rows × 14 columns

submissions_agg_weekly.to_csv("submissions_agg_weekly.csv", index=False)
# Aggregate Submissions by Date
submissions_agg = submissions.groupBy("created_date").agg(
    count("*").alias("total_submissions"),
    mean("score").alias("avg_submission_score"),
    sum_agg("num_comments").alias("total_submission_comments"),
    mean("num_comments").alias("avg_num_comments"),
    percentile_approx("num_comments", 0.5).alias("median_num_comments"),
    _max("num_comments").alias("max_num_comments"),
    sum_agg("num_crossposts").alias("total_submission_crossposts"),
    mean("num_crossposts").alias("avg_num_crossposts"),
    percentile_approx("num_crossposts", 0.5).alias("median_num_crossposts"),
    _max("num_crossposts").alias("max_num_crossposts"),
    mean("title_wordCount").alias("avg_title_wordCount"),
    mean("selftext_wordCount").alias("avg_selftext_wordCount")
).toPandas()
submissions_agg
created_date total_submissions avg_submission_score total_submission_comments avg_num_comments median_num_comments max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount
0 2021-01-11 85 6.929412 1577 18.552941 4 958 0 0.000000 0 0 8.717647 142.870588
1 2021-01-12 63 3.714286 365 5.793651 5 26 1 0.015873 0 1 7.619048 134.174603
2 2021-01-15 107 7.813084 1128 10.542056 5 84 0 0.000000 0 0 9.672897 131.700935
3 2021-01-20 81 3.283951 584 7.209877 4 51 0 0.000000 0 0 9.790123 132.222222
4 2021-01-24 98 178.377551 3482 35.530612 7 841 1 0.010204 0 1 8.530612 152.877551
... ... ... ... ... ... ... ... ... ... ... ... ... ...
815 2023-03-15 62 487.145161 2473 39.887097 7 1783 6 0.096774 0 5 8.048387 99.983871
816 2023-03-18 71 110.169014 5432 76.507042 8 2330 1 0.014085 0 1 8.732394 128.929577
817 2023-03-24 66 89.015152 911 13.803030 5 272 24 0.363636 0 24 7.257576 126.484848
818 2023-03-26 95 87.873684 3653 38.452632 8 860 3 0.031579 0 1 8.747368 129.842105
819 2023-03-27 112 16.241071 1740 15.535714 4 789 1 0.008929 0 1 8.732143 110.633929

820 rows × 13 columns

submissions_agg['created_date'] = pd.to_datetime(submissions_agg['created_date'])
submissions_agg = submissions_agg.sort_values(by='created_date')
submissions_agg
created_date total_submissions avg_submission_score total_submission_comments avg_num_comments median_num_comments max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount
542 2021-01-01 52 12.576923 484 9.307692 4 69 0 0.000000 0 0 8.711538 148.692308
543 2021-01-02 109 196.844037 2714 24.899083 5 453 6 0.055046 0 3 8.256881 135.376147
357 2021-01-03 72 125.263889 1572 21.833333 8 483 2 0.027778 0 2 10.083333 119.805556
544 2021-01-04 105 65.438095 2332 22.209524 5 1043 0 0.000000 0 0 8.980952 140.961905
545 2021-01-05 75 3.853333 414 5.520000 4 29 1 0.013333 0 1 7.253333 177.533333
... ... ... ... ... ... ... ... ... ... ... ... ... ...
819 2023-03-27 112 16.241071 1740 15.535714 4 789 1 0.008929 0 1 8.732143 110.633929
540 2023-03-28 78 31.102564 1095 14.038462 4 559 0 0.000000 0 0 9.653846 176.423077
179 2023-03-29 61 190.655738 1519 24.901639 6 721 1 0.016393 0 1 7.983607 110.180328
541 2023-03-30 64 54.234375 829 12.953125 5 182 3 0.046875 0 2 7.953125 116.828125
356 2023-03-31 60 353.800000 1971 32.850000 7 912 2 0.033333 0 1 8.583333 149.166667

820 rows × 13 columns

submissions_agg.to_csv("submissions_agg_daily.csv", index=False)
# Aggregate Comments by Week
comments_agg_weekly = comments.groupBy("WeekOfYear", "Year").agg(
    count("*").alias("total_comments"),
    mean("score").alias("avg_comment_score"),
    mean("body_wordCount").alias("avg_body_wordCount")
).toPandas()
comments_agg_weekly['Year'] = comments_agg_weekly['Year'].astype(int)
comments_agg_weekly['WeekOfYear'] = comments_agg_weekly['WeekOfYear'].astype(int)
comments_agg_weekly = comments_agg_weekly.sort_values(by=['Year', 'WeekOfYear'])
comments_agg_weekly.to_csv("comments_agg_weekly.csv", index=False)
# Aggregate Comments by Date
comments_agg = comments.groupBy("created_date").agg(
    count("*").alias("total_comments"),
    mean("score").alias("avg_comment_score"),
    mean("body_wordCount").alias("avg_body_wordCount")
).toPandas()
comments_agg
created_date total_comments avg_comment_score avg_body_wordCount
0 2023-03-29 3925 7.854777 23.865478
1 2023-03-21 5489 11.155766 26.962653
2 2023-03-20 4703 10.543270 28.540931
3 2021-03-30 4261 6.230462 23.666041
4 2023-02-23 5019 9.108189 24.217972
... ... ... ... ...
814 2021-11-07 6859 7.212422 29.889634
815 2021-09-26 6537 8.915405 27.742389
816 2021-01-29 3966 7.781140 21.598084
817 2021-10-02 6293 10.830288 23.274432
818 2021-10-30 3843 12.745511 25.692428

819 rows × 4 columns

comments_agg['created_date'] = pd.to_datetime(comments_agg['created_date'])
comments_agg = comments_agg.sort_values(by='created_date')
comments_agg
created_date total_comments avg_comment_score avg_body_wordCount
714 2021-01-01 3746 7.749867 19.772824
686 2021-01-02 2858 10.153254 26.478307
338 2021-01-03 2827 5.977361 37.826318
615 2021-01-04 4932 7.981144 25.933496
632 2021-01-05 5204 9.958301 21.860300
... ... ... ... ...
581 2023-03-27 5130 7.277778 28.739376
437 2023-03-28 6712 8.297527 28.252980
0 2023-03-29 3925 7.854777 23.865478
295 2023-03-30 3623 8.449903 20.830803
159 2023-03-31 4575 10.321967 24.299891

819 rows × 4 columns

comments_agg.to_csv("comments_agg_daily.csv", index=False)
stocks_df = stocks_df.toPandas()
# Convert the 'Date' column in stocks_df to datetime if it's not already
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'])
stocks_df = stocks_df.sort_values(by='Date')
# Merge Submissions Aggregate with Stocks Data
combined_df = pd.merge(stocks_df, submissions_agg, left_on='Date', right_on='created_date', how='left')

# Merge Comments Aggregate with Combined Data
combined_df = pd.merge(combined_df, comments_agg, left_on='Date', right_on='created_date', how='left')
combined_df
Unnamed: 0 NTDOY SONY TYO Date Year Month DayOfWeek WeekOfYear created_date_x ... total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount created_date_y total_comments avg_comment_score avg_body_wordCount
0 0 15.740000 100.070000 7.735437 2021-01-04 2021 1 2 1 2021-01-04 ... 0 0.000000 0 0 8.980952 140.961905 2021-01-04 4932.0 7.981144 25.933496
1 1 16.176001 103.110001 7.793598 2021-01-05 2021 1 3 1 2021-01-05 ... 1 0.013333 0 1 7.253333 177.533333 2021-01-05 5204.0 9.958301 21.860300
2 2 15.738000 101.080002 7.968082 2021-01-06 2021 1 4 1 2021-01-06 ... 1 0.012987 0 1 8.571429 153.428571 2021-01-06 4385.0 6.350741 21.136602
3 3 15.802000 102.000000 8.026241 2021-01-07 2021 1 5 1 2021-01-07 ... 0 0.000000 0 0 7.952381 147.063492 2021-01-07 4923.0 8.251270 18.427382
4 4 16.010000 103.989998 8.113484 2021-01-08 2021 1 6 1 2021-01-08 ... 2 0.022222 0 2 8.211111 147.822222 2021-01-08 5072.0 8.819598 21.948935
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 560 9.650000 86.639999 11.965896 2023-03-27 2023 3 2 13 2023-03-27 ... 1 0.008929 0 1 8.732143 110.633929 2023-03-27 5130.0 7.277778 28.739376
561 561 9.610000 85.820000 11.985530 2023-03-28 2023 3 3 13 2023-03-28 ... 0 0.000000 0 0 9.653846 176.423077 2023-03-28 6712.0 8.297527 28.252980
562 562 9.770000 87.870003 12.044427 2023-03-29 2023 3 4 13 2023-03-29 ... 1 0.016393 0 1 7.983607 110.180328 2023-03-29 3925.0 7.854777 23.865478
563 563 9.630000 89.309998 12.054242 2023-03-30 2023 3 5 13 2023-03-30 ... 3 0.046875 0 2 7.953125 116.828125 2023-03-30 3623.0 8.449903 20.830803
564 564 9.690000 90.650002 11.818654 2023-03-31 2023 3 6 13 2023-03-31 ... 2 0.033333 0 1 8.583333 149.166667 2023-03-31 4575.0 10.321967 24.299891

565 rows × 26 columns

# Drop the duplicate 'created_date' columns produced by the merges
combined_df = combined_df.drop(columns=['created_date_x', 'created_date_y'])

# Handling missing values
combined_df = combined_df.fillna({
    'total_submissions': 0, 'avg_submission_score': 0, 'total_submission_comments': 0,
    'total_submission_crossposts': 0, 'avg_num_comments': 0, 'median_num_comments': 0,
    'max_num_comments': 0, 'avg_num_crossposts': 0, 'median_num_crossposts': 0,
    'max_num_crossposts': 0, 'avg_title_wordCount': 0, 'avg_selftext_wordCount': 0,
    'total_comments': 0, 'avg_comment_score': 0, 'avg_body_wordCount': 0
})
combined_df
Unnamed: 0 NTDOY SONY TYO Date Year Month DayOfWeek WeekOfYear total_submissions ... max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount total_comments avg_comment_score avg_body_wordCount
0 0 15.740000 100.070000 7.735437 2021-01-04 2021 1 2 1 105 ... 1043 0 0.000000 0 0 8.980952 140.961905 4932.0 7.981144 25.933496
1 1 16.176001 103.110001 7.793598 2021-01-05 2021 1 3 1 75 ... 29 1 0.013333 0 1 7.253333 177.533333 5204.0 9.958301 21.860300
2 2 15.738000 101.080002 7.968082 2021-01-06 2021 1 4 1 77 ... 23 1 0.012987 0 1 8.571429 153.428571 4385.0 6.350741 21.136602
3 3 15.802000 102.000000 8.026241 2021-01-07 2021 1 5 1 63 ... 22 0 0.000000 0 0 7.952381 147.063492 4923.0 8.251270 18.427382
4 4 16.010000 103.989998 8.113484 2021-01-08 2021 1 6 1 90 ... 66 2 0.022222 0 2 8.211111 147.822222 5072.0 8.819598 21.948935
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 560 9.650000 86.639999 11.965896 2023-03-27 2023 3 2 13 112 ... 789 1 0.008929 0 1 8.732143 110.633929 5130.0 7.277778 28.739376
561 561 9.610000 85.820000 11.985530 2023-03-28 2023 3 3 13 78 ... 559 0 0.000000 0 0 9.653846 176.423077 6712.0 8.297527 28.252980
562 562 9.770000 87.870003 12.044427 2023-03-29 2023 3 4 13 61 ... 721 1 0.016393 0 1 7.983607 110.180328 3925.0 7.854777 23.865478
563 563 9.630000 89.309998 12.054242 2023-03-30 2023 3 5 13 64 ... 182 3 0.046875 0 2 7.953125 116.828125 3623.0 8.449903 20.830803
564 564 9.690000 90.650002 11.818654 2023-03-31 2023 3 6 13 60 ... 912 2 0.033333 0 1 8.583333 149.166667 4575.0 10.321967 24.299891

565 rows × 24 columns

# Merge Submissions Aggregate with Stocks Data
combined_df_weekly = pd.merge(stocks_df, submissions_agg_weekly, on=['Year', 'WeekOfYear'], how='left')

# Merge Comments Aggregate with Combined Data
combined_df_weekly = pd.merge(combined_df_weekly, comments_agg_weekly, on=['Year', 'WeekOfYear'], how='left')

# Handling missing values
combined_df_weekly.fillna({
    'total_submissions': 0, 'avg_submission_score': 0, 'total_submission_comments': 0,
    'total_submission_crossposts': 0, 'avg_num_comments': 0, 'median_num_comments': 0,
    'max_num_comments': 0, 'avg_num_crossposts': 0, 'median_num_crossposts': 0,
    'max_num_crossposts': 0, 'avg_title_wordCount': 0, 'avg_selftext_wordCount': 0,
    'total_comments': 0, 'avg_comment_score': 0, 'avg_body_wordCount': 0
}, inplace=True)
import pandas as pd

# Calculate the start of each week (Monday, since dt.dayofweek is 0 for Monday)
combined_df['WeekStartDate'] = combined_df['Date'] - pd.to_timedelta(combined_df['Date'].dt.dayofweek, unit='d')
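A quick hedged check that the shift above behaves as intended (every WeekStartDate should be a Monday):

# In pandas, dt.dayofweek == 0 means Monday.
assert (combined_df['WeekStartDate'].dt.dayofweek == 0).all()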
combined_df
Unnamed: 0 NTDOY SONY TYO Date Year Month DayOfWeek WeekOfYear total_submissions ... total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount total_comments avg_comment_score avg_body_wordCount WeekStartDate
0 0 15.740000 100.070000 7.735437 2021-01-04 2021 1 2 1 105 ... 0 0.000000 0 0 8.980952 140.961905 4932.0 7.981144 25.933496 2021-01-04
1 1 16.176001 103.110001 7.793598 2021-01-05 2021 1 3 1 75 ... 1 0.013333 0 1 7.253333 177.533333 5204.0 9.958301 21.860300 2021-01-04
2 2 15.738000 101.080002 7.968082 2021-01-06 2021 1 4 1 77 ... 1 0.012987 0 1 8.571429 153.428571 4385.0 6.350741 21.136602 2021-01-04
3 3 15.802000 102.000000 8.026241 2021-01-07 2021 1 5 1 63 ... 0 0.000000 0 0 7.952381 147.063492 4923.0 8.251270 18.427382 2021-01-04
4 4 16.010000 103.989998 8.113484 2021-01-08 2021 1 6 1 90 ... 2 0.022222 0 2 8.211111 147.822222 5072.0 8.819598 21.948935 2021-01-04
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 560 9.650000 86.639999 11.965896 2023-03-27 2023 3 2 13 112 ... 1 0.008929 0 1 8.732143 110.633929 5130.0 7.277778 28.739376 2023-03-27
561 561 9.610000 85.820000 11.985530 2023-03-28 2023 3 3 13 78 ... 0 0.000000 0 0 9.653846 176.423077 6712.0 8.297527 28.252980 2023-03-27
562 562 9.770000 87.870003 12.044427 2023-03-29 2023 3 4 13 61 ... 1 0.016393 0 1 7.983607 110.180328 3925.0 7.854777 23.865478 2023-03-27
563 563 9.630000 89.309998 12.054242 2023-03-30 2023 3 5 13 64 ... 3 0.046875 0 2 7.953125 116.828125 3623.0 8.449903 20.830803 2023-03-27
564 564 9.690000 90.650002 11.818654 2023-03-31 2023 3 6 13 60 ... 2 0.033333 0 1 8.583333 149.166667 4575.0 10.321967 24.299891 2023-03-27

565 rows × 25 columns

combined_df = combined_df.drop(columns=['Unnamed: 0'])  # drop the residual CSV index column
combined_df
NTDOY SONY TYO Date Year Month DayOfWeek WeekOfYear total_submissions avg_submission_score ... total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount total_comments avg_comment_score avg_body_wordCount WeekStartDate
0 15.740000 100.070000 7.735437 2021-01-04 2021 1 2 1 105 65.438095 ... 0 0.000000 0 0 8.980952 140.961905 4932.0 7.981144 25.933496 2021-01-04
1 16.176001 103.110001 7.793598 2021-01-05 2021 1 3 1 75 3.853333 ... 1 0.013333 0 1 7.253333 177.533333 5204.0 9.958301 21.860300 2021-01-04
2 15.738000 101.080002 7.968082 2021-01-06 2021 1 4 1 77 3.090909 ... 1 0.012987 0 1 8.571429 153.428571 4385.0 6.350741 21.136602 2021-01-04
3 15.802000 102.000000 8.026241 2021-01-07 2021 1 5 1 63 3.238095 ... 0 0.000000 0 0 7.952381 147.063492 4923.0 8.251270 18.427382 2021-01-04
4 16.010000 103.989998 8.113484 2021-01-08 2021 1 6 1 90 19.055556 ... 2 0.022222 0 2 8.211111 147.822222 5072.0 8.819598 21.948935 2021-01-04
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 9.650000 86.639999 11.965896 2023-03-27 2023 3 2 13 112 16.241071 ... 1 0.008929 0 1 8.732143 110.633929 5130.0 7.277778 28.739376 2023-03-27
561 9.610000 85.820000 11.985530 2023-03-28 2023 3 3 13 78 31.102564 ... 0 0.000000 0 0 9.653846 176.423077 6712.0 8.297527 28.252980 2023-03-27
562 9.770000 87.870003 12.044427 2023-03-29 2023 3 4 13 61 190.655738 ... 1 0.016393 0 1 7.983607 110.180328 3925.0 7.854777 23.865478 2023-03-27
563 9.630000 89.309998 12.054242 2023-03-30 2023 3 5 13 64 54.234375 ... 3 0.046875 0 2 7.953125 116.828125 3623.0 8.449903 20.830803 2023-03-27
564 9.690000 90.650002 11.818654 2023-03-31 2023 3 6 13 60 353.800000 ... 2 0.033333 0 1 8.583333 149.166667 4575.0 10.321967 24.299891 2023-03-27

565 rows × 24 columns

combined_df.to_csv("combined_ml_daily.csv", index=False)
# Group by week and calculate sum and mean
agg_columns = {
    'total_submissions': 'sum',
    'total_comments': 'sum',
    'NTDOY': 'mean',
    'SONY': 'mean',
    'TYO': 'mean',
}
weekly_df = combined_df.groupby(pd.Grouper(key='WeekStartDate', freq='W')).agg(agg_columns).reset_index()
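Since every WeekStartDate is already a Monday, pd.Grouper(freq='W') mostly relabels each group by the Sunday that closes the week (the default 'W' anchor is W-SUN), which is why the dates in the weekly table below are Sundays. A hedged, simpler alternative (weekly_df_alt is an illustrative name) keeps the Monday labels:

# Hedged sketch: group directly on the Monday week label instead of re-binning by frequency.
weekly_df_alt = combined_df.groupby('WeekStartDate', as_index=False).agg(agg_columns)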

Exploratory Data Analysis

weekly_df
WeekStartDate total_submissions total_comments NTDOY SONY TYO
0 2021-01-10 410 24516.0 15.8932 102.050000 7.927368
1 2021-01-17 402 28040.0 15.8692 103.445999 8.154197
2 2021-01-24 309 20095.0 15.6015 102.195002 8.118330
3 2021-01-31 408 22147.0 14.9092 97.870000 7.950633
4 2021-02-07 435 22889.0 15.0080 107.333998 8.107668
... ... ... ... ... ... ...
112 2023-03-05 512 42123.0 9.3940 83.927998 13.407248
113 2023-03-12 283 25753.0 9.4020 86.634000 13.191847
114 2023-03-19 294 24599.0 9.4860 85.529999 12.037221
115 2023-03-26 376 23007.0 9.5700 88.206000 11.796666
116 2023-04-02 375 23965.0 9.6700 88.058000 11.973750

117 rows × 6 columns

weekly_df.to_csv("combined_ml_pokemon_weekly.csv", index=False)

EDA: Number of Comments and Submissions in r/pokemon with Stock Prices by Week

import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Create subplots with a 2x1 grid
fig = make_subplots(rows=2, cols=1, specs=[[{"secondary_y": True}],[{"secondary_y": True}]])

# Add traces for the first subplot (Submissions and Stock Prices)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['total_submissions'], marker_color='#d13a47', opacity=.65, name="Submissions"), row=1, col=1, secondary_y=False)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['NTDOY'], marker_color='#f7c200', opacity=.65, name="NTDOY"), row=1, col=1, secondary_y=True)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['TYO'], marker_color='#42a63c', opacity=.65, name="TYO"), row=1, col=1, secondary_y=True)

# Add traces for the second subplot (Comments and Stock Prices)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['total_comments'], marker_color='#42a1b9', opacity=.65, name="Comments"), row=2, col=1, secondary_y=False)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['NTDOY'], marker_color='#f7c200', opacity=.65, name="NTDOY", showlegend=False), row=2, col=1, secondary_y=True)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['TYO'], marker_color='#42a63c', opacity=.65, name="TYO", showlegend=False), row=2, col=1, secondary_y=True)

# Update the y-axis labels
fig.update_yaxes(title_text="Submissions", secondary_y=False, row=1, col=1)
fig.update_yaxes(title_text="Mean Stock Prices", secondary_y=True, row=1, col=1)
fig.update_yaxes(title_text="Comments", secondary_y=False, row=2, col=1)
fig.update_yaxes(title_text="Mean Stock Prices", secondary_y=True, row=2, col=1)

# Update the layout for the whole figure
fig.update_layout(
    title='Weekly comments and submissions in r/pokemon alongside mean stock prices',
    # xaxis={'title': 'Date by week'},
    paper_bgcolor='#FFFFFF', 
    plot_bgcolor='rgba(0,0,0,0)',
)

# Show the figure
fig.show()

Table: Summary Statistics

# Summary statistics
print(combined_df_weekly.describe())
# Summary statistics
combined_df.describe()
NTDOY SONY TYO Year Month DayOfWeek WeekOfYear total_submissions avg_submission_score total_submission_comments ... max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount total_comments avg_comment_score avg_body_wordCount
count 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 ... 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000
mean 12.071745 96.598000 10.049639 2021.663717 6.070796 4.038938 24.392920 92.021239 31.223070 1759.134513 ... 494.194690 2.132743 0.027103 0.003540 0.902655 8.638694 135.312197 6839.185841 9.737450 24.158624
std 1.817426 15.218937 1.870428 0.665921 3.542840 1.393465 15.428097 64.887297 61.358436 2617.569597 ... 813.161999 6.664057 0.090783 0.059444 1.853522 0.740156 29.318434 3211.173967 2.846059 4.692633
min 9.310000 63.169998 7.735437 2021.000000 1.000000 2.000000 1.000000 32.000000 1.920000 216.000000 ... 18.000000 0.000000 0.000000 0.000000 0.000000 6.333333 59.706897 0.000000 0.000000 0.000000
25% 10.644000 85.230003 8.443065 2021.000000 3.000000 3.000000 10.000000 60.000000 3.660377 554.000000 ... 54.000000 0.000000 0.000000 0.000000 0.000000 8.146667 116.204545 4844.000000 7.762292 21.451534
50% 11.550000 98.250000 9.063451 2022.000000 6.000000 4.000000 23.000000 76.000000 6.578947 859.000000 ... 132.000000 1.000000 0.007092 0.000000 1.000000 8.625000 132.426966 5854.000000 9.380717 23.364365
75% 13.578000 107.080002 11.691803 2022.000000 9.000000 5.000000 38.000000 98.000000 27.741935 1648.000000 ... 695.000000 2.000000 0.020000 0.000000 1.000000 9.080357 148.824742 7965.000000 11.155766 26.304678
max 16.320000 128.589996 14.404585 2023.000000 12.000000 6.000000 52.000000 897.000000 743.078947 31995.000000 ... 7470.000000 72.000000 1.096154 1.000000 29.000000 11.112903 409.982906 33657.000000 23.439061 40.911036

8 rows × 22 columns

combined_statistics_table = combined_df.describe()
# Keep the index so the statistic labels (count, mean, std, ...) survive the export
combined_statistics_table.to_csv("combined_statistics_table.csv", index=True)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 173, Finished, Available)

EDA: Correlation Heatmap

import plotly.figure_factory as ff
import numpy as np

# Calculate correlation matrix
corr_matrix = combined_df.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Apply the mask to the correlation matrix
corr_matrix_masked = corr_matrix.mask(mask)

# Create the heatmap
fig = ff.create_annotated_heatmap(
    z=corr_matrix_masked.to_numpy(),
    x=corr_matrix_masked.columns.tolist(),
    y=corr_matrix_masked.columns.tolist(),
    colorscale='Sunset',
    showscale=True,
    annotation_text=corr_matrix_masked.round(2).to_numpy()
)

# Update the layout
fig.update_layout(
    title='Correlation Heatmap',
    xaxis=dict(tickmode='array', tickvals=np.arange(len(corr_matrix.columns)), ticktext=corr_matrix.columns),
    yaxis=dict(tickmode='array', tickvals=np.arange(len(corr_matrix.columns)), ticktext=corr_matrix.columns),
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='rgba(0,0,0,0)',
)

# Show the figure
fig.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 108, Finished, Available)
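One cosmetic wrinkle in the annotated heatmap above: the masked upper triangle renders as literal 'nan' annotation labels. A small touch-up sketch (my addition, same data and colorscale) blanks those cells:

# Replace 'nan' annotation labels in the masked upper triangle with blanks
z = corr_matrix_masked.to_numpy()
annotations = np.where(np.isnan(z), '', np.round(z, 2).astype(str))
fig = ff.create_annotated_heatmap(
    z=z,
    x=corr_matrix_masked.columns.tolist(),
    y=corr_matrix_masked.columns.tolist(),
    colorscale='Sunset',
    showscale=True,
    annotation_text=annotations,
)
fig.show()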
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Calculate the correlation matrix
corr_matrix = combined_df.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(20, 16))

# Draw the heatmap with the mask
sns.heatmap(corr_matrix, mask=mask, cmap=sns.cubehelix_palette(as_cmap=True), annot=True, fmt=".2f", linewidths=.5, cbar_kws={"shrink": .5})

# Beautify the plot
plt.title('Correlation Heatmap')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 111, Finished, Available)

Model Building

Linear Regression - Predicting ‘NTDOY’ Stock Prices

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Predicting 'NTDOY' stock prices
features = ['total_submissions', 'avg_submission_score', 'total_comments', 'avg_comment_score', 'avg_num_crossposts', 'total_submission_crossposts']

X = combined_df[features]
y = combined_df['NTDOY']
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 139, Finished, Available)
# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 140, Finished, Available)
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 141, Finished, Available)
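A caveat on the two cells above: the scaler was fit on all of X before the chronological split, so test-window statistics leak into the training transform. A minimal leakage-free sketch using sklearn's Pipeline (the variable names here are mine):

from sklearn.pipeline import make_pipeline

# Split first, then let the pipeline fit the scaler on the training window only
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, shuffle=False)
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X_tr, y_tr)
print(f'Leakage-free R-squared on the held-out window: {pipe.score(X_te, y_te)}')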
# Model training
model = LinearRegression()
model.fit(X_train, y_train)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 142, Finished, Available)
LinearRegression()
# Prediction and evaluation
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

# R-squared
r_squared = model.score(X_test, y_test)
print(f'R-squared: {r_squared}')

# AIC
n = len(X_test)
k = X.shape[1] + 1
RSS = np.sum((y_test - y_pred) ** 2)
AIC = n * np.log(RSS/n) + 2 * k
print(f'AIC: {AIC}')
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 143, Finished, Available)
RMSE: 2.2354823027643596
R-squared: -25.95855276387089
AIC: 195.80728204640792

An R-squared this far below zero means the model's squared error on the held-out window is roughly 27 times that of simply predicting the test-window mean, so these Reddit activity features do not track NTDOY price levels out of sample.
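Since R-squared benchmarks the model against the mean of the held-out window, reproducing that baseline explicitly (my addition, not part of the original analysis) shows what the negative value is measuring; from the numbers above, the model's RMSE is about five times the trivial baseline's (2.24 vs. roughly 0.43):

# Constant predictor at the test-window mean: the reference that R-squared uses
baseline_pred = np.full_like(y_test, y_test.mean(), dtype=float)
baseline_rmse = mean_squared_error(y_test, baseline_pred, squared=False)
print(f'Test-mean baseline RMSE: {baseline_rmse}')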
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Prediction and evaluation
y_pred = model.predict(X_test)

# R-squared
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

# AIC for Linear Regression
n = X_test.shape[0]  # Number of observations
k = X_train.shape[1] + 1  # Number of features + 1 for the intercept
RSS = np.sum((y_test - y_pred) ** 2)
AIC = n * np.log(RSS/n) + 2 * k
print(f'AIC: {AIC}')

# RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 144, Finished, Available)
R-squared: -25.95855276387089
AIC: 195.80728204640792
RMSE: 2.2354823027643596
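A single chronological split gives a noisy estimate; a walk-forward cross-validation sketch with sklearn's TimeSeriesSplit (my addition; for brevity it reuses the globally scaled X_scaled, which the pipeline sketch above would avoid):

from sklearn.model_selection import TimeSeriesSplit

# Each fold trains on the past and tests on the immediate future
tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_scaled)):
    lr = LinearRegression().fit(X_scaled[train_idx], y.iloc[train_idx])
    fold_rmse = mean_squared_error(y.iloc[test_idx], lr.predict(X_scaled[test_idx]), squared=False)
    print(f'Fold {fold}: RMSE = {fold_rmse:.3f}')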

ARIMA Time Series Model - Predicting ‘NTDOY’ Stock Prices

from statsmodels.tsa.stattools import adfuller

def check_stationarity(series):
    result = adfuller(series)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))

# Check if 'NTDOY' stock prices are stationary
check_stationarity(combined_df['NTDOY'].dropna())
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 145, Finished, Available)
ADF Statistic: -1.868510
p-value: 0.347017
Critical Values:
    1%: -3.442
    5%: -2.867
    10%: -2.570
from statsmodels.tsa.stattools import adfuller

result = adfuller(combined_df['NTDOY'].dropna())
print('ADF Statistic:', result[0])
print('p-value:', result[1])
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 146, Finished, Available)
ADF Statistic: -1.868510294901256
p-value: 0.34701672354064506

With p ≈ 0.35 we cannot reject the null hypothesis of a unit root at any conventional level, so the NTDOY level series is non-stationary and is first-differenced before fitting ARIMA.
# First-difference the series; pandas index alignment leaves the first row NaN either way
combined_df['NTDOY_diff'] = combined_df['NTDOY'].diff()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 147, Finished, Available)
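A natural follow-up, not run in the original: re-apply the ADF test to the differenced series to confirm that a single difference is enough.

# Re-test stationarity after differencing, reusing check_stationarity from above;
# a p-value near zero would confirm that d = 1 suffices
check_stationarity(combined_df['NTDOY_diff'].dropna())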
from statsmodels.tsa.arima.model import ARIMA

# 'NTDOY_diff' is the differenced series
# Drop NaN values that were created due to differencing
df_for_arima = combined_df[['NTDOY_diff']].dropna()

# Fit ARIMA with order (p=1, d=1, q=1).
# Note: because the input is already first-differenced, d=1 here differences
# a second time, making this equivalent to ARIMA(1, 2, 1) on the raw prices.
model = ARIMA(df_for_arima, order=(1, 1, 1))
model_fit = model.fit()

# Summary of the model
print(model_fit.summary())
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 168, Finished, Available)
/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:578: ValueWarning:

An unsupported index was provided and will be ignored when e.g. forecasting.
                               SARIMAX Results                                
==============================================================================
Dep. Variable:             NTDOY_diff   No. Observations:                  564
Model:                 ARIMA(1, 1, 1)   Log Likelihood                  87.867
Date:                Mon, 27 Nov 2023   AIC                           -169.735
Time:                        23:38:28   BIC                           -156.735
Sample:                             0   HQIC                          -164.660
                                - 564                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0998      0.037     -2.704      0.007      -0.172      -0.027
ma.L1         -0.9999      0.469     -2.131      0.033      -1.920      -0.080
sigma2         0.0424      0.020      2.126      0.033       0.003       0.081
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                70.51
Prob(Q):                              0.99   Prob(JB):                         0.00
Heteroskedasticity (H):               0.34   Skew:                            -0.06
Prob(H) (two-sided):                  0.00   Kurtosis:                         4.73
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
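The ma.L1 coefficient sitting at -0.9999 is a classic symptom of over-differencing, consistent with the note above. A minimal alternative sketch, assuming combined_df['Date'] holds daily trading dates (the asfreq/interpolate gap-fill for non-trading days is my assumption): fit on the level series with a proper DatetimeIndex, let d=1 do the differencing internally, and the ValueWarnings disappear while forecasts come back directly in price levels.

# Fit ARIMA(1, 1, 1) on NTDOY levels with a DatetimeIndex
ntdoy_levels = (
    combined_df.set_index('Date')['NTDOY']
    .asfreq('D')        # regular daily frequency; inserts NaN on non-trading days
    .interpolate()      # assumed gap-fill for weekends/holidays
)
level_fit = ARIMA(ntdoy_levels, order=(1, 1, 1)).fit()
print(level_fit.forecast(steps=5))  # five-day-ahead NTDOY price levels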
# Forecasting
forecast = model_fit.forecast(steps=5)  # forecast next 5 periods
forecast
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 169, Finished, Available)
/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:376: ValueWarning:

No supported index is available. Prediction results will be given with an integer index beginning at `start`.
564   -0.017879
565   -0.010103
566   -0.010879
567   -0.010802
568   -0.010810
Name: predicted_mean, dtype: float64
forecast_values = forecast.to_list()  # Convert forecast to a list

# Generate future dates starting from the day after the last date
last_date = combined_df['Date'].max()
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=len(forecast_values))

# Pair forecast values with future dates
forecast_with_dates = pd.Series(data=forecast_values, index=future_dates)
forecast_with_dates
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 170, Finished, Available)
2023-04-01   -0.017879
2023-04-02   -0.010103
2023-04-03   -0.010879
2023-04-04   -0.010802
2023-04-05   -0.010810
Freq: D, dtype: float64
# Get the last actual value
last_actual_value = combined_df['NTDOY'].iloc[-1]

# Cumulatively sum the forecasted differences
cumulative_sum = forecast_with_dates.cumsum()

# Revert the differencing
reverted_forecast = last_actual_value + cumulative_sum

# Now, 'reverted_forecast' contains the forecasted NTDOY prices
print(reverted_forecast)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 171, Finished, Available)
2023-04-01    9.672121
2023-04-02    9.662018
2023-04-03    9.651138
2023-04-04    9.640336
2023-04-05    9.629527
Freq: D, dtype: float64
from sklearn.metrics import mean_squared_error
from math import sqrt

# Manually entered actual NTDOY prices for the five forecast dates
y_true_values = [9.69, 9.69, 9.74, 10.18, 9.94]
y_true_dates = pd.date_range(start="2023-04-01", end="2023-04-05")
y_true = pd.Series(data=y_true_values, index=y_true_dates)

# Calculate RMSE
rmse = sqrt(mean_squared_error(y_true, reverted_forecast))
print(f'RMSE: {rmse}')
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 172, Finished, Available)
RMSE: 0.281648511155487

An RMSE of about 0.28 against actual prices in the $9.69-$10.18 range is under 3% of the price level over the five-day horizon.
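For added context (my calculation, not in the original), the same comparison expressed as a mean absolute percentage error:

# MAPE over the 5-day horizon; y_true and reverted_forecast align by date index
mape = float(np.mean(np.abs((y_true - reverted_forecast) / y_true)) * 100)
print(f'MAPE: {mape:.2f}%')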