Code: ML-Topic 10

spark

SparkSession (hive): version v3.2.2.5.1-100879434, master yarn, app name "Azure ML Experiment"
import pyspark.sql.functions as F
from pyspark.sql.functions import (
    sum as _sum, mean, stddev, max as _max, min as _min, count,
    percentile_approx, year, month, dayofmonth, ceil, col, dayofweek,
    hour, explode, date_format, lower, size, split, regexp_replace,
    isnan, when,
)

Read full Reddit data

blob_account_name = "marckvnonprodblob"
blob_container_name = "bigdata"
# read-only SAS token (time-limited)
blob_sas_token = "?sv=2021-10-04&st=2023-10-04T01%3A42%3A59Z&se=2024-01-02T02%3A42%3A00Z&sr=c&sp=rlf&sig=w3CH9MbCOpwO7DtHlrahc7AlRPxSZZb8MOgS6TaXLzI%3D"

wasbs_base_url = (
    f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/"
)
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)
comments_path = "reddit-parquet/comments/"
submissions_path = "reddit-parquet/submissions/"
comments_df = spark.read.parquet(f"{wasbs_base_url}{comments_path}")
submissions_df = spark.read.parquet(f"{wasbs_base_url}{submissions_path}")
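Optionally, the reads can be sanity-checked before filtering; printSchema() touches only file metadata, so it stays cheap even on the full dataset:

# Cheap sanity check: confirm the parquet reads resolved and inspect the column types.
submissions_df.printSchema()
comments_df.printSchema()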
submissions = submissions_df.filter(submissions_df.subreddit == "pokemon")
comments = comments_df.filter(comments_df.subreddit == "pokemon")
sub = submissions.select("subreddit", "author", "author_flair_text", "created_utc", "title", "selftext", "num_comments", "num_crossposts", "over_18", "score", "stickied", "id")
sub.show()
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
|subreddit|           author|   author_flair_text|        created_utc|               title|            selftext|num_comments|num_crossposts|over_18|score|stickied|     id|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
|  pokemon|  OneWhoGetsBread|                null|2023-02-22 21:08:56|the PokemonTogeth...|So several days a...|           0|             0|  false|    6|   false|119d8jo|
|  pokemon|          lumixod|                null|2023-02-22 21:13:47|Wholesome friendship|                    |          69|             0|  false| 1947|   false|119dfv9|
|  pokemon|         KubfuKid|                null|2023-02-22 21:16:45|(OC) Diego the Gh...|                    |           2|             0|  false|   18|   false|119dkh9|
|  pokemon|blackjackgabbiani|                null|2023-02-22 21:16:58|Who's a non-villa...|For me, Tyme insp...|         372|             0|  false|  406|   false|119dkrj|
|  pokemon|        [deleted]|                null|2023-02-22 21:17:16|Zard Hunting in B...|           [removed]|           0|             0|  false|    1|   false|119dl6k|
|  pokemon|Totallynotttegegg|                null|2023-02-22 21:17:58|i have a realization|​\n\n[...|           0|             0|  false|    5|   false|119dm5g|
|  pokemon|          MeiLei-|:906::722::495::2...|2023-02-22 21:18:19|bored in class. h...|                    |           6|             0|  false|    0|   false|119dmnh|
|  pokemon|        [deleted]|                null|2023-02-22 21:19:53|Original 151, cau...|           [deleted]|          15|             0|  false|  153|   false|119dp0i|
|  pokemon|       gravemind9|                null|2023-02-22 21:25:09|not a single thou...|                    |           3|             0|  false|    6|   false|119dx5q|
|  pokemon|  coolnessAlert39|                null|2023-02-22 21:26:11|Name any Bug type...|Ok now we’re doin...|          32|             0|  false|    0|   false|119dymy|
|  pokemon|    War_Daddy_992|                null|2023-02-22 21:26:21|Team Magma create...|                    |          42|             0|  false|  262|   false|119dyv4|
|  pokemon|         Redd_ddd|                null|2023-02-22 21:28:13|I gave Ghetsis a ...|                    |          28|             0|  false|   88|   false|119e1e5|
|  pokemon|       Ill-Ad3844|customise me! :02...|2023-02-21 05:28:13|My starters for e...|Gen 1: Charizard ...|           1|             0|  false|    0|   false|117vi5f|
|  pokemon|    DenseRead9852|                null|2023-02-21 05:33:56|I really fucking ...|I feel like it's ...|          13|             0|  false|    0|   false|117vluc|
|  pokemon|        [deleted]|                null|2023-02-21 05:36:05|Opinion on Pokemo...|           [removed]|           0|             0|  false|    1|   false|117vn8n|
|  pokemon|       UncleQnobi|                null|2023-02-21 05:43:01|     Team Gyarados🥶|                    |           0|             0|  false|    1|   false|117vrkl|
|  pokemon|  eyemcantoeknees|                null|2023-02-21 05:45:19|I made Tinkaton o...|                    |          10|             0|  false|  192|   false|117vszo|
|  pokemon|        [deleted]|                null|2023-02-21 05:48:49|Tough question;If...|           [removed]|           0|             0|  false|    1|   false|117vv70|
|  pokemon|        [deleted]|                null|2023-02-21 05:49:41|        Vaporeon...?|           [deleted]|           1|             0|  false|    0|   false|117vvqi|
|  pokemon|         Attakrus|                null|2023-02-21 05:51:46|Tough question;If...|           [removed]|          29|             0|  false|    1|   false|117vx29|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+
only showing top 20 rows
com = comments.select("subreddit", "author", "author_flair_text", "created_utc", "body", "controversiality", "score",  "parent_id", "stickied", "link_id", "id")
com.show()
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|subreddit|              author|   author_flair_text|        created_utc|                body|controversiality|score| parent_id|stickied|  link_id|     id|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
|  pokemon|            KatonRyu|                null|2021-06-23 08:33:18|I still play like...|               0|    1| t3_o5h6hg|   false|t3_o5h6hg|h2qxnm1|
|  pokemon|        purejackbaby|                null|2021-06-23 08:34:09|Could you like, g...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qxpjq|
|  pokemon|Nothingyoudoisawaste|                null|2021-06-23 08:34:16|It felt forced - ...|               0|    2|t1_gzxgl8w|   false|t3_nnyg0q|h2qxpu5|
|  pokemon|              PuxOf_|                null|2021-06-23 08:34:19|doing challenge m...|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qxpxh|
|  pokemon|           [deleted]|                null|2021-06-23 08:34:34|           [removed]|               0|    1| t3_o67wf5|   false|t3_o67wf5|h2qxqik|
|  pokemon|        ProbablyPuma|       I am Rain Boi|2021-06-23 08:37:47|Yall its supposed...|               0|    1| t3_o66cwy|   false|t3_o66cwy|h2qxxrd|
|  pokemon|           [deleted]|                null|2021-06-23 08:38:13|           [removed]|               0|    1|t1_h2pogtp|   false|t3_o609sa|h2qxyp7|
|  pokemon|        Fishsticks03|:gm: Avenge the F...|2021-06-23 08:38:24|         demon prawn|               0|    1| t3_o5myo5|   false|t3_o5myo5|h2qxz3e|
|  pokemon|     DeadpoolCroatia|                null|2021-06-23 08:40:30|Miltank was never...|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qy3pv|
|  pokemon|           lofi_rico|                null|2021-06-23 08:40:43|I can't believe N...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qy479|
|  pokemon|Aggravating-Public15|                null|2021-06-23 08:41:25|I guess that’s tr...|               0|    1|t1_h2pynet|   false|t3_o589jq|h2qy5nj|
|  pokemon|    ElGatoGuerrero72|                null|2021-06-23 08:42:15|I feel like Steph...|               0|    9| t3_o65117|   false|t3_o65117|h2qy7d7|
|  pokemon|          Long__Jump|                null|2021-06-23 08:42:45|Shiny Bibarel as ...|               0|    4| t3_o5wra5|   false|t3_o5wra5|h2qy8d4|
|  pokemon|         Aryallie_18|                null|2021-06-23 08:43:25|These are adorabl...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qy9tl|
|  pokemon|           Jankufood|                null|2021-06-23 08:44:46|If the original P...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycm4|
|  pokemon|          Snoo_58305|                null|2021-06-23 08:44:47|That’s really ama...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycnm|
|  pokemon|    Bringbackdigimon|                null|2021-06-23 08:44:51|Don’t give gamefr...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycs5|
|  pokemon|Aggravating-Public15|                null|2021-06-23 08:44:55|I loved gen 5 lol...|               0|    4| t3_o67fvr|   false|t3_o67fvr|h2qycy3|
|  pokemon|           [deleted]|                null|2021-06-23 08:45:21|           [removed]|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qydtu|
|  pokemon|     MoonLightScreen|WHY WASN'T I IN MOON|2021-06-23 08:45:49|Well, naturally. ...|               0|    3| t3_o64dri|   false|t3_o64dri|h2qyeuv|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+
only showing top 20 rows

Data Cleaning

  1. Filter out corrupted records
  2. Clean the text data
  3. Create new columns (date, year, hour, day of week, month, cleaned text, word count)
    • Compute a word count on the cleaned text for each record
sub = sub.filter(
    (col('title') != '') &
    (col('title') != '[deleted]') &
    (col('title') != '[removed]') &
    (col('selftext') != '') &
    (col('selftext') != '[deleted]') &
    (col('selftext') != '[removed]') &
    (col('author') != '[deleted]') &
    (col('author') != '[removed]')
)
com = com.filter(
    (col('body') != '') &
    (col('body') != '[deleted]') &
    (col('body') != '[removed]') &
    (col('author') != '[deleted]') &
    (col('author') != '[removed]')
)
sub_cleaned = (
    sub
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd")) # create date column
    .withColumn("created_hour", hour("created_utc")) # create hour column
    .withColumn("created_week", dayofweek("created_utc")) # create day of the week column
    .withColumn("created_month", month("created_utc")) # create month of the year column
    .withColumn("created_year", year("created_utc")) # create the year column
    .withColumn("title", lower(col('title'))) # text cleaning: lowercase
    .withColumn("selftext", lower(col('selftext'))) # text cleaning: lowercase
    .withColumn("cleaned_title", regexp_replace(col('title'), r'[^a-zA-Z0-9\s]', '')) # text cleaning: only contain words or number
    .withColumn("cleaned_title", regexp_replace(col('cleaned_title'), r'\s+', ' ')) # text cleaning: remove extra space in text
    .withColumn('title_wordCount', size(split(col('cleaned_title'), ' '))) # word count
    .withColumn("cleaned_selftext", regexp_replace(col('selftext'), r'[^a-zA-Z0-9\s]', '')) # text cleaning: only contain words or number
    .withColumn("cleaned_selftext", regexp_replace(col('cleaned_selftext'), r'\s+', ' ')) # text cleaning: remove extra space in text
    .withColumn('selftext_wordCount', size(split(col('cleaned_selftext'), ' '))) # word count
)
com_cleaned = (
    com
    .withColumn("created_date", date_format("created_utc", "yyyy-MM-dd")) # create date column
    .withColumn("created_hour", hour("created_utc")) # create hour column
    .withColumn("created_week", dayofweek("created_utc")) # create day of the week column
    .withColumn("created_month", month("created_utc")) # create month of the year column
    .withColumn("created_year", year("created_utc")) # create the year column
    .withColumn("body", lower(col('body'))) # text cleaning: lowercase
    .withColumn("cleaned", regexp_replace(col('body'), r'[^a-zA-Z0-9\s]', '')) # text cleaning: only contain words or number
    .withColumn("cleaned", regexp_replace(col('cleaned'), r'\s+', ' ')) # text cleaning: remove extra space in text
    .withColumn('body_wordCount', size(split(col('cleaned'), ' '))) # word count
)
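One caveat on the word counts above: split on an empty string yields a single empty token, so a record whose cleaned text ends up empty (for example, text that was all punctuation) is counted as one word. A minimal hedged guard, reusing the column names above:

# Hedged sketch: report 0 words for empty cleaned text, otherwise count tokens.
com_cleaned = com_cleaned.withColumn(
    "body_wordCount",
    when(col("cleaned") == "", 0).otherwise(size(split(col("cleaned"), " ")))
)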
sub_cleaned.show()
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
|subreddit|           author|   author_flair_text|        created_utc|               title|            selftext|num_comments|num_crossposts|over_18|score|stickied|     id|created_date|created_hour|created_week|created_month|created_year|       cleaned_title|title_wordCount|    cleaned_selftext|selftext_wordCount|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
|  pokemon|  OneWhoGetsBread|                null|2023-02-22 21:08:56|the pokemontogeth...|so several days a...|           0|             0|  false|    6|   false|119d8jo|  2023-02-22|          21|           4|            2|        2023|the pokemontogeth...|              3|so several days a...|               241|
|  pokemon|blackjackgabbiani|                null|2023-02-22 21:16:58|who's a non-villa...|for me, tyme insp...|         372|             0|  false|  406|   false|119dkrj|  2023-02-22|          21|           4|            2|        2023|whos a nonvillain...|              9|for me tyme inspi...|               122|
|  pokemon|Totallynotttegegg|                null|2023-02-22 21:17:58|i have a realization|​\n\n[...|           0|             0|  false|    5|   false|119dm5g|  2023-02-22|          21|           4|            2|        2023|i have a realization|              4|ampx200b brock is...|                 6|
|  pokemon|  coolnessAlert39|                null|2023-02-22 21:26:11|name any bug type...|ok now we’re doin...|          32|             0|  false|    0|   false|119dymy|  2023-02-22|          21|           4|            2|        2023|name any bug type...|             15|ok now were doing...|                81|
|  pokemon|       Ill-Ad3844|customise me! :02...|2023-02-21 05:28:13|my starters for e...|gen 1: charizard ...|           1|             0|  false|    0|   false|117vi5f|  2023-02-21|           5|           3|            2|        2023|my starters for e...|              5|gen 1 charizard g...|                50|
|  pokemon|    DenseRead9852|                null|2023-02-21 05:33:56|i really fucking ...|i feel like it's ...|          13|             0|  false|    0|   false|117vluc|  2023-02-21|           5|           3|            2|        2023|i really fucking ...|             24|i feel like its f...|                65|
|  pokemon|      Shiny_Hypno|                null|2023-02-21 06:01:24|one of the follow...|i came up with th...|           0|             0|  false|    3|   false|117w3da|  2023-02-21|           6|           3|            2|        2023|one of the follow...|             16|i came up with th...|                91|
|  pokemon|     EmiKoizuwumi|                null|2023-02-21 06:34:43|emi's daily pokem...|[art by me!](http...|           0|             0|  false|    5|   false|117wo87|  2023-02-21|           6|           3|            2|        2023|emis daily pokemo...|             10|art by mehttpspre...|               530|
|  pokemon|       sirius1208|                null|2023-02-21 05:07:59|first time ev tra...|i'm playing throu...|           5|             0|  false|    2|   false|117v4t1|  2023-02-21|           5|           3|            2|        2023|first time ev tra...|              4|im playing throug...|                84|
|  pokemon|      Mr-Green129|                null|2023-02-21 19:35:05|monotype run for ...|i have had shield...|           2|             0|  false|    1|   false|118d5sj|  2023-02-21|          19|           3|            2|        2023|monotype run for ...|              5|i have had shield...|                51|
|  pokemon|         Sgrinfio|                null|2021-02-23 08:03:16|what's the differ...|i always thought ...|           8|             0|  false|    5|   false| lqcz1n|  2021-02-23|           8|           3|            2|        2021|whats the differe...|              7|i always thought ...|                96|
|  pokemon|     superharry24|                null|2021-02-23 03:37:09|which anime serie...|[removed]\n\n[vie...|           0|             0|  false|    1|   false| lq86lq|  2021-02-23|           3|           3|            2|        2021|which anime serie...|              8|removed view poll...|                 3|
|  pokemon|     superharry24|                null|2021-02-23 03:39:29|which anime serie...|[removed]\n\n[vie...|           0|             0|  false|    1|   false| lq885n|  2021-02-23|           3|           3|            2|        2021|which anime serie...|              8|removed view poll...|                 3|
|  pokemon|     superharry24|                null|2021-02-23 03:43:01|which anime serie...|[removed]\n\n[vie...|           0|             0|  false|    1|   false| lq8all|  2021-02-23|           3|           3|            2|        2021|which anime serie...|              8|removed view poll...|                 3|
|  pokemon|     superharry24|                null|2021-02-23 03:44:19|favorite series p...|[removed]\n\n[vie...|           0|             0|  false|    1|   false| lq8bfn|  2021-02-23|           3|           3|            2|        2021|favorite series p...|              4|removed view poll...|                 3|
|  pokemon|     superharry24|                null|2021-02-23 03:48:55|what is your favo...|just wondering wh...|           5|             0|  false|    5|   false| lq8ek4|  2021-02-23|           3|           3|            2|        2021|what is your favo...|              7|just wondering wh...|                73|
|  pokemon|     superharry24|                null|2021-02-23 03:49:48|what is your favo...|just wondering wh...|           4|             0|  false|    3|   false| lq8f5t|  2021-02-23|           3|           3|            2|        2021|what is your favo...|              7|just wondering wh...|                73|
|  pokemon|         PCN24454|                null|2021-02-23 04:21:44|what are your big...|what's something ...|           4|             0|  false|    3|   false| lq91fh|  2021-02-23|           4|           3|            2|        2021|what are your big...|              8|whats something t...|               133|
|  pokemon|          Kato756|                null|2021-02-23 02:57:25|did you guys see ...|so i guess we wil...|           4|             0|  false|    2|   false| lq7egb|  2021-02-23|           2|           3|            2|        2021|did you guys see ...|             20|so i guess we wil...|                54|
|  pokemon|  SnowPhoenix9999|                 ...|2021-02-23 05:00:19|[rebuild tuesday]...|hello everyone!\n...|           8|             0|  false|    1|   false| lq7iq8|  2021-02-23|           5|           3|            2|        2021|rebuild tuesday r...|              3|hello everyone we...|               364|
+---------+-----------------+--------------------+-------------------+--------------------+--------------------+------------+--------------+-------+-----+--------+-------+------------+------------+------------+-------------+------------+--------------------+---------------+--------------------+------------------+
only showing top 20 rows
com_cleaned.show()
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|subreddit|              author|   author_flair_text|        created_utc|                body|controversiality|score| parent_id|stickied|  link_id|     id|created_date|created_hour|created_week|created_month|created_year|             cleaned|body_wordCount|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
|  pokemon|            KatonRyu|                null|2021-06-23 08:33:18|i still play like...|               0|    1| t3_o5h6hg|   false|t3_o5h6hg|h2qxnm1|  2021-06-23|           8|           4|            6|        2021|i still play like...|            28|
|  pokemon|        purejackbaby|                null|2021-06-23 08:34:09|could you like, g...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qxpjq|  2021-06-23|           8|           4|            6|        2021|could you like go...|            28|
|  pokemon|Nothingyoudoisawaste|                null|2021-06-23 08:34:16|it felt forced - ...|               0|    2|t1_gzxgl8w|   false|t3_nnyg0q|h2qxpu5|  2021-06-23|           8|           4|            6|        2021|it felt forced i ...|           161|
|  pokemon|              PuxOf_|                null|2021-06-23 08:34:19|doing challenge m...|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qxpxh|  2021-06-23|           8|           4|            6|        2021|doing challenge m...|             9|
|  pokemon|        ProbablyPuma|       I am Rain Boi|2021-06-23 08:37:47|yall its supposed...|               0|    1| t3_o66cwy|   false|t3_o66cwy|h2qxxrd|  2021-06-23|           8|           4|            6|        2021|yall its supposed...|             7|
|  pokemon|        Fishsticks03|:gm: Avenge the F...|2021-06-23 08:38:24|         demon prawn|               0|    1| t3_o5myo5|   false|t3_o5myo5|h2qxz3e|  2021-06-23|           8|           4|            6|        2021|         demon prawn|             2|
|  pokemon|     DeadpoolCroatia|                null|2021-06-23 08:40:30|miltank was never...|               0|    1| t3_o5j81l|   false|t3_o5j81l|h2qy3pv|  2021-06-23|           8|           4|            6|        2021|miltank was never...|            21|
|  pokemon|           lofi_rico|                null|2021-06-23 08:40:43|i can't believe n...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qy479|  2021-06-23|           8|           4|            6|        2021|i cant believe ni...|            10|
|  pokemon|Aggravating-Public15|                null|2021-06-23 08:41:25|i guess that’s tr...|               0|    1|t1_h2pynet|   false|t3_o589jq|h2qy5nj|  2021-06-23|           8|           4|            6|        2021|i guess thats tru...|            37|
|  pokemon|    ElGatoGuerrero72|                null|2021-06-23 08:42:15|i feel like steph...|               0|    9| t3_o65117|   false|t3_o65117|h2qy7d7|  2021-06-23|           8|           4|            6|        2021|i feel like steph...|            18|
|  pokemon|          Long__Jump|                null|2021-06-23 08:42:45|shiny bibarel as ...|               0|    4| t3_o5wra5|   false|t3_o5wra5|h2qy8d4|  2021-06-23|           8|           4|            6|        2021|shiny bibarel as ...|            13|
|  pokemon|         Aryallie_18|                null|2021-06-23 08:43:25|these are adorabl...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qy9tl|  2021-06-23|           8|           4|            6|        2021|these are adorabl...|             9|
|  pokemon|           Jankufood|                null|2021-06-23 08:44:46|if the original p...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycm4|  2021-06-23|           8|           4|            6|        2021|if the original p...|             8|
|  pokemon|          Snoo_58305|                null|2021-06-23 08:44:47|that’s really ama...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycnm|  2021-06-23|           8|           4|            6|        2021|thats really amaz...|            55|
|  pokemon|    Bringbackdigimon|                null|2021-06-23 08:44:51|don’t give gamefr...|               0|    1| t3_o63atm|   false|t3_o63atm|h2qycs5|  2021-06-23|           8|           4|            6|        2021|dont give gamefre...|             4|
|  pokemon|Aggravating-Public15|                null|2021-06-23 08:44:55|i loved gen 5 lol...|               0|    4| t3_o67fvr|   false|t3_o67fvr|h2qycy3|  2021-06-23|           8|           4|            6|        2021|i loved gen 5 lol...|            42|
|  pokemon|     MoonLightScreen|WHY WASN'T I IN MOON|2021-06-23 08:45:49|well, naturally. ...|               0|    3| t3_o64dri|   false|t3_o64dri|h2qyeuv|  2021-06-23|           8|           4|            6|        2021|well naturally wu...|            20|
|  pokemon|         GodOfYoshis|                null|2021-06-23 08:47:17|i mean it's the i...|               0|    1|t1_h2q71pq|   false|t3_o5pukl|h2qyhx3|  2021-06-23|           8|           4|            6|        2021|i mean its the in...|            20|
|  pokemon|        GrandWar4675|                null|2021-06-23 08:47:53|            pokimane|               0|   15|t1_h2qq5rm|   false|t3_o63atm|h2qyj7x|  2021-06-23|           8|           4|            6|        2021|            pokimane|             1|
|  pokemon|          bosswilliz|                null|2021-06-23 08:47:54|    i want them all!|               0|    1| t3_o63atm|   false|t3_o63atm|h2qyj8t|  2021-06-23|           8|           4|            6|        2021|     i want them all|             4|
+---------+--------------------+--------------------+-------------------+--------------------+----------------+-----+----------+--------+---------+-------+------------+------------+------------+-------------+------------+--------------------+--------------+
only showing top 20 rows

Save cleaned data

datastore = 'azureml://datastores/workspaceblobstore/paths'

# https://<STORAGE-ACCOUNT>.blob.core.windows.net/<CONTAINER-NAME>
Storage_URI="https://group09astorage08f5ea16c.blob.core.windows.net/azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"

workspace_default_storage_account = "group09astorage08f5ea16c"
workspace_default_container = "azureml-blobstore-600c08e7-3c4d-4e17-a310-86a7327468a9"

workspace_wasbs_base_url = (
    f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/")
sub_cleaned.write.parquet(f"{workspace_wasbs_base_url}pokemon_sub_cleaned.parquet")  # base URL already ends with "/"
com_cleaned.write.parquet(f"{workspace_wasbs_base_url}pokemon_com_cleaned.parquet")
datastore = 'azureml://datastores/workspaceblobstore/paths'

# The saved copies can be read back through the AzureML datastore:
submissions = spark.read.parquet(f"{datastore}/pokemon_sub_cleaned.parquet")
comments = spark.read.parquet(f"{datastore}/pokemon_com_cleaned.parquet")
# For this session, keep working with the already-materialized DataFrames.
submissions = sub_cleaned
comments = com_cleaned

Data Integration and Preparation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.ticker import FuncFormatter
import plotly.graph_objects as go
import plotly.subplots as sp
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
from plotly.subplots import make_subplots
import pyspark.sql.types as T
stocks = pd.read_csv("Users/hw487/fall-2023-reddit-project-team-09/data/csv/stocks.csv")
stocks
Unnamed: 0 NTDOY SONY TYO Date
0 0 15.740000 100.070000 7.735437 2021-01-04
1 1 16.176001 103.110001 7.793598 2021-01-05
2 2 15.738000 101.080002 7.968082 2021-01-06
3 3 15.802000 102.000000 8.026241 2021-01-07
4 4 16.010000 103.989998 8.113484 2021-01-08
... ... ... ... ... ...
560 560 9.650000 86.639999 11.965896 2023-03-27
561 561 9.610000 85.820000 11.985530 2023-03-28
562 562 9.770000 87.870003 12.044427 2023-03-29
563 563 9.630000 89.309998 12.054242 2023-03-30
564 564 9.690000 90.650002 11.818654 2023-03-31

565 rows × 5 columns

from pyspark.sql.functions import to_date, year, month, dayofweek, count, avg
# Convert Pandas DataFrame to Spark DataFrame
stocks_df = spark.createDataFrame(stocks)

# Convert 'Date' column to datetime format in stocks DataFrame
stocks_df = stocks_df.withColumn("Date", to_date("Date", "yyyy-MM-dd"))

Extract Meaningful Features

# For the stocks DataFrame
stocks_df = stocks_df.withColumn("Year", year("Date"))
stocks_df = stocks_df.withColumn("Month", month("Date"))
stocks_df = stocks_df.withColumn("DayOfWeek", dayofweek("Date"))

# Similarly for submissions and comments DataFrames
submissions = submissions.withColumn("Year", year("created_date"))
submissions = submissions.withColumn("Month", month("created_date"))
submissions = submissions.withColumn("DayOfWeek", dayofweek("created_date"))

comments = comments.withColumn("Year", year("created_date"))
comments = comments.withColumn("Month", month("created_date"))
comments = comments.withColumn("DayOfWeek", dayofweek("created_date"))
from pyspark.sql.functions import weekofyear

# Add week of year to the stocks DataFrame
stocks_df = stocks_df.withColumn("WeekOfYear", weekofyear("Date"))

# Add week of year to submissions and comments DataFrames
submissions = submissions.withColumn("WeekOfYear", weekofyear("created_date"))
comments = comments.withColumn("WeekOfYear", weekofyear("created_date"))
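Note that weekofyear follows the ISO-8601 week calendar, so dates near January 1 can fall into week 52 or 53 of the adjacent year while year() still reports the calendar year; this mismatch is what produces the (WeekOfYear = 52, Year = 2023) row in the weekly aggregate below. A hedged alternative (the WeekStart column and submissions_wk variable are illustrative names) keys each week on the Monday that starts it:

from pyspark.sql.functions import date_trunc, to_date

# Hedged sketch: label each row with the Monday starting its ISO week,
# avoiding mismatched (Year, WeekOfYear) pairs around the new year.
submissions_wk = submissions.withColumn(
    "WeekStart", to_date(date_trunc("week", "created_date"))
)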

Data Aggregation and Merging

from pyspark.sql.functions import sum as sum_agg  # Renaming to avoid confusion with Python's built-in sum()
from pyspark.sql.functions import mean, max as _max, percentile_approx

# Aggregate Submissions by Week
submissions_agg_weekly = submissions.groupBy("WeekOfYear", "Year").agg(
    count("*").alias("total_submissions"),
    mean("score").alias("avg_submission_score"),
    sum_agg("num_comments").alias("total_submission_comments"),
    mean("num_comments").alias("avg_num_comments"),
    percentile_approx("num_comments", 0.5).alias("median_num_comments"),
    _max("num_comments").alias("max_num_comments"),
    sum_agg("num_crossposts").alias("total_submission_crossposts"),
    mean("num_crossposts").alias("avg_num_crossposts"),
    percentile_approx("num_crossposts", 0.5).alias("median_num_crossposts"),
    _max("num_crossposts").alias("max_num_crossposts"),
    mean("title_wordCount").alias("avg_title_wordCount"),
    mean("selftext_wordCount").alias("avg_selftext_wordCount")
).toPandas()  # the weekly aggregate is small (one row per week), so collecting to the driver is safe
submissions_agg_weekly
WeekOfYear Year total_submissions avg_submission_score total_submission_comments avg_num_comments median_num_comments max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount
0 8 2023 393 50.684478 14381 36.592875 5 1531 8 0.020356 0 3 8.582697 133.150127
1 8 2021 2259 99.024347 55402 24.525011 7 6134 21 0.009296 0 2 9.321824 147.490040
2 9 2023 692 40.748555 14680 21.213873 5 1063 5 0.007225 0 1 8.774566 136.365607
3 48 2022 1690 133.173373 59718 35.336095 8 3096 12 0.007101 0 3 8.676923 141.889941
4 13 2023 375 108.197333 7154 19.077333 5 912 7 0.018667 0 2 8.645333 131.466667
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
114 32 2021 505 93.233663 16866 33.398020 6 2323 5 0.009901 0 1 9.017822 127.942574
115 46 2021 1268 93.505521 43890 34.613565 7 5328 21 0.016562 0 3 8.485804 103.922713
116 45 2021 632 71.420886 26424 41.810127 7 2963 6 0.009494 0 1 9.121835 146.132911
117 42 2021 431 73.563805 18299 42.457077 7 6973 9 0.020882 0 3 8.470998 133.287703
118 44 2021 417 87.009592 18070 43.333333 7 1707 4 0.009592 0 2 8.666667 144.973621

119 rows × 14 columns

submissions_agg_weekly['Year'] = submissions_agg_weekly['Year'].astype(int)
submissions_agg_weekly['WeekOfYear'] = submissions_agg_weekly['WeekOfYear'].astype(int)
submissions_agg_weekly = submissions_agg_weekly.sort_values(by=['Year', 'WeekOfYear'])
submissions_agg_weekly
WeekOfYear Year total_submissions avg_submission_score total_submission_comments avg_num_comments median_num_comments max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount
44 1 2021 603 78.381426 9138 15.154229 4 1247 4 0.006633 0 2 8.334992 157.920398
71 2 2021 567 62.342152 9780 17.248677 6 958 8 0.014109 0 2 8.871252 155.313933
38 3 2021 616 56.082792 10879 17.660714 5 1022 5 0.008117 0 1 8.659091 159.055195
22 4 2021 619 67.316640 10240 16.542811 4 1215 8 0.012924 0 1 8.568659 154.453958
48 5 2021 627 61.556619 9526 15.192982 5 1024 10 0.015949 0 4 8.564593 150.409888
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
61 10 2023 434 51.755760 12594 29.018433 6 2122 3 0.006912 0 1 8.880184 142.140553
5 11 2023 453 118.214128 15130 33.399558 6 2330 10 0.022075 0 5 8.456954 140.823400
28 12 2023 539 67.541744 10738 19.922078 5 860 41 0.076067 0 24 8.077922 136.018553
4 13 2023 375 108.197333 7154 19.077333 5 912 7 0.018667 0 2 8.645333 131.466667
37 52 2023 118 268.491525 7055 59.788136 8 1380 1 0.008475 0 1 8.593220 155.067797

119 rows × 14 columns

submissions_agg_weekly.to_csv("submissions_agg_weekly.csv", index=False)
# Aggregate Submissions by Date
submissions_agg = submissions.groupBy("created_date").agg(
    count("*").alias("total_submissions"),
    mean("score").alias("avg_submission_score"),
    sum_agg("num_comments").alias("total_submission_comments"),
    mean("num_comments").alias("avg_num_comments"),
    percentile_approx("num_comments", 0.5).alias("median_num_comments"),
    _max("num_comments").alias("max_num_comments"),
    sum_agg("num_crossposts").alias("total_submission_crossposts"),
    mean("num_crossposts").alias("avg_num_crossposts"),
    percentile_approx("num_crossposts", 0.5).alias("median_num_crossposts"),
    _max("num_crossposts").alias("max_num_crossposts"),
    mean("title_wordCount").alias("avg_title_wordCount"),
    mean("selftext_wordCount").alias("avg_selftext_wordCount")
).toPandas()
submissions_agg
created_date total_submissions avg_submission_score total_submission_comments avg_num_comments median_num_comments max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount
0 2021-01-11 85 6.929412 1577 18.552941 4 958 0 0.000000 0 0 8.717647 142.870588
1 2021-01-12 63 3.714286 365 5.793651 5 26 1 0.015873 0 1 7.619048 134.174603
2 2021-01-15 107 7.813084 1128 10.542056 5 84 0 0.000000 0 0 9.672897 131.700935
3 2021-01-20 81 3.283951 584 7.209877 4 51 0 0.000000 0 0 9.790123 132.222222
4 2021-01-24 98 178.377551 3482 35.530612 7 841 1 0.010204 0 1 8.530612 152.877551
... ... ... ... ... ... ... ... ... ... ... ... ... ...
815 2023-03-15 62 487.145161 2473 39.887097 7 1783 6 0.096774 0 5 8.048387 99.983871
816 2023-03-18 71 110.169014 5432 76.507042 8 2330 1 0.014085 0 1 8.732394 128.929577
817 2023-03-24 66 89.015152 911 13.803030 5 272 24 0.363636 0 24 7.257576 126.484848
818 2023-03-26 95 87.873684 3653 38.452632 8 860 3 0.031579 0 1 8.747368 129.842105
819 2023-03-27 112 16.241071 1740 15.535714 4 789 1 0.008929 0 1 8.732143 110.633929

820 rows × 13 columns

submissions_agg['created_date'] = pd.to_datetime(submissions_agg['created_date'])
submissions_agg = submissions_agg.sort_values(by='created_date')
submissions_agg
created_date total_submissions avg_submission_score total_submission_comments avg_num_comments median_num_comments max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount
542 2021-01-01 52 12.576923 484 9.307692 4 69 0 0.000000 0 0 8.711538 148.692308
543 2021-01-02 109 196.844037 2714 24.899083 5 453 6 0.055046 0 3 8.256881 135.376147
357 2021-01-03 72 125.263889 1572 21.833333 8 483 2 0.027778 0 2 10.083333 119.805556
544 2021-01-04 105 65.438095 2332 22.209524 5 1043 0 0.000000 0 0 8.980952 140.961905
545 2021-01-05 75 3.853333 414 5.520000 4 29 1 0.013333 0 1 7.253333 177.533333
... ... ... ... ... ... ... ... ... ... ... ... ... ...
819 2023-03-27 112 16.241071 1740 15.535714 4 789 1 0.008929 0 1 8.732143 110.633929
540 2023-03-28 78 31.102564 1095 14.038462 4 559 0 0.000000 0 0 9.653846 176.423077
179 2023-03-29 61 190.655738 1519 24.901639 6 721 1 0.016393 0 1 7.983607 110.180328
541 2023-03-30 64 54.234375 829 12.953125 5 182 3 0.046875 0 2 7.953125 116.828125
356 2023-03-31 60 353.800000 1971 32.850000 7 912 2 0.033333 0 1 8.583333 149.166667

820 rows × 13 columns

submissions_agg.to_csv("submissions_agg_daily.csv", index=False)
# Aggregate Comments by Week
comments_agg_weekly = comments.groupBy("WeekOfYear", "Year").agg(
    count("*").alias("total_comments"),
    mean("score").alias("avg_comment_score"),
    mean("body_wordCount").alias("avg_body_wordCount")
).toPandas()
comments_agg_weekly['Year'] = comments_agg_weekly['Year'].astype(int)
comments_agg_weekly['WeekOfYear'] = comments_agg_weekly['WeekOfYear'].astype(int)
comments_agg_weekly = comments_agg_weekly.sort_values(by=['Year', 'WeekOfYear'])
comments_agg_weekly.to_csv("comments_agg_weekly.csv", index=False)
# Aggregate Comments by Date
comments_agg = comments.groupBy("created_date").agg(
    count("*").alias("total_comments"),
    mean("score").alias("avg_comment_score"),
    mean("body_wordCount").alias("avg_body_wordCount")
).toPandas()
comments_agg
created_date total_comments avg_comment_score avg_body_wordCount
0 2023-03-29 3925 7.854777 23.865478
1 2023-03-21 5489 11.155766 26.962653
2 2023-03-20 4703 10.543270 28.540931
3 2021-03-30 4261 6.230462 23.666041
4 2023-02-23 5019 9.108189 24.217972
... ... ... ... ...
814 2021-11-07 6859 7.212422 29.889634
815 2021-09-26 6537 8.915405 27.742389
816 2021-01-29 3966 7.781140 21.598084
817 2021-10-02 6293 10.830288 23.274432
818 2021-10-30 3843 12.745511 25.692428

819 rows × 4 columns

comments_agg['created_date'] = pd.to_datetime(comments_agg['created_date'])
comments_agg = comments_agg.sort_values(by='created_date')
comments_agg
created_date total_comments avg_comment_score avg_body_wordCount
714 2021-01-01 3746 7.749867 19.772824
686 2021-01-02 2858 10.153254 26.478307
338 2021-01-03 2827 5.977361 37.826318
615 2021-01-04 4932 7.981144 25.933496
632 2021-01-05 5204 9.958301 21.860300
... ... ... ... ...
581 2023-03-27 5130 7.277778 28.739376
437 2023-03-28 6712 8.297527 28.252980
0 2023-03-29 3925 7.854777 23.865478
295 2023-03-30 3623 8.449903 20.830803
159 2023-03-31 4575 10.321967 24.299891

819 rows × 4 columns

comments_agg.to_csv("comments_agg_daily.csv", index=False)
stocks_df = stocks_df.toPandas()
# Convert the 'Date' column in stocks_df to datetime if it's not already
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'])
stocks_df = stocks_df.sort_values(by='Date')
# Merge Submissions Aggregate with Stocks Data
combined_df = pd.merge(stocks_df, submissions_agg, left_on='Date', right_on='created_date', how='left')

# Merge Comments Aggregate with Combined Data
combined_df = pd.merge(combined_df, comments_agg, left_on='Date', right_on='created_date', how='left')
combined_df
Unnamed: 0 NTDOY SONY TYO Date Year Month DayOfWeek WeekOfYear created_date_x ... total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount created_date_y total_comments avg_comment_score avg_body_wordCount
0 0 15.740000 100.070000 7.735437 2021-01-04 2021 1 2 1 2021-01-04 ... 0 0.000000 0 0 8.980952 140.961905 2021-01-04 4932.0 7.981144 25.933496
1 1 16.176001 103.110001 7.793598 2021-01-05 2021 1 3 1 2021-01-05 ... 1 0.013333 0 1 7.253333 177.533333 2021-01-05 5204.0 9.958301 21.860300
2 2 15.738000 101.080002 7.968082 2021-01-06 2021 1 4 1 2021-01-06 ... 1 0.012987 0 1 8.571429 153.428571 2021-01-06 4385.0 6.350741 21.136602
3 3 15.802000 102.000000 8.026241 2021-01-07 2021 1 5 1 2021-01-07 ... 0 0.000000 0 0 7.952381 147.063492 2021-01-07 4923.0 8.251270 18.427382
4 4 16.010000 103.989998 8.113484 2021-01-08 2021 1 6 1 2021-01-08 ... 2 0.022222 0 2 8.211111 147.822222 2021-01-08 5072.0 8.819598 21.948935
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 560 9.650000 86.639999 11.965896 2023-03-27 2023 3 2 13 2023-03-27 ... 1 0.008929 0 1 8.732143 110.633929 2023-03-27 5130.0 7.277778 28.739376
561 561 9.610000 85.820000 11.985530 2023-03-28 2023 3 3 13 2023-03-28 ... 0 0.000000 0 0 9.653846 176.423077 2023-03-28 6712.0 8.297527 28.252980
562 562 9.770000 87.870003 12.044427 2023-03-29 2023 3 4 13 2023-03-29 ... 1 0.016393 0 1 7.983607 110.180328 2023-03-29 3925.0 7.854777 23.865478
563 563 9.630000 89.309998 12.054242 2023-03-30 2023 3 5 13 2023-03-30 ... 3 0.046875 0 2 7.953125 116.828125 2023-03-30 3623.0 8.449903 20.830803
564 564 9.690000 90.650002 11.818654 2023-03-31 2023 3 6 13 2023-03-31 ... 2 0.033333 0 1 8.583333 149.166667 2023-03-31 4575.0 10.321967 24.299891

565 rows × 26 columns

# Drop the duplicate 'created_date' columns produced by the merges
combined_df = combined_df.drop(columns=['created_date_x', 'created_date_y'])

# Handling missing values
combined_df = combined_df.fillna({
    'total_submissions': 0, 'avg_submission_score': 0, 'total_submission_comments': 0,
    'total_submission_crossposts': 0, 'avg_num_comments': 0, 'median_num_comments': 0,
    'max_num_comments': 0, 'avg_num_crossposts': 0, 'median_num_crossposts': 0,
    'max_num_crossposts': 0, 'avg_title_wordCount': 0, 'avg_selftext_wordCount': 0,
    'total_comments': 0, 'avg_comment_score': 0, 'avg_body_wordCount': 0
})
combined_df
Unnamed: 0 NTDOY SONY TYO Date Year Month DayOfWeek WeekOfYear total_submissions ... max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount total_comments avg_comment_score avg_body_wordCount
0 0 15.740000 100.070000 7.735437 2021-01-04 2021 1 2 1 105 ... 1043 0 0.000000 0 0 8.980952 140.961905 4932.0 7.981144 25.933496
1 1 16.176001 103.110001 7.793598 2021-01-05 2021 1 3 1 75 ... 29 1 0.013333 0 1 7.253333 177.533333 5204.0 9.958301 21.860300
2 2 15.738000 101.080002 7.968082 2021-01-06 2021 1 4 1 77 ... 23 1 0.012987 0 1 8.571429 153.428571 4385.0 6.350741 21.136602
3 3 15.802000 102.000000 8.026241 2021-01-07 2021 1 5 1 63 ... 22 0 0.000000 0 0 7.952381 147.063492 4923.0 8.251270 18.427382
4 4 16.010000 103.989998 8.113484 2021-01-08 2021 1 6 1 90 ... 66 2 0.022222 0 2 8.211111 147.822222 5072.0 8.819598 21.948935
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 560 9.650000 86.639999 11.965896 2023-03-27 2023 3 2 13 112 ... 789 1 0.008929 0 1 8.732143 110.633929 5130.0 7.277778 28.739376
561 561 9.610000 85.820000 11.985530 2023-03-28 2023 3 3 13 78 ... 559 0 0.000000 0 0 9.653846 176.423077 6712.0 8.297527 28.252980
562 562 9.770000 87.870003 12.044427 2023-03-29 2023 3 4 13 61 ... 721 1 0.016393 0 1 7.983607 110.180328 3925.0 7.854777 23.865478
563 563 9.630000 89.309998 12.054242 2023-03-30 2023 3 5 13 64 ... 182 3 0.046875 0 2 7.953125 116.828125 3623.0 8.449903 20.830803
564 564 9.690000 90.650002 11.818654 2023-03-31 2023 3 6 13 60 ... 912 2 0.033333 0 1 8.583333 149.166667 4575.0 10.321967 24.299891

565 rows × 24 columns

# Merge Submissions Aggregate with Stocks Data
combined_df_weekly = pd.merge(stocks_df, submissions_agg_weekly, on=['Year', 'WeekOfYear'], how='left')

# Merge Comments Aggregate with Combined Data
combined_df_weekly = pd.merge(combined_df_weekly, comments_agg_weekly, on=['Year', 'WeekOfYear'], how='left')

# Handling missing values
combined_df_weekly.fillna({
    'total_submissions': 0, 'avg_submission_score': 0, 'total_submission_comments': 0,
    'total_submission_crossposts': 0, 'avg_num_comments': 0, 'median_num_comments': 0,
    'max_num_comments': 0, 'avg_num_crossposts': 0, 'median_num_crossposts': 0,
    'max_num_crossposts': 0, 'avg_title_wordCount': 0, 'avg_selftext_wordCount': 0,
    'total_comments': 0, 'avg_comment_score': 0, 'avg_body_wordCount': 0
}, inplace=True)
import pandas as pd

# Calculate the start of each week (Monday, since dt.dayofweek is 0 for Monday)
combined_df['WeekStartDate'] = combined_df['Date'] - pd.to_timedelta(combined_df['Date'].dt.dayofweek, unit='d')
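A quick hedged check that the shift above behaves as intended (every WeekStartDate should be a Monday):

# In pandas, dt.dayofweek == 0 means Monday.
assert (combined_df['WeekStartDate'].dt.dayofweek == 0).all()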
combined_df
Unnamed: 0 NTDOY SONY TYO Date Year Month DayOfWeek WeekOfYear total_submissions ... total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount total_comments avg_comment_score avg_body_wordCount WeekStartDate
0 0 15.740000 100.070000 7.735437 2021-01-04 2021 1 2 1 105 ... 0 0.000000 0 0 8.980952 140.961905 4932.0 7.981144 25.933496 2021-01-04
1 1 16.176001 103.110001 7.793598 2021-01-05 2021 1 3 1 75 ... 1 0.013333 0 1 7.253333 177.533333 5204.0 9.958301 21.860300 2021-01-04
2 2 15.738000 101.080002 7.968082 2021-01-06 2021 1 4 1 77 ... 1 0.012987 0 1 8.571429 153.428571 4385.0 6.350741 21.136602 2021-01-04
3 3 15.802000 102.000000 8.026241 2021-01-07 2021 1 5 1 63 ... 0 0.000000 0 0 7.952381 147.063492 4923.0 8.251270 18.427382 2021-01-04
4 4 16.010000 103.989998 8.113484 2021-01-08 2021 1 6 1 90 ... 2 0.022222 0 2 8.211111 147.822222 5072.0 8.819598 21.948935 2021-01-04
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 560 9.650000 86.639999 11.965896 2023-03-27 2023 3 2 13 112 ... 1 0.008929 0 1 8.732143 110.633929 5130.0 7.277778 28.739376 2023-03-27
561 561 9.610000 85.820000 11.985530 2023-03-28 2023 3 3 13 78 ... 0 0.000000 0 0 9.653846 176.423077 6712.0 8.297527 28.252980 2023-03-27
562 562 9.770000 87.870003 12.044427 2023-03-29 2023 3 4 13 61 ... 1 0.016393 0 1 7.983607 110.180328 3925.0 7.854777 23.865478 2023-03-27
563 563 9.630000 89.309998 12.054242 2023-03-30 2023 3 5 13 64 ... 3 0.046875 0 2 7.953125 116.828125 3623.0 8.449903 20.830803 2023-03-27
564 564 9.690000 90.650002 11.818654 2023-03-31 2023 3 6 13 60 ... 2 0.033333 0 1 8.583333 149.166667 4575.0 10.321967 24.299891 2023-03-27

565 rows × 25 columns

combined_df = combined_df.drop(columns=['Unnamed: 0'])  # drop the residual CSV index column
combined_df
NTDOY SONY TYO Date Year Month DayOfWeek WeekOfYear total_submissions avg_submission_score ... total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount total_comments avg_comment_score avg_body_wordCount WeekStartDate
0 15.740000 100.070000 7.735437 2021-01-04 2021 1 2 1 105 65.438095 ... 0 0.000000 0 0 8.980952 140.961905 4932.0 7.981144 25.933496 2021-01-04
1 16.176001 103.110001 7.793598 2021-01-05 2021 1 3 1 75 3.853333 ... 1 0.013333 0 1 7.253333 177.533333 5204.0 9.958301 21.860300 2021-01-04
2 15.738000 101.080002 7.968082 2021-01-06 2021 1 4 1 77 3.090909 ... 1 0.012987 0 1 8.571429 153.428571 4385.0 6.350741 21.136602 2021-01-04
3 15.802000 102.000000 8.026241 2021-01-07 2021 1 5 1 63 3.238095 ... 0 0.000000 0 0 7.952381 147.063492 4923.0 8.251270 18.427382 2021-01-04
4 16.010000 103.989998 8.113484 2021-01-08 2021 1 6 1 90 19.055556 ... 2 0.022222 0 2 8.211111 147.822222 5072.0 8.819598 21.948935 2021-01-04
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
560 9.650000 86.639999 11.965896 2023-03-27 2023 3 2 13 112 16.241071 ... 1 0.008929 0 1 8.732143 110.633929 5130.0 7.277778 28.739376 2023-03-27
561 9.610000 85.820000 11.985530 2023-03-28 2023 3 3 13 78 31.102564 ... 0 0.000000 0 0 9.653846 176.423077 6712.0 8.297527 28.252980 2023-03-27
562 9.770000 87.870003 12.044427 2023-03-29 2023 3 4 13 61 190.655738 ... 1 0.016393 0 1 7.983607 110.180328 3925.0 7.854777 23.865478 2023-03-27
563 9.630000 89.309998 12.054242 2023-03-30 2023 3 5 13 64 54.234375 ... 3 0.046875 0 2 7.953125 116.828125 3623.0 8.449903 20.830803 2023-03-27
564 9.690000 90.650002 11.818654 2023-03-31 2023 3 6 13 60 353.800000 ... 2 0.033333 0 1 8.583333 149.166667 4575.0 10.321967 24.299891 2023-03-27

565 rows × 24 columns

combined_df.to_csv("combined_ml_daily.csv", index=False)
# Group by week and calculate sum and mean
agg_columns = {
    'total_submissions': 'sum',
    'total_comments': 'sum',
    'NTDOY': 'mean',
    'SONY': 'mean',
    'TYO': 'mean',
}
weekly_df = combined_df.groupby(pd.Grouper(key='WeekStartDate', freq='W')).agg(agg_columns).reset_index()
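Since every WeekStartDate is already a Monday, pd.Grouper(freq='W') mostly relabels each group by the Sunday that closes the week (the default 'W' anchor is W-SUN), which is why the dates in the weekly table below are Sundays. A hedged, simpler alternative (weekly_df_alt is an illustrative name) keeps the Monday labels:

# Hedged sketch: group directly on the Monday week label instead of re-binning by frequency.
weekly_df_alt = combined_df.groupby('WeekStartDate', as_index=False).agg(agg_columns)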

Exploratory Data Analysis

weekly_df
WeekStartDate total_submissions total_comments NTDOY SONY TYO
0 2021-01-10 410 24516.0 15.8932 102.050000 7.927368
1 2021-01-17 402 28040.0 15.8692 103.445999 8.154197
2 2021-01-24 309 20095.0 15.6015 102.195002 8.118330
3 2021-01-31 408 22147.0 14.9092 97.870000 7.950633
4 2021-02-07 435 22889.0 15.0080 107.333998 8.107668
... ... ... ... ... ... ...
112 2023-03-05 512 42123.0 9.3940 83.927998 13.407248
113 2023-03-12 283 25753.0 9.4020 86.634000 13.191847
114 2023-03-19 294 24599.0 9.4860 85.529999 12.037221
115 2023-03-26 376 23007.0 9.5700 88.206000 11.796666
116 2023-04-02 375 23965.0 9.6700 88.058000 11.973750

117 rows × 6 columns

weekly_df.to_csv("combined_ml_pokemon_weekly.csv", index=False)

EDA: Number of Comments and Submissions in r/pokemon with Stock Prices by Week

import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Create subplots with a 2x1 grid
fig = make_subplots(rows=2, cols=1, specs=[[{"secondary_y": True}],[{"secondary_y": True}]])

# Add traces for the first subplot (Submissions and Stock Prices)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['total_submissions'], marker_color='#d13a47', opacity=.65, name="Submissions"), row=1, col=1, secondary_y=False)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['NTDOY'], marker_color='#f7c200', opacity=.65, name="NTDOY"), row=1, col=1, secondary_y=True)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['TYO'], marker_color='#42a63c', opacity=.65, name="TYO"), row=1, col=1, secondary_y=True)

# Add traces for the second subplot (Comments and Stock Prices)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['total_comments'], marker_color='#42a1b9', opacity=.65, name="Comments"), row=2, col=1, secondary_y=False)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['NTDOY'], marker_color='#f7c200', opacity=.65, name="NTDOY", showlegend=False), row=2, col=1, secondary_y=True)
fig.add_trace(go.Scatter(x=weekly_df['WeekStartDate'], y=weekly_df['TYO'], marker_color='#42a63c', opacity=.65, name="TYO", showlegend=False), row=2, col=1, secondary_y=True)

# Update the y-axis labels
fig.update_yaxes(title_text="Submissions", secondary_y=False, row=1, col=1)
fig.update_yaxes(title_text="Mean Stock Prices", secondary_y=True, row=1, col=1)
fig.update_yaxes(title_text="Comments", secondary_y=False, row=2, col=1)
fig.update_yaxes(title_text="Mean Stock Prices", secondary_y=True, row=2, col=1)

# Update the layout for the whole figure
fig.update_layout(
    title='Weekly comments and submissions in r/pokemon alongside mean stock prices',
    # xaxis={'title': 'Date by week'},
    paper_bgcolor='#FFFFFF', 
    plot_bgcolor='rgba(0,0,0,0)',
)

# Show the figure
fig.show()

Table: Summary Statistics

# Summary statistics
print(combined_df_weekly.describe())
# Summary statistics
combined_df.describe()
NTDOY SONY TYO Year Month DayOfWeek WeekOfYear total_submissions avg_submission_score total_submission_comments ... max_num_comments total_submission_crossposts avg_num_crossposts median_num_crossposts max_num_crossposts avg_title_wordCount avg_selftext_wordCount total_comments avg_comment_score avg_body_wordCount
count 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 ... 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000 565.000000
mean 12.071745 96.598000 10.049639 2021.663717 6.070796 4.038938 24.392920 92.021239 31.223070 1759.134513 ... 494.194690 2.132743 0.027103 0.003540 0.902655 8.638694 135.312197 6839.185841 9.737450 24.158624
std 1.817426 15.218937 1.870428 0.665921 3.542840 1.393465 15.428097 64.887297 61.358436 2617.569597 ... 813.161999 6.664057 0.090783 0.059444 1.853522 0.740156 29.318434 3211.173967 2.846059 4.692633
min 9.310000 63.169998 7.735437 2021.000000 1.000000 2.000000 1.000000 32.000000 1.920000 216.000000 ... 18.000000 0.000000 0.000000 0.000000 0.000000 6.333333 59.706897 0.000000 0.000000 0.000000
25% 10.644000 85.230003 8.443065 2021.000000 3.000000 3.000000 10.000000 60.000000 3.660377 554.000000 ... 54.000000 0.000000 0.000000 0.000000 0.000000 8.146667 116.204545 4844.000000 7.762292 21.451534
50% 11.550000 98.250000 9.063451 2022.000000 6.000000 4.000000 23.000000 76.000000 6.578947 859.000000 ... 132.000000 1.000000 0.007092 0.000000 1.000000 8.625000 132.426966 5854.000000 9.380717 23.364365
75% 13.578000 107.080002 11.691803 2022.000000 9.000000 5.000000 38.000000 98.000000 27.741935 1648.000000 ... 695.000000 2.000000 0.020000 0.000000 1.000000 9.080357 148.824742 7965.000000 11.155766 26.304678
max 16.320000 128.589996 14.404585 2023.000000 12.000000 6.000000 52.000000 897.000000 743.078947 31995.000000 ... 7470.000000 72.000000 1.096154 1.000000 29.000000 11.112903 409.982906 33657.000000 23.439061 40.911036

8 rows × 22 columns

combined_statistics_table = combined_df.describe()
# Keep the index so the statistic labels (count, mean, std, ...) survive the export
combined_statistics_table.to_csv("combined_statistics_table.csv", index=True)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 173, Finished, Available)

EDA: Correlation Heatmap

import plotly.figure_factory as ff
import numpy as np

# Calculate correlation matrix
corr_matrix = combined_df.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Apply the mask to the correlation matrix
corr_matrix_masked = corr_matrix.mask(mask)

# Create the heatmap
fig = ff.create_annotated_heatmap(
    z=corr_matrix_masked.to_numpy(),
    x=corr_matrix_masked.columns.tolist(),
    y=corr_matrix_masked.columns.tolist(),
    colorscale='Sunset',
    showscale=True,
    annotation_text=corr_matrix_masked.round(2).to_numpy()
)

# Update the layout
fig.update_layout(
    title='Correlation Heatmap',
    xaxis=dict(tickmode='array', tickvals=np.arange(len(corr_matrix.columns)), ticktext=corr_matrix.columns),
    yaxis=dict(tickmode='array', tickvals=np.arange(len(corr_matrix.columns)), ticktext=corr_matrix.columns),
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='rgba(0,0,0,0)',
)

# Show the figure
fig.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 108, Finished, Available)
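One cosmetic wrinkle in the annotated heatmap above: the masked upper triangle renders as literal 'nan' annotation labels. A small touch-up sketch (my addition, same data and colorscale) blanks those cells:

# Replace 'nan' annotation labels in the masked upper triangle with blanks
z = corr_matrix_masked.to_numpy()
annotations = np.where(np.isnan(z), '', np.round(z, 2).astype(str))
fig = ff.create_annotated_heatmap(
    z=z,
    x=corr_matrix_masked.columns.tolist(),
    y=corr_matrix_masked.columns.tolist(),
    colorscale='Sunset',
    showscale=True,
    annotation_text=annotations,
)
fig.show()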
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Calculate the correlation matrix
corr_matrix = combined_df.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(20, 16))

# Draw the heatmap with the mask
sns.heatmap(corr_matrix, mask=mask, cmap=sns.cubehelix_palette(as_cmap=True), annot=True, fmt=".2f", linewidths=.5, cbar_kws={"shrink": .5})

# Beautify the plot
plt.title('Correlation Heatmap')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 111, Finished, Available)

Model Building

Linear Regression - Predicting ‘NTDOY’ Stock Prices

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Predicting 'NTDOY' stock prices
features = ['total_submissions', 'avg_submission_score', 'total_comments', 'avg_comment_score', 'avg_num_crossposts', 'total_submission_crossposts']

X = combined_df[features]
y = combined_df['NTDOY']
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 139, Finished, Available)
# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 140, Finished, Available)
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 141, Finished, Available)
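A caveat on the two cells above: the scaler was fit on all of X before the chronological split, so test-window statistics leak into the training transform. A minimal leakage-free sketch using sklearn's Pipeline (the variable names here are mine):

from sklearn.pipeline import make_pipeline

# Split first, then let the pipeline fit the scaler on the training window only
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, shuffle=False)
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X_tr, y_tr)
print(f'Leakage-free R-squared on the held-out window: {pipe.score(X_te, y_te)}')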
# Model training
model = LinearRegression()
model.fit(X_train, y_train)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 142, Finished, Available)
LinearRegression()
# Prediction and evaluation
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

# R-squared
r_squared = model.score(X_test, y_test)
print(f'R-squared: {r_squared}')

# AIC
n = len(X_test)
k = X.shape[1] + 1
RSS = np.sum((y_test - y_pred) ** 2)
AIC = n * np.log(RSS/n) + 2 * k
print(f'AIC: {AIC}')
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 143, Finished, Available)
RMSE: 2.2354823027643596
R-squared: -25.95855276387089
AIC: 195.80728204640792

An R-squared this far below zero means the model's squared error on the held-out window is roughly 27 times that of simply predicting the test-window mean, so these Reddit activity features do not track NTDOY price levels out of sample.
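Since R-squared benchmarks the model against the mean of the held-out window, reproducing that baseline explicitly (my addition, not part of the original analysis) shows what the negative value is measuring; from the numbers above, the model's RMSE is about five times the trivial baseline's (2.24 vs. roughly 0.43):

# Constant predictor at the test-window mean: the reference that R-squared uses
baseline_pred = np.full_like(y_test, y_test.mean(), dtype=float)
baseline_rmse = mean_squared_error(y_test, baseline_pred, squared=False)
print(f'Test-mean baseline RMSE: {baseline_rmse}')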
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Prediction and evaluation
y_pred = model.predict(X_test)

# R-squared
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

# AIC for Linear Regression
n = X_test.shape[0]  # Number of observations
k = X_train.shape[1] + 1  # Number of features + 1 for the intercept
RSS = np.sum((y_test - y_pred) ** 2)
AIC = n * np.log(RSS/n) + 2 * k
print(f'AIC: {AIC}')

# RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 144, Finished, Available)
R-squared: -25.95855276387089
AIC: 195.80728204640792
RMSE: 2.2354823027643596
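A single chronological split gives a noisy estimate; a walk-forward cross-validation sketch with sklearn's TimeSeriesSplit (my addition; for brevity it reuses the globally scaled X_scaled, which the pipeline sketch above would avoid):

from sklearn.model_selection import TimeSeriesSplit

# Each fold trains on the past and tests on the immediate future
tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_scaled)):
    lr = LinearRegression().fit(X_scaled[train_idx], y.iloc[train_idx])
    fold_rmse = mean_squared_error(y.iloc[test_idx], lr.predict(X_scaled[test_idx]), squared=False)
    print(f'Fold {fold}: RMSE = {fold_rmse:.3f}')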

ARIMA Time Series Model - Predicting ‘NTDOY’ Stock Prices

from statsmodels.tsa.stattools import adfuller

def check_stationarity(series):
    result = adfuller(series)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))

# Check if 'NTDOY' stock prices are stationary
check_stationarity(combined_df['NTDOY'].dropna())
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 145, Finished, Available)
ADF Statistic: -1.868510
p-value: 0.347017
Critical Values:
    1%: -3.442
    5%: -2.867
    10%: -2.570
from statsmodels.tsa.stattools import adfuller

result = adfuller(combined_df['NTDOY'].dropna())
print('ADF Statistic:', result[0])
print('p-value:', result[1])
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 146, Finished, Available)
ADF Statistic: -1.868510294901256
p-value: 0.34701672354064506

With p ≈ 0.35 we cannot reject the null hypothesis of a unit root at any conventional level, so the NTDOY level series is non-stationary and is first-differenced before fitting ARIMA.
# First-difference the series; pandas index alignment leaves the first row NaN either way
combined_df['NTDOY_diff'] = combined_df['NTDOY'].diff()
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 147, Finished, Available)
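A natural follow-up, not run in the original: re-apply the ADF test to the differenced series to confirm that a single difference is enough.

# Re-test stationarity after differencing, reusing check_stationarity from above;
# a p-value near zero would confirm that d = 1 suffices
check_stationarity(combined_df['NTDOY_diff'].dropna())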
from statsmodels.tsa.arima.model import ARIMA

# 'NTDOY_diff' is the differenced series
# Drop NaN values that were created due to differencing
df_for_arima = combined_df[['NTDOY_diff']].dropna()

# Fit ARIMA with order (p=1, d=1, q=1).
# Note: because the input is already first-differenced, d=1 here differences
# a second time, making this equivalent to ARIMA(1, 2, 1) on the raw prices.
model = ARIMA(df_for_arima, order=(1, 1, 1))
model_fit = model.fit()

# Summary of the model
print(model_fit.summary())
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 168, Finished, Available)
/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:578: ValueWarning:

An unsupported index was provided and will be ignored when e.g. forecasting.
                               SARIMAX Results                                
==============================================================================
Dep. Variable:             NTDOY_diff   No. Observations:                  564
Model:                 ARIMA(1, 1, 1)   Log Likelihood                  87.867
Date:                Mon, 27 Nov 2023   AIC                           -169.735
Time:                        23:38:28   BIC                           -156.735
Sample:                             0   HQIC                          -164.660
                                - 564                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0998      0.037     -2.704      0.007      -0.172      -0.027
ma.L1         -0.9999      0.469     -2.131      0.033      -1.920      -0.080
sigma2         0.0424      0.020      2.126      0.033       0.003       0.081
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):                70.51
Prob(Q):                              0.99   Prob(JB):                         0.00
Heteroskedasticity (H):               0.34   Skew:                            -0.06
Prob(H) (two-sided):                  0.00   Kurtosis:                         4.73
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
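The ma.L1 coefficient sitting at -0.9999 is a classic symptom of over-differencing, consistent with the note above. A minimal alternative sketch, assuming combined_df['Date'] holds daily trading dates (the asfreq/interpolate gap-fill for non-trading days is my assumption): fit on the level series with a proper DatetimeIndex, let d=1 do the differencing internally, and the ValueWarnings disappear while forecasts come back directly in price levels.

# Fit ARIMA(1, 1, 1) on NTDOY levels with a DatetimeIndex
ntdoy_levels = (
    combined_df.set_index('Date')['NTDOY']
    .asfreq('D')        # regular daily frequency; inserts NaN on non-trading days
    .interpolate()      # assumed gap-fill for weekends/holidays
)
level_fit = ARIMA(ntdoy_levels, order=(1, 1, 1)).fit()
print(level_fit.forecast(steps=5))  # five-day-ahead NTDOY price levels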
# Forecasting
forecast = model_fit.forecast(steps=5)  # forecast next 5 periods
forecast
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 169, Finished, Available)
/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:376: ValueWarning:

No supported index is available. Prediction results will be given with an integer index beginning at `start`.
564   -0.017879
565   -0.010103
566   -0.010879
567   -0.010802
568   -0.010810
Name: predicted_mean, dtype: float64
forecast_values = forecast.to_list()  # Convert forecast to a list

# Generate future dates starting from the day after the last date
last_date = combined_df['Date'].max()
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=len(forecast_values))

# Pair forecast values with future dates
forecast_with_dates = pd.Series(data=forecast_values, index=future_dates)
forecast_with_dates
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 170, Finished, Available)
2023-04-01   -0.017879
2023-04-02   -0.010103
2023-04-03   -0.010879
2023-04-04   -0.010802
2023-04-05   -0.010810
Freq: D, dtype: float64
# Get the last actual value
last_actual_value = combined_df['NTDOY'].iloc[-1]

# Cumulatively sum the forecasted differences
cumulative_sum = forecast_with_dates.cumsum()

# Revert the differencing
reverted_forecast = last_actual_value + cumulative_sum

# Now, 'reverted_forecast' contains the forecasted NTDOY prices
print(reverted_forecast)
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 171, Finished, Available)
2023-04-01    9.672121
2023-04-02    9.662018
2023-04-03    9.651138
2023-04-04    9.640336
2023-04-05    9.629527
Freq: D, dtype: float64
from sklearn.metrics import mean_squared_error
from math import sqrt

# Manually entered actual NTDOY prices for the five forecast dates
y_true_values = [9.69, 9.69, 9.74, 10.18, 9.94]
y_true_dates = pd.date_range(start="2023-04-01", end="2023-04-05")
y_true = pd.Series(data=y_true_values, index=y_true_dates)

# Calculate RMSE
rmse = sqrt(mean_squared_error(y_true, reverted_forecast))
print(f'RMSE: {rmse}')
StatementMeta(bdbc6c25-e1fb-4fd1-9ce0-b2c60e73c096, 18, 172, Finished, Available)
RMSE: 0.281648511155487

An RMSE of about 0.28 against actual prices in the $9.69-$10.18 range is under 3% of the price level over the five-day horizon.
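For added context (my calculation, not in the original), the same comparison expressed as a mean absolute percentage error:

# MAPE over the 5-day horizon; y_true and reverted_forecast align by date index
mape = float(np.mean(np.abs((y_true - reverted_forecast) / y_true)) * 100)
print(f'MAPE: {mape:.2f}%')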