# Importing the libraries 
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
from pyspark.sql.functions import length
from pyspark.sql import functions as F


# Location of the filtered submissions parquet data in the workspace blob store
datastore = 'azureml://datastores/workspaceblobstore/paths/'
submissions_path = 'filtered-submissions'
submissions_df = spark.read.parquet(datastore + submissions_path)

# Work with only the columns that the analysis below touches
submission_columns = [
    "subreddit", "author", "title", "selftext",
    "created_utc", "num_comments", "score",
    "over_18", "media", "pinned", "locked",
    "disable_comments", "domain", "hidden",
    "distinguished", "hide_score",
]
df = submissions_df.select(*submission_columns)

# Calculate post length as the combined character count of title and selftext.
# length() returns NULL for a NULL input and NULL + n is NULL, so each part is
# coalesced to 0; otherwise a post missing either field gets a NULL post_length.
df = df.withColumn(
    'post_length',
    F.coalesce(length(df.title), F.lit(0)) + F.coalesce(length(df.selftext), F.lit(0)),
)


# Replace created_utc with a proper timestamp so calendar parts can be extracted from it
df = df.withColumn('created_utc', F.to_timestamp('created_utc'))

# Hour of the day and numeric weekday (1 = Sunday ... 7 = Saturday)
df = (
    df.withColumn('hour_of_day', F.hour('created_utc'))
      .withColumn('day_of_week', F.dayofweek('created_utc'))
)

# Translate the numeric weekday into its name; any other value stays NULL
weekday_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
weekday_name_col = F.when(F.col('day_of_week') == 1, weekday_names[0])
for weekday_number, weekday_name in enumerate(weekday_names[1:], start=2):
    weekday_name_col = weekday_name_col.when(F.col('day_of_week') == weekday_number, weekday_name)
df = df.withColumn('day_of_week_str', weekday_name_col)

# Remaining calendar parts, the media flag, and removal of columns no longer needed
df = (
    df.withColumn('day_of_month', F.dayofmonth('created_utc'))
      .withColumn('month', F.month('created_utc'))
      .withColumn('year', F.year('created_utc'))
      .withColumn('has_media', F.col('media').isNotNull())
      .drop("media", "disable_comments", "distinguished")
)

# Notebook cell output: StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 108, 9, Finished, Available)


# Output folders for plots and CSV extracts, both inside the project's data directory
PLOT_DIR, CSV_DIR = (
    os.path.join("Users/sk2224/fall-2023-reddit-project-team-34/data", leaf)
    for leaf in ("plots", "csv")
)

# Notebook cell output: StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 105, 8, Finished, Available)


# Collect the engagement columns for the three main subreddits into pandas
df_plotly = df.select("subreddit", "num_comments", "score", "has_media", "post_length")
df_plotly = df_plotly.filter(df_plotly["subreddit"].isin(['movies', 'anime', 'television']))
df_plotly_pd = df_plotly.toPandas()

# One fixed colour per subreddit
color_map = dict(movies='#FF4301', anime='#ff9200', television='#ffe100')

# Bubble chart: score on x, post length on y, bubble size driven by comment count
fig = px.scatter(
    df_plotly_pd,
    x='score',
    y='post_length',
    size='num_comments',
    color='subreddit',
    color_discrete_map=color_map,
    title='Engagement Dynamics of Reddit Posts Across Entertainment Subreddits',
    labels={
        'num_comments': 'Number of Comments',
        'score': 'Score',
        'subreddit': 'Subreddit',
        'post_length': 'Post Length',
    },
)

# White canvas and fixed axis windows
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
fig.update_xaxes(range=[0, 50000])
fig.update_yaxes(range=[0, 8000])

# Render, then export as standalone HTML
fig.show()

fig.write_html(PLOT_DIR + "/engagement_eda.html")

# Notebook cell output: StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 48, 9, Finished, Available)


# Cast the flag to text so its values line up with the string keys of the colour map below
df_plotly_pd["has_media"] = df_plotly_pd["has_media"].astype(str)

color_map = {'False': '#FF4301', 'True': '#ffe100'}

# Same bubble chart as before, but coloured by whether the post carries media
fig = px.scatter(
    df_plotly_pd,
    x='score',
    y='post_length',
    size='num_comments',
    size_max=50,  # upper bound on bubble size
    color='has_media',
    color_discrete_map=color_map,
    title='Engagement Dynamics of Reddit Posts with and without Media',
    labels={
        'num_comments': 'Number of Comments',
        'score': 'Score',
        'has_media': 'Has Media',
        'post_length': 'Post Length',
    },
)

# White canvas, centred title, fixed axis windows
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white', title_x=0.5)
fig.update_xaxes(range=[0, 50000])
fig.update_yaxes(range=[0, 8000])

# Render, then export as standalone HTML
fig.show()

fig.write_html("../../data/plots/engagement_with_media.html")


# Count posts for every (subreddit, has_media) pair
grouped_data = (
    df_plotly_pd
    .groupby(['subreddit', 'has_media'])
    .size()
    .reset_index(name='count')
)

# Express each count as a share of its subreddit's total, rounded to two decimals
total_counts = grouped_data.groupby('subreddit')['count'].transform('sum')
grouped_data['percentage'] = (grouped_data['count'] / total_counts * 100).round(2)
grouped_data['percentage_text'] = grouped_data['percentage'].round(2).astype(str) + '%'

color_map = {'False': '#FF4301', 'True': '#ffe100'}

# Side-by-side bars per subreddit, labelled with the percentage text
fig = px.bar(
    grouped_data,
    x='subreddit',
    y='percentage',
    text='percentage_text',
    color='has_media',
    color_discrete_map=color_map,
    barmode='group',
    labels={'percentage': 'Percentage of Posts', 'subreddit': 'Subreddit'},
    title='Percentage of Posts with and without Media per Subreddit',
)

# White canvas and centred title
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white', title_x=0.5)

# Render, then export as standalone HTML
fig.show()

fig.write_html("../../data/plots/percentage_of_posts_with_media_per_subreddit.html")


# Post counts from the year/month/day extract
df_datetime_pd = pd.read_csv("../../data/csv/year_month_day_eda.csv")
df_datetime_pd.head()


# Average scores from the matching extract
df_datetime_avg_score_pd = pd.read_csv("../../data/csv/year_month_day_avgscore_eda.csv")
df_datetime_avg_score_pd.head()


# Sum counts per (month, year, subreddit), then stamp each row with the first day of its month
monthly_keys = ["month", "year", "subreddit"]
df_datetime_pd_ym = df_datetime_pd.groupby(monthly_keys, as_index=False)["count"].sum()
month_start_parts = df_datetime_pd_ym[['year', 'month']].assign(day=1)
df_datetime_pd_ym['date'] = pd.to_datetime(month_start_parts)
df_datetime_pd_ym = df_datetime_pd_ym.sort_values("date")
df_datetime_pd_ym.head()


# Give the average-score frame the same first-of-month date column and chronological order
score_month_start_parts = df_datetime_avg_score_pd[['year', 'month']].assign(day=1)
df_datetime_avg_score_pd['date'] = pd.to_datetime(score_month_start_parts)
df_datetime_avg_score_pd = df_datetime_avg_score_pd.sort_values("date")
df_datetime_avg_score_pd.head()


# Per-subreddit denominators used to normalize post counts
divisors = dict(movies=382085, anime=404298, television=89586)


def custom_divide(row):
    """Return the row's ``count`` divided by the divisor of its ``subreddit``.

    ``row`` is any mapping-like object with ``'count'`` and ``'subreddit'``
    entries (for example a DataFrame row). A subreddit without an entry in
    ``divisors`` raises ``KeyError``.
    """
    post_count = row['count']
    subreddit_total = divisors[row['subreddit']]
    return post_count / subreddit_total

# Normalize every monthly count by its subreddit's divisor (row-wise via custom_divide)
df_datetime_pd_ym['normalized_count'] = df_datetime_pd_ym.apply(custom_divide, axis=1)


# Custom color map: one fixed colour per subreddit
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',
    'television': '#ffe100'
}

# Time series of the normalized monthly post count, one line per subreddit.
# The labels mapping must name the plotted column ('normalized_count'); the
# previous 'count' key matched no plotted column, so hover text showed the raw
# column name instead of a readable label.
fig = px.line(
    df_datetime_pd_ym,
    x='date',
    y='normalized_count',
    color='subreddit',
    color_discrete_map=color_map,  # Use the custom color map
    labels={'normalized_count': 'Normalized Post Count', 'date': 'Date', 'subreddit': 'Subreddit'},
    line_shape="spline",
    title='Number of posts across the years (2021-2023)',
    render_mode='svg'
)

# Improve the clarity of the plot
fig.update_traces(
    line=dict(width=2),  # Thinner line
    mode='lines+markers',  # Show markers as well as lines
    marker=dict(size=4, opacity=0.6),  # Smaller markers with some transparency
    opacity=0.7  # Lines are a bit transparent to reduce visual clutter
)

# White canvas, legend pinned to the top-right corner of the plot area
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    )
)

# Add quick-range buttons and a range slider for interactivity
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=[
                dict(count=1, label="1M", step="month", stepmode="backward"),
                dict(count=6, label="6M", step="month", stepmode="backward"),
                dict(count=1, label="1Y", step="year", stepmode="backward"),
                dict(step="all")
            ]
        ),
        rangeslider=dict(
            visible=True
        )
    )
)

# Explicit axis titles (these take precedence over the labels mapping for the axes)
fig.update_yaxes(title_text='Normalized Post Count <br> (Post Count / Total Count)')
fig.update_xaxes(title_text = 'Date (2021-2023)')

# Centre the title
fig.update_layout(title_x=0.5)

# Show the plot
fig.show()

fig.write_html("../../data/plots/time_series_eda.html")


# One fixed colour per subreddit
color_map = dict(movies='#FF4301', anime='#ff9200', television='#ffe100')

# Average score over time, one smoothed line per subreddit
fig = px.line(
    df_datetime_avg_score_pd,
    x='date',
    y='average_score',
    color='subreddit',
    color_discrete_map=color_map,
    line_shape="spline",
    render_mode='svg',
    title='Average score of posts across the years (2021-2023)',
    labels={'average_score': 'Average Score', 'date': 'Date', 'subreddit': 'Subreddit'},
)

# Thin, slightly transparent lines with small markers to reduce clutter
fig.update_traces(
    mode='lines+markers',
    line={'width': 2},
    marker={'size': 4, 'opacity': 0.6},
    opacity=0.7,
)

# Quick-range buttons shown above the plot
range_buttons = [
    {'count': 1, 'label': "1M", 'step': "month", 'stepmode': "backward"},
    {'count': 6, 'label': "6M", 'step': "month", 'stepmode': "backward"},
    {'count': 1, 'label': "1Y", 'step': "year", 'stepmode': "backward"},
    {'step': "all"},
]

# White canvas, legend in the top-right corner, range selector + slider, centred title
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend={'yanchor': "top", 'y': 0.99, 'xanchor': "right", 'x': 0.99},
    xaxis={
        'rangeselector': {'buttons': range_buttons},
        'rangeslider': {'visible': True},
    },
    title_x=0.5,
)

# Axis titles
fig.update_yaxes(title_text='Average Score')
fig.update_xaxes(title_text='Date (2021-2023)')

# Render, then export as standalone HTML
fig.show()

fig.write_html("../../data/plots/time_series_score_eda.html")


# Load the day-of-month extracts and order both by day of month
day_of_month_pd = pd.read_csv("../../data/csv/day_of_month_avg_eda.csv")
day_of_month_count_pd = pd.read_csv("../../data/csv/daily_weekly_count_eda.csv")

day_of_month_pd = day_of_month_pd.sort_values('day_of_month')
day_of_month_count_pd = day_of_month_count_pd.sort_values('day_of_month')


# Split the average-score frame into the main subreddits and their suggestion counterparts
main_subreddits = ["anime", "movies", "television"]
suggestion_subreddits = ["Animesuggest", "televisionsuggestions", "MovieSuggestions"]
day_of_month_pd_1 = day_of_month_pd.loc[day_of_month_pd["subreddit"].isin(main_subreddits)]
day_of_month_pd_2 = day_of_month_pd.loc[day_of_month_pd["subreddit"].isin(suggestion_subreddits)]


# One fixed colour per subreddit
color_map = dict(movies='#FF4301', anime='#ff9200', television='#ffe100')

# Smoothed line of average score against day of month for the main subreddits
fig = px.line(
    day_of_month_pd_1,
    x='day_of_month',
    y='average_score',
    color='subreddit',
    color_discrete_map=color_map,
    line_shape="spline",
    title='Average Score by Day of Month Across Subreddits',
    labels={'average_score': 'Average Score', 'day_of_month': 'Day of the Month'},
)

# Show days 1 through 31 on the x axis
fig.update_xaxes(range=[1, 31])

# White canvas, centred title, fixed 800x600 figure
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    title_x=0.5,
    width=800,
    height=600,
)

# Render, then export as standalone HTML
fig.show()

fig.write_html("../../data/plots/avg_score_eda.html")


# One fixed colour per subreddit
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',
    'television': '#ffe100'
}

# Smoothed line of post count against day of month, one line per subreddit.
# The labels mapping names the plotted column ('count'); it previously mapped
# 'average_score' (copied from the average-score chart), which left the y axis
# and hover text showing the raw column name.
fig = px.line(
    day_of_month_count_pd,
    x='day_of_month',
    y='count',
    color='subreddit',
    color_discrete_map=color_map,
    labels={'count': 'Post Count', 'day_of_month': 'Day of the Month'},
    title='Count of posts by Day of Month Across Subreddits',
    line_shape="spline",
)

# Show days 1 through 31 on the x axis
fig.update_xaxes(range=[1, 31])
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')

# Centre the title and fix the figure size
fig.update_layout(
    title_x=0.5,
    width=800,
    height=600
)

# Show the plot
fig.show()

fig.write_html("../../data/plots/day_of_month_count_eda.html")


# Count posts per (weekday name, hour of day, subreddit) in Spark and collect the result to pandas
df_daily_weekly = (
    df.groupBy("day_of_week_str", "hour_of_day", "subreddit")
    .count()
    .toPandas()
)

# Notebook cell output: StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 64, 9, Finished, Available)


# Save the weekday/hour counts under CSV_DIR, without the index column
df_daily_weekly.to_csv(CSV_DIR + "/daily_weekly_eda.csv", index=False)

# Notebook cell output: StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 64, 15, Finished, Available)


# Weekday/hour post counts, read back from CSV
df_daily_weekly = pd.read_csv("../../data/csv/daily_weekly_eda.csv")
df_daily_weekly.head()


# Weekday/hour average scores, ordered by hour of day
df_daily_weekly_avgscore = pd.read_csv("../../data/csv/daily_weekly_avgscore_eda.csv").sort_values("hour_of_day")
df_daily_weekly_avgscore.head()


# Normalize each count by its subreddit's divisor (row-wise via custom_divide)
normalized_counts = df_daily_weekly.apply(custom_divide, axis=1)
df_daily_weekly['normalized_count'] = normalized_counts


# One fixed colour per subreddit
color_map = dict(movies='#FF4301', anime='#ff9200', television='#ffe100')

# Sunburst of post counts: subreddit -> weekday -> hour of day
fig = px.sunburst(
    df_daily_weekly,
    path=['subreddit', 'day_of_week_str', 'hour_of_day'],
    values='count',
    color='subreddit',
    color_discrete_map=color_map,
    title="Distribution of posts across Days of Week and Hours of Day",
)

# Centre the title
fig.update_layout(title_x=0.5)

fig.show()
fig.write_html("../../data/plots/sunburst_eda.html")


# One fixed colour per subreddit
color_map = dict(movies='#FF4301', anime='#ff9200', television='#ffe100')

# Sunburst sized by average score: subreddit -> weekday -> hour of day
fig = px.sunburst(
    df_daily_weekly_avgscore,
    path=['subreddit', 'day_of_week_str', 'hour_of_day'],
    values='average_score',
    color='subreddit',
    color_discrete_map=color_map,
    title="Average scores of posts across Days of Week and Hours of Day",
)

# Centre the title
fig.update_layout(title_x=0.5)

fig.show()
fig.write_html("../../data/plots/sunburst_avgscore_eda.html")


# Both frames below are read from the same top-author score / post-count extract
top_author_csv = "../../data/csv/top_author_score_postcount_eda.csv"

df_top_posts_scores = pd.read_csv(top_author_csv)
df_top_posts_scores.head()


df_top_posts_scores_post_count = pd.read_csv(top_author_csv)
df_top_posts_scores_post_count


# The 15 rows with the highest score
movies_df = df_top_posts_scores_post_count.sort_values(by='score', ascending=False).head(15)
movies_df


# One (author, count) frame per subreddit, highest post count first, duplicate rows removed
movie_author_df = (
    df_top_posts_scores_post_count
    .loc[df_top_posts_scores_post_count['subreddit'] == 'movies']
    .sort_values(by="count", ascending=False)[["author", "count"]]
    .drop_duplicates()
)
tv_author_df = (
    df_top_posts_scores_post_count
    .loc[df_top_posts_scores_post_count['subreddit'] == 'television']
    .sort_values(by="count", ascending=False)[["author", "count"]]
    .drop_duplicates()
)
anime_author_df = (
    df_top_posts_scores_post_count
    .loc[df_top_posts_scores_post_count['subreddit'] == 'anime']
    .sort_values(by="count", ascending=False)[["author", "count"]]
    .drop_duplicates()
)


# Arrow annotations pointing at bars that are too short to see.
# Each entry is anchored at (author, post count) in data coordinates; only one
# entry per list carries the explanatory text, the rest are bare arrows.
movie_annotations = [
    {"x": "lionsgate", "y": 9, "xref": "x", "yref": "y",
     "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
    {"x": "CosmicBlazeKnight", "y": 2, "xref": "x", "yref": "y",
     "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
    {"x": "officialtobeymaguire", "y": 1, "xref": "x", "yref": "y",
     "text": "Due to the limited number of posts, bars for the pointed authors may be hard to spot",
     "ax": 0, "ay": -50, "showarrow": True, "arrowhead": 1},
    {"x": "uxhelpneeded", "y": 1, "xref": "x", "yref": "y",
     "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]

tv_annotations = [
    {"x": "manskies", "y": 2, "xref": "x", "yref": "y",
     "text": "Since the Post Count is low, we can barely see the bar",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]

anime_annotations = [
    {"x": "Srikkk", "y": 32, "xref": "x", "yref": "y",
     "text": "Since the Post Count is low, we can barely see the bars",
     "ax": 0, "ay": -50, "showarrow": True, "arrowhead": 1},
    {"x": "enterthedragonpunch", "y": 1, "xref": "x", "yref": "y",
     "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]

# One bar trace per subreddit; only the movies trace is visible initially
hover_text = "Author: %{x} <br>Post Count: %{y}"
fig = go.Figure(data=[
    go.Bar(x=frame['author'], y=frame['count'], name=trace_name, visible=shown,
           marker_color=bar_color, hovertemplate=hover_text)
    for frame, trace_name, shown, bar_color in (
        (movie_author_df, 'Movies', True, '#FF4301'),
        (tv_author_df, 'TV Shows', False, '#ffe100'),
        (anime_author_df, 'Anime', False, '#ff9200'),
    )
])

# Each dropdown entry shows one trace and swaps in the matching title and annotations
dropdown_buttons = [
    {
        "label": button_label,
        "method": "update",
        "args": [
            {"visible": visible_mask},
            {"title": f"Post count of authors with top scores for {subreddit_name} subreddit",
             "annotations": notes},
        ],
    }
    for button_label, subreddit_name, visible_mask, notes in (
        ("Movie", "movies", [True, False, False], movie_annotations),
        ("Television", "television", [False, True, False], tv_annotations),
        ("Anime", "anime", [False, False, True], anime_annotations),
    )
]

# White plot area, axis titles, initial (movies) annotations and title, dropdown above the plot
fig.update_layout(
    plot_bgcolor='white',
    xaxis={"title_text": "Author"},
    yaxis={"title_text": "Post Count"},
    annotations=movie_annotations,
    updatemenus=[
        {
            "active": 0,
            "buttons": dropdown_buttons,
            "x": 0.9,
            "xanchor": 'left',
            "y": 1.25,
            "yanchor": 'top',
        }
    ],
    title_text="Post count of authors with top scores for movies subreddit",
    title_x=0.5,  # centre the title
)

fig.show()

fig.write_html("../../data/plots/top10_authorscore_postcount_eda.html")


authors_with_top_comments_post_counts = pd.read_csv("../../data/csv/authors_with_top_comments_post_counts.csv")


# One (author, count) frame per subreddit, highest post count first, duplicate rows removed
top_comment_authors = authors_with_top_comments_post_counts
movie_author_df = (
    top_comment_authors[top_comment_authors['subreddit'] == 'movies']
    .sort_values(by="count", ascending=False)[["author", "count"]]
    .drop_duplicates()
)
# Television is additionally cut down to the 10 rows with the most comments
# before num_comments is dropped and the rows are re-sorted by post count
tv_author_df = (
    top_comment_authors[top_comment_authors['subreddit'] == 'television']
    .sort_values(by="count", ascending=False)[["author", "count", "num_comments"]]
    .drop_duplicates()
    .nlargest(10, 'num_comments')
    .drop(columns=['num_comments'])
    .sort_values(by="count", ascending=False)
)
anime_author_df = (
    top_comment_authors[top_comment_authors['subreddit'] == 'anime']
    .sort_values(by="count", ascending=False)[["author", "count"]]
    .drop_duplicates()
)


# Arrow annotations pointing at bars that are too short to see.
# Each entry is anchored at (author, post count) in data coordinates; only one
# entry per list carries the explanatory text, the rest are bare arrows.
movie_annotations = [
    {"x": "LETS_MAKE_IT_AWKWARD", "y": 1, "xref": "x", "yref": "y",
     "text": "Since the Post Count is low, we can barely see the bars",
     "ax": 0, "ay": -50, "showarrow": True, "arrowhead": 1},
    {"x": "officialtobeymaguire", "y": 1, "xref": "x", "yref": "y",
     "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]

tv_annotations = [
    {"x": "ewzetf", "y": 3, "xref": "x", "yref": "y",
     "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
    {"x": "Midnight_Oil_", "y": 1, "xref": "x", "yref": "y",
     "text": "Since the Post Count is low, we can barely see the bars",
     "ax": 0, "ay": -50, "showarrow": True, "arrowhead": 1},
    {"x": "thetanhausergate", "y": 1, "xref": "x", "yref": "y",
     "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]

# The anime entry has empty text, a zero offset and no arrow settings
anime_annotations = [
    {"x": "AnimeMod", "y": 1, "xref": "x", "yref": "y",
     "text": "",
     "ax": 0, "ay": 0},
]

# One bar trace per subreddit; only the movies trace is visible initially
hover_text = "Author: %{x} <br>Post Count: %{y}"
fig = go.Figure(data=[
    go.Bar(x=frame['author'], y=frame['count'], name=trace_name, visible=shown,
           marker_color=bar_color, hovertemplate=hover_text)
    for frame, trace_name, shown, bar_color in (
        (movie_author_df, 'Movies', True, '#FF4301'),
        (tv_author_df, 'TV Shows', False, '#ffe100'),
        (anime_author_df, 'Anime', False, '#ff9200'),
    )
])

# Each dropdown entry shows one trace and swaps in the matching title and annotations
dropdown_buttons = [
    {
        "label": button_label,
        "method": "update",
        "args": [
            {"visible": visible_mask},
            {"title": f"Post counts of authors with top comments for {subreddit_name} subreddit",
             "annotations": notes},
        ],
    }
    for button_label, subreddit_name, visible_mask, notes in (
        ("Movie", "movies", [True, False, False], movie_annotations),
        ("Television", "television", [False, True, False], tv_annotations),
        ("Anime", "anime", [False, False, True], anime_annotations),
    )
]

# White plot area, axis titles, initial (movies) annotations and title, dropdown above the plot
fig.update_layout(
    plot_bgcolor='white',
    xaxis={"title_text": "Author"},
    yaxis={"title_text": "Post Count"},
    annotations=movie_annotations,
    updatemenus=[
        {
            "active": 0,
            "buttons": dropdown_buttons,
            "x": 0.9,
            "xanchor": 'left',
            "y": 1.25,
            "yanchor": 'top',
        }
    ],
    title_text="Post counts of authors with top comments for movies subreddit",
    title_x=0.5,  # centre the title
)

fig.show()

fig.write_html("../../data/plots/Top_10_authors_with_top_comments.html")


# Read the per-column missing-value counts
df = pd.read_csv("../../data/csv/num_missing_val.csv")
df.head()


# Give the 'Column' column a more descriptive header
df = df.rename(columns={'Column': 'Column Name'})


# Sort the DataFrame by 'Missing Values' in descending order
df_sorted = df.sort_values(by='Missing Values', ascending=False)

# Columns shown in the table. The header and the cells are both driven by this
# one list: the header previously listed every DataFrame column while the cells
# were hard-coded to these two, so an extra or differently ordered CSV column
# would have put the wrong header over the data.
table_columns = ['Column Name', 'Missing Values']

# Build the table: orange header with white text, light grey body
fig = go.Figure(data=[go.Table(
    header=dict(values=table_columns,
                fill_color='#FF4301',
                font=dict(color='white'),  # Set font color for header
                align='left'),
    cells=dict(values=[df_sorted[column_name] for column_name in table_columns],
               fill_color='#ececec',  # background colour of the body rows
               align='left'))
])
fig.update_layout(title=dict(text="Distribution of missing values"))
# Centre the title
fig.update_layout(
    title_x=0.5,
)
# Export the figure to an HTML file, then display it
fig.write_html("../../data/plots/table_missing_values.html")
fig.show()


# Read the per-subreddit post counts
df_subreddit = pd.read_csv("../../data/csv/subreddit_count.csv")
df_subreddit


# Prefix every subreddit name with "r/" (exact-value replacement, one name at a time)
for subreddit_name in ("anime", "television", "televisionsuggestions",
                       "movies", "Animesuggest", "MovieSuggestions"):
    df_subreddit = df_subreddit.replace(subreddit_name, "r/" + subreddit_name)


# Ascending order by count, so the smallest subreddit takes y position 1
ordered_df = df_subreddit.sort_values(by='count')
my_range = range(1, len(df_subreddit.index) + 1)
post_counts = ordered_df['count']

# Horizontal lollipop chart: a grey stem from zero to each count, capped with an orange diamond
plt.hlines(y=my_range, xmin=0, xmax=post_counts, color='lightgrey')
plt.plot(post_counts, my_range, "D", markerfacecolor='#FF4301', markeredgecolor='#FF4301')
plt.yticks(my_range, ordered_df['subreddit'])

ax = plt.gca()

# x ticks in thousands ("100K"), with the axis ending 5000 past the largest count
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x / 1000)}K'))
ax.set_xlim(0, post_counts.max() + 5000)

# Write each exact count to the right of its diamond
for row_position, post_count in enumerate(post_counts, start=1):
    ax.annotate(f'{int(post_count):,}', xy=(post_count + 5300, row_position),
                ha='left', va='center', fontsize=10, color='black')

# Axis labels
ax.set_xlabel('Count', fontsize=13)
ax.set_ylabel('Subreddit', fontsize=13)

# Transparent plot background, no top/right borders
ax.set_facecolor('none')
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Title text centred just above the axes (positioned in axes coordinates)
ax.text(0.5, 1.05, 'Subreddit Counts Analysis', ha='center', va='center', fontsize=16,
        transform=ax.transAxes)

# Save as PNG, then display
plt.savefig("../../data/plots/subreddit_count_analysis.png", bbox_inches='tight', dpi=300)

plt.show()


# Read the per-author post counts
author_df = pd.read_csv("../../data/csv/author_eda.csv")
author_df


# Split the author rows by subreddit
subreddit_of_row = author_df['subreddit']
movie_author_df = author_df.loc[subreddit_of_row == 'movies']
tv_author_df = author_df.loc[subreddit_of_row == 'television']
anime_author_df = author_df.loc[subreddit_of_row == 'anime']
movie_author_df


# One bar trace per subreddit; only the movies trace is visible initially
hover_text = "Author: %{x} <br>Post Count: %{y}"
fig = go.Figure(data=[
    go.Bar(x=frame['author'], y=frame['count'], name=trace_name, visible=shown,
           marker_color=bar_color, hovertemplate=hover_text)
    for frame, trace_name, shown, bar_color in (
        (movie_author_df, 'Movies', True, '#FF4301'),
        (tv_author_df, 'TV Shows', False, '#ffe100'),
        (anime_author_df, 'Anime', False, '#ff9200'),
    )
])

# Each dropdown entry shows one trace and swaps in the matching title
dropdown_buttons = [
    {
        "label": button_label,
        "method": "update",
        "args": [
            {"visible": visible_mask},
            {"title": f"Top 10 active authors for {subreddit_name} subreddit"},
        ],
    }
    for button_label, subreddit_name, visible_mask in (
        ("Movie", "movies", [True, False, False]),
        ("Television", "television", [False, True, False]),
        ("Anime", "anime", [False, False, True]),
    )
]

# White plot area, axis titles, dropdown above the plot, centred initial (movies) title
fig.update_layout(
    plot_bgcolor='white',
    xaxis={"title_text": "Author"},
    yaxis={"title_text": "Post Count"},
    updatemenus=[
        {
            "active": 0,
            "buttons": dropdown_buttons,
            "x": 0.85,
            "xanchor": 'left',
            "y": 1.25,
            "yanchor": 'top',
        }
    ],
    title_x=0.5,
    title_text="Top 10 active authors for movies subreddit",
)

# Export as standalone HTML, then display
fig.write_html("../../data/plots/Top_author_post.html")
fig.show()


# Read the top-comment extracts for anime, movies and TV shows (in that order)
anime_comments_df, movies_comments_df, tvshows_comments_df = (
    pd.read_csv(f"../../data/csv/top_comments_{file_suffix}.csv")
    for file_suffix in ("anime", "movies", "tv_show")
)


def _prepare_top_comments(comments_df, limit=10):
    """Return the first ``limit`` non-deleted posts as Content / num_comments / author.

    Rows whose author is ``'[deleted]'`` are removed, a missing ``selftext`` is
    replaced by a single space, and a ``Content`` column is built from the
    title and the body. The input frame is left untouched.
    """
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below neither warn (SettingWithCopyWarning) nor touch the input.
    kept = comments_df[comments_df['author'] != '[deleted]'].copy()
    # Assign the filled column back explicitly: frame[col].fillna(..., inplace=True)
    # is chained assignment, which warns on recent pandas and has no effect under
    # Copy-on-Write, leaving NaN bodies (and therefore NaN Content) behind.
    kept["selftext"] = kept["selftext"].fillna(" ")
    kept["Content"] = "Title: " + kept["title"] + "Body: " + kept["selftext"]
    return kept[["Content", "num_comments", "author"]].head(limit)


# Columns kept in each prepared frame
selected_columns = ["Content", "num_comments", "author"]

# Movies: drop deleted authors, build Content, keep the first 10 rows
movies_comments_df = _prepare_top_comments(movies_comments_df)
movies_comments_df


# Anime: same preparation
anime_comments_df = _prepare_top_comments(anime_comments_df)
anime_comments_df


# TV shows: same preparation
tvshows_comments_df = _prepare_top_comments(tvshows_comments_df)
tvshows_comments_df


# Column order shared by the three per-subreddit comment tables
selected_columns_movies = ['Content', 'num_comments', 'author']
selected_columns_tv = list(selected_columns_movies)
selected_columns_anime = list(selected_columns_movies)

# One list of cell values per column, which is the layout handed to the
# go.Table cells when the figure is built
data_values_movies = [
    movies_comments_df[column_name].tolist() for column_name in selected_columns_movies
]
data_values_tv = [
    tvshows_comments_df[column_name].tolist() for column_name in selected_columns_tv
]
data_values_anime = [
    anime_comments_df[column_name].tolist() for column_name in selected_columns_anime
]

# Header captions shown in place of the raw column names
header_values = ['Content', 'Number of Comments', 'Author']

# Table colours: header fill / body fill, then header text / cell text
header_color, body_color = "#FF4301", "#ececec"
font_color, font_color_cell = "white", "black"
# Interactive table of the most-commented posts, switchable per subreddit.
# Each entry is (dropdown label, subreddit name used in the title, cell values).
# The dropdown buttons toggle traces purely by position, so the traces and the
# buttons are both generated from this one list to keep their order in sync.
comment_tables = [
    ("Movie", "movies", data_values_movies),
    ("Television", "television", data_values_tv),
    ("Anime", "anime", data_values_anime),
]
comment_title_template = "Top 10 posts having maximum comments for {} subreddit"

# One go.Table trace per subreddit; only the first (movies) is visible on load
fig = go.Figure(
    data=[
        go.Table(
            header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)),
            cells=dict(values=cell_values, fill_color=body_color, font=dict(color=font_color_cell)),
            visible=(position == 0),
            columnwidth=[2.5, 0.25, 0.4],
        )
        for position, (_, _, cell_values) in enumerate(comment_tables)
    ]
)

# Dropdown: each button shows exactly one trace and updates the title to match
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=[
                dict(
                    label=label,
                    method="update",
                    args=[
                        {"visible": [shown == position for shown in range(len(comment_tables))]},
                        {"title": comment_title_template.format(subreddit)},
                    ],
                )
                for position, (label, subreddit, _) in enumerate(comment_tables)
            ],
            x=0.85,  # Horizontal position of the dropdown (paper coordinates)
            xanchor='left',  # Anchored the dropdown to the left
            y=1.05,  # Vertical position of the dropdown, just above the table
            yanchor='top'  # Anchored the dropdown to the top
        )
    ],
    margin=dict(l=10, r=10, t=15, b=20),  # Adjust margins
    height=1000,  # Adjust height
    # The initial title is built from the same template as the "Movie" button,
    # so its wording does not change when that button is pressed.
    title_text=comment_title_template.format("movies"),
    title_x=0.5,  # Centering the title
)

# Export the figure to a html file
fig.write_html("../../data/plots/table_top_comments.html")
fig.show()


# Load the top-scoring posts extract (one row per post, several subreddits)
scores_df = pd.read_csv("../../data/csv/top_author_score_postcount_eda.csv")
scores_df.head()


# Replace missing bodies with a blank so the string concatenation below does
# not produce NaN. The result is assigned back to the column: calling
# fillna(inplace=True) on a selected column is chained assignment, which
# pandas warns about and which has no effect under Copy-on-Write.
scores_df["selftext"] = scores_df["selftext"].fillna(" ")
# Single display column combining title and body.
# NOTE(review): there is no separator before "Body: ", so the title text runs
# straight into it in the rendered table.
scores_df["Content"] = "Title: " + scores_df["title"] + "Body: " + scores_df["selftext"]
# Keep only the columns needed for the tables (all rows are retained) and
# preview the first rows
selected_columns = ["Content", "score","author","subreddit"]
scores_df = scores_df[selected_columns]
scores_df.head()


# Split the score table into one frame per subreddit
movie_score_df, tv_score_df, anime_score_df = (
    scores_df.loc[scores_df['subreddit'].eq(subreddit_name)]
    for subreddit_name in ('movies', 'television', 'anime')
)


# Column order shared by the three per-subreddit score tables
selected_columns_movies = ['Content', 'score', 'author']
selected_columns_tv = list(selected_columns_movies)
selected_columns_anime = list(selected_columns_movies)

# One list of cell values per column, which is the layout handed to the
# go.Table cells when the figure is built
data_values_movies = [
    movie_score_df[column_name].tolist() for column_name in selected_columns_movies
]
data_values_tv = [
    tv_score_df[column_name].tolist() for column_name in selected_columns_tv
]
data_values_anime = [
    anime_score_df[column_name].tolist() for column_name in selected_columns_anime
]

# Header captions shown in place of the raw column names
header_values = ['Content', 'Score', 'Author']

# Table colours: header fill / body fill, then header text / cell text
header_color, body_color = "#FF4301", "#ececec"
font_color, font_color_cell = "white", "black"
# Interactive table of the highest-scoring posts, switchable per subreddit.
# Each entry is (dropdown label, subreddit name used in the title, cell values,
# column widths). The dropdown buttons toggle traces purely by position, so the
# traces and the buttons are both generated from this one list to keep their
# order in sync.
# NOTE(review): the anime table uses different column widths from the other
# two; kept as-is - confirm whether that is intentional.
score_tables = [
    ("Movie", "movies", data_values_movies, [2, 0.25, 0.45]),
    ("Television", "television", data_values_tv, [2, 0.25, 0.45]),
    ("Anime", "anime", data_values_anime, [2.5, 0.2, 0.45]),
]
score_title_template = "Top 10 posts having maximum score for {} subreddit"

# One go.Table trace per subreddit; only the first (movies) is visible on load
fig = go.Figure(
    data=[
        go.Table(
            header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)),
            cells=dict(values=cell_values, fill_color=body_color, font=dict(color=font_color_cell)),
            visible=(position == 0),
            columnwidth=column_widths,
        )
        for position, (_, _, cell_values, column_widths) in enumerate(score_tables)
    ]
)

# Dropdown: each button shows exactly one trace and updates the title to match
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=[
                dict(
                    label=label,
                    method="update",
                    args=[
                        {"visible": [shown == position for shown in range(len(score_tables))]},
                        {"title": score_title_template.format(subreddit)},
                    ],
                )
                for position, (label, subreddit, _, _) in enumerate(score_tables)
            ],
            x=0.85,  # Horizontal position of the dropdown (paper coordinates)
            xanchor='left',  # Anchored the dropdown to the left
            y=1.15,  # Vertical position of the dropdown, above the table
            yanchor='top'  # Anchored the dropdown to the top
        )
    ],
    margin=dict(l=20, r=20, t=15, b=20),  # Adjust margins
    height=350,  # Adjust height
    # The initial title is built from the same template as the "Movie" button,
    # so its wording does not change when that button is pressed.
    title_text=score_title_template.format("movies"),
    title_x=0.5,  # Centering the title
)

# Export the figure to a html file
fig.write_html("../../data/plots/table_top_score.html")

fig.show()


# External dataset: Netflix best-movies CSV; preview the first rows
df_external_movies = pd.read_csv("../../data/csv/best_movies_netflix_ext.csv") 
df_external_movies.head()


# External dataset: Netflix best-shows CSV; preview the frame that was just
# loaded (previously this re-displayed the movies frame)
df_external_shows = pd.read_csv("../../data/csv/best_shows_netflix_ext.csv") 
df_external_shows.head()

	Unnamed: 0	subreddit	title	num_comments	selftext	author	score	rank	count
0	0	anime	"Berserk" creator Kentaro Miura dead at 54	1762	NaN	enterthedragonpunch	33384	1	1
1	1	anime	Who will be the first seed in Best Girl 8?	619	Hi everyone, we are currently trialing a new f...	mpp00	31830	2	241
2	2	anime	Best Girl 9 Prediction Tournament!	264	NaN	mpp00	30302	3	241
3	3	anime	The Devil is a Part-Timer Season 2 Announced!	2486	NaN	Srikkk	30213	4	32
4	4	anime	"Spice and Wolf" New Anime Announced	1897	NaN	dorkmax_executives	29222	5	304

	Unnamed: 0	subreddit	title	num_comments	selftext	author	score	rank	count
0	0	anime	"Berserk" creator Kentaro Miura dead at 54	1762	NaN	enterthedragonpunch	33384	1	1
1	1	anime	Who will be the first seed in Best Girl 8?	619	Hi everyone, we are currently trialing a new f...	mpp00	31830	2	241
2	2	anime	Best Girl 9 Prediction Tournament!	264	NaN	mpp00	30302	3	241
3	3	anime	The Devil is a Part-Timer Season 2 Announced!	2486	NaN	Srikkk	30213	4	32
4	4	anime	"Spice and Wolf" New Anime Announced	1897	NaN	dorkmax_executives	29222	5	304
5	5	anime	'Konosuba' Season 3 Announced	993	NaN	RobotiSC	28555	6	519
6	6	anime	"JoJo's Bizarre Adventure Part 6" Anime Announced	1334	NaN	Lovro26	27942	7	1450
7	7	anime	Konosuba \| New Anime Key Visual (HQ)	762	NaN	MarvelsGrantMan136	27634	8	3480
8	8	anime	One Punch Man Season 3 Announced	1264	NaN	Turbostrider27	27156	9	2235
9	9	anime	Shingeki no Kyojin: The Final Season - Episode...	6434	Shingeki no Kyojin: The Final Season, episod...	AutoLovepon	26579	10	6982
10	10	television	Former 'Reading Rainbow' host LeVar Burton wan...	2903	NaN	esporx	61389	1	141
11	11	television	LeVar Burton wants Jeopardy producers to know ...	1513	NaN	chanma50	45599	2	1662
12	12	television	For the Love of God, Let LeVar Burton Host Jeo...	1602	NaN	manskies	45499	3	2
13	13	television	‘Futurama’ Revival Ordered at Hulu With Multip...	2533	NaN	chanma50	40529	4	1662
14	14	television	‘Mindhunter’ Director Urges Fans to Make Noise...	1149	NaN	MarvelsGrantMan136	39701	5	3480
15	15	television	Jessica Walter Dies: Emmy-Winning ‘Arrested De...	1746	NaN	chanma50	39183	6	1662
16	16	television	Biden Inauguration Captures Bigger Audience Th...	2038	NaN	chanma50	38453	7	1662
17	17	television	Terry Crews Receives A Star On The Hollywood W...	661	NaN	Gato1980	35198	8	210
18	18	television	Pedro Pascal To Star As Joel In ‘The Last Of U...	2671	NaN	chanma50	34814	9	1662
19	19	television	Conan O’Brien Deserved Better. One of the most...	1851	NaN	Samoht99	34665	10	1180
20	20	movies	Hi, I’m Keanu Reeves, AMA	33376	NaN	lionsgate	282232	1	9
21	21	movies	Hi, I’m Tobey Maguire, actor/executive produce...	17793	NaN	officialtobeymaguire	192782	2	1
22	22	movies	Hello, I’m Nicolas Cage and welcome to Ask Me ...	26670	NaN	lionsgate	189542	3	9
23	23	movies	Please Bring Back Voice Actors, Stop Celebrity...	5191	NaN	fungobat	137551	4	82
24	24	movies	Brendan Fraser Wins Academy Award for Best Act...	3290	NaN	MarvelsGrantMan136	109148	5	3480
25	25	movies	‘Dune’ Sequel Greenlit By Legendary For Exclus...	6559	NaN	CosmicBlazeKnight	108958	6	2
26	26	movies	Guy On Doomed Planet Mostly Concerned With Ski...	5291	NaN	uxhelpneeded	103672	7	1
27	27	movies	Gilbert Gottfried, Comedian and ‘Aladdin’ Star...	4875	NaN	chanma50	103500	8	1662
28	28	movies	WillSmith Banned from Attending Oscars Ceremon...	10783	NaN	MarvelsGrantMan136	101136	9	3480
29	29	movies	Robbie Coltrane, Comic Performer Who Played Ha...	2467	NaN	MarvelsGrantMan136	94867	10	3480

	Unnamed: 0	subreddit	title	num_comments	selftext	author	score	rank	count
20	20	movies	Hi, I’m Keanu Reeves, AMA	33376	NaN	lionsgate	282232	1	9
21	21	movies	Hi, I’m Tobey Maguire, actor/executive produce...	17793	NaN	officialtobeymaguire	192782	2	1
22	22	movies	Hello, I’m Nicolas Cage and welcome to Ask Me ...	26670	NaN	lionsgate	189542	3	9
23	23	movies	Please Bring Back Voice Actors, Stop Celebrity...	5191	NaN	fungobat	137551	4	82
24	24	movies	Brendan Fraser Wins Academy Award for Best Act...	3290	NaN	MarvelsGrantMan136	109148	5	3480
25	25	movies	‘Dune’ Sequel Greenlit By Legendary For Exclus...	6559	NaN	CosmicBlazeKnight	108958	6	2
26	26	movies	Guy On Doomed Planet Mostly Concerned With Ski...	5291	NaN	uxhelpneeded	103672	7	1
27	27	movies	Gilbert Gottfried, Comedian and ‘Aladdin’ Star...	4875	NaN	chanma50	103500	8	1662
28	28	movies	WillSmith Banned from Attending Oscars Ceremon...	10783	NaN	MarvelsGrantMan136	101136	9	3480
29	29	movies	Robbie Coltrane, Comic Performer Who Played Ha...	2467	NaN	MarvelsGrantMan136	94867	10	3480
10	10	television	Former 'Reading Rainbow' host LeVar Burton wan...	2903	NaN	esporx	61389	1	141
11	11	television	LeVar Burton wants Jeopardy producers to know ...	1513	NaN	chanma50	45599	2	1662
12	12	television	For the Love of God, Let LeVar Burton Host Jeo...	1602	NaN	manskies	45499	3	2
13	13	television	‘Futurama’ Revival Ordered at Hulu With Multip...	2533	NaN	chanma50	40529	4	1662
14	14	television	‘Mindhunter’ Director Urges Fans to Make Noise...	1149	NaN	MarvelsGrantMan136	39701	5	3480

	Unnamed: 0	author	subreddit	count	rank
0	1	AutoLovepon	anime	6982	2
1	2	Turbostrider27	anime	2215	3
2	3	Gvostfr	anime	1677	4
3	4	Lovro26	anime	1450	5
4	5	inspyral	anime	787	6
5	6	AnimeMod	anime	755	7
6	7	SuperAlloyBerserker	anime	595	8
7	8	Shimmering-Sky	anime	557	9
8	9	RobotiSC	anime	517	10
9	10	SorcererOfTheLake	anime	441	11
10	12	wednesdaygiftinfo	movies	24222	2
11	13	allthebestmovies	movies	10353	3
12	14	Alternative-Bat-2458	movies	3538	4
13	15	Ornery-Control-9474	movies	3475	5
14	16	Sisiwakanamaru	movies	1861	6
15	17	MovieBattleGame	movies	1471	7
16	18	MarvelsGrantMan136	movies	1362	8
17	19	chanma50	movies	939	9
18	20	Samoht99	movies	842	10
19	21	indig0sixalpha	movies	820	11
20	23	MarvelsGrantMan136	television	1960	2
21	24	Sisiwakanamaru	television	1139	3
22	25	klutzysunshine	television	966	4
23	26	chanma50	television	723	5
24	27	misana123	television	663	6
25	28	PetyrDayne	television	657	7
26	29	Neo2199	television	602	8
27	30	indig0sixalpha	television	564	9
28	31	GroundbreakingSet187	television	436	10
29	32	DemiFiendRSA	television	403	11

	Unnamed: 0	author	subreddit	count	rank
10	12	wednesdaygiftinfo	movies	24222	2
11	13	allthebestmovies	movies	10353	3
12	14	Alternative-Bat-2458	movies	3538	4
13	15	Ornery-Control-9474	movies	3475	5
14	16	Sisiwakanamaru	movies	1861	6
15	17	MovieBattleGame	movies	1471	7
16	18	MarvelsGrantMan136	movies	1362	8
17	19	chanma50	movies	939	9
18	20	Samoht99	movies	842	10
19	21	indig0sixalpha	movies	820	11

Importing the libraries¶

Exploratory Data Analysis¶

Read Data from Spark¶

Data Cleaning¶

Scatterplot for Engagement metrics for Reddit posts:¶

Engagement on the basis of media post¶

Distribution of Reddit posts over time from 2021¶

Count and Average Score of posts from 2021-2023¶

Count and Average Score across different days of the month¶

Count and Average Score of posts across different days of week and hours of day¶

Analysis of post counts for Authors with highest score¶

Top Authors with Top Comments¶

Table to display missing values in dataset¶

Lollipop plot to display the Subreddit count¶

Plot for authors with top post¶

Table for posts with top comments¶

Table for posts with top score¶

External Dataset Table¶

	subreddit	day	month	year	count
0	movies	19	11	2022	320
1	television	16	6	2021	113
2	anime	31	3	2021	766
3	television	18	5	2021	135
4	television	19	7	2022	132

	subreddit	year	month	average_score
0	movies	2021	5	92.467274
1	movies	2021	10	153.312225
2	television	2021	3	326.358652
3	television	2021	6	278.586502
4	anime	2021	9	71.181420

	month	year	subreddit	count	date
0	1	2021	anime	22775	2021-01-01
1	1	2021	movies	15673	2021-01-01
2	1	2021	television	3513	2021-01-01
11	2	2021	television	3629	2021-02-01
10	2	2021	movies	15617	2021-02-01

	subreddit	year	month	average_score	date
20	movies	2021	1	169.982262	2021-01-01
26	anime	2021	1	66.665379	2021-01-01
23	television	2021	1	376.816112	2021-01-01
24	anime	2021	2	67.384173	2021-02-01
21	television	2021	2	326.005511	2021-02-01

	day_of_week_str	hour_of_day	subreddit	count
0	Friday	8	anime	1768
1	Friday	14	television	791
2	Tuesday	15	movies	2932
3	Monday	16	movies	3206
4	Monday	18	television	900

	subreddit	day_of_week_str	average_score
0	television	Wednesday	243.692586
91	movies	Friday	136.785370
109	television	Saturday	215.709350
139	television	Friday	233.622340
156	movies	Wednesday	97.904864

	Unnamed: 0	subreddit	count
0	0	anime	404298
1	1	television	89586
2	2	televisionsuggestions	7991
3	3	movies	382085
4	4	Animesuggest	74101
5	5	MovieSuggestions	58907

	Content	num_comments	author
0	Title: Name a single movie, where the sequel o...	35446	dpemerson76
1	Title: Hi, I’m Keanu Reeves, AMABody:	33376	lionsgate
2	Title: Official Discussion - Zack Snyder's Jus...	30350	LiteraryBoner
3	Title: 1 Teen Dead, Another Critically Injured...	28664	prsnreddit
4	Title: Hello, I’m Nicolas Cage and welcome to ...	26670	lionsgate
5	Title: Official Oscars Discussion Thread 2022B...	22097	LiteraryBoner
6	Title: Official Discussion - Spider-Man: No Wa...	21419	LiteraryBoner
8	Title: Official Discussion - Avatar: The Way o...	19888	LiteraryBoner
9	Title: Official Oscars Discussion Thread 2023B...	18380	LiteraryBoner
10	Title: Hi, I’m Tobey Maguire, actor/executive ...	17793	officialtobeymaguire

	Content	num_comments	author
0	Title: Casual Discussion Fridays - Week of Mar...	18528	AnimeMod
1	Title: Casual Discussion Fridays - Week of Apr...	18074	AnimeMod
2	Title: Casual Discussion Fridays - Week of Apr...	16970	AnimeMod
3	Title: Casual Discussion Fridays - Week of Mar...	16651	AnimeMod
4	Title: Casual Discussion Fridays - Week of Jun...	16258	AutoModerator
5	Title: Casual Discussion Fridays - Week of Apr...	16229	AnimeMod
6	Title: Casual Discussion Fridays - Week of Apr...	16104	AnimeMod
7	Title: Casual Discussion Fridays - Week of Apr...	16084	AnimeMod
8	Title: Casual Discussion Fridays - Week of Sep...	15984	AutoModerator
9	Title: Casual Discussion Fridays - Week of Jan...	15277	AnimeMod

	Content	num_comments	author
0	Title: Will Smith Slaps Chris Rock at The Osca...	9332	Midnight_Oil_
1	Title: Dave Chappelle Lands Emmy Nomination fo...	7589	Neo2199
2	Title: ‘House of the Dragon’ Star Steve Toussa...	6660	overvivideo
3	Title: GLAAD condemns Dave Chappelle, Netflix ...	6550	LarryPeru
4	Title: Dave Chappelle Calls Kids Who Dared Cri...	5976	inthetownwhere
5	Title: ‘Cowboy Bebop’ Canceled By Netflix Afte...	5974	MarvelsGrantMan136
6	Title: What color is an elf? Or a Sea Snake? A...	5828	ewzetf
7	Title: Gina Carano Star Wars: She is No Longer...	5745	thetanhausergate
8	Title: Netflix Co-CEO Ted Sarandos Defends Dav...	5740	Neo2199
9	Title: The Last of Us - Series Premiere Discus...	5721	NicholasCajun

	Column	Missing Values
0	subreddit	0
1	author	0
2	title	0
3	selftext	0
4	created_utc	0

	TITLE	RELEASE_YEAR	SCORE	NUMBER_OF_VOTES	DURATION	MAIN_GENRE	MAIN_PRODUCTION
0	David Attenborough: A Life on Our Planet	2020	9.0	31180	83	documentary	GB
1	Inception	2010	8.8	2268288	148	scifi	GB
2	Forrest Gump	1994	8.8	1994599	142	drama	US
3	Anbe Sivam	2003	8.7	20595	160	comedy	IN
4	Bo Burnham: Inside	2021	8.7	44074	87	comedy	US

	TITLE	RELEASE_YEAR	SCORE	NUMBER_OF_VOTES	DURATION	MAIN_GENRE	MAIN_PRODUCTION
0	David Attenborough: A Life on Our Planet	2020	9.0	31180	83	documentary	GB
1	Inception	2010	8.8	2268288	148	scifi	GB
2	Forrest Gump	1994	8.8	1994599	142	drama	US
3	Anbe Sivam	2003	8.7	20595	160	comedy	IN
4	Bo Burnham: Inside	2021	8.7	44074	87	comedy	US

	Content	score	author	subreddit
0	Title: "Berserk" creator Kentaro Miura dead at...	33384	enterthedragonpunch	anime
1	Title: Who will be the first seed in Best Girl...	31830	mpp00	anime
2	Title: Best Girl 9 Prediction Tournament! Body:	30302	mpp00	anime
3	Title: The Devil is a Part-Timer Season 2 Anno...	30213	Srikkk	anime
4	Title: "Spice and Wolf" New Anime AnnouncedBod...	29222	dorkmax_executives	anime