Importing the libraries¶

In [2]:
# Importing the libraries 
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
from pyspark.sql.functions import length
from pyspark.sql import functions as F

Exploratory Data Analysis¶

Read Data from Spark¶

Data Cleaning¶

In [239]:
datastore = 'azureml://datastores/workspaceblobstore/paths/'
submissions_path = 'filtered-submissions'
submissions_df = spark.read.parquet(f"{datastore}{submissions_path}")

# take a subset of columns
df = submissions_df.select("subreddit", "author", "title", "selftext",
                             "created_utc", "num_comments", "score", 
                             "over_18", "media", "pinned", "locked", 
                             "disable_comments", "domain", "hidden", 
                             "distinguished", "hide_score")

# calculate post length
df = df.withColumn('post_length', length(df.title) + length(df.selftext))


df = df.withColumn('created_utc', F.to_timestamp('created_utc'))

# Extract time-based features
df = df.withColumn('hour_of_day', F.hour('created_utc'))
df = df.withColumn('day_of_week', F.dayofweek('created_utc'))  # 1 (Sunday) to 7 (Saturday)
# Map each day of the week from numeric to string
df = df.withColumn('day_of_week_str', F.expr("""
    CASE day_of_week 
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END
"""))
df = df.withColumn('day_of_month', F.dayofmonth('created_utc'))
df = df.withColumn('month', F.month('created_utc'))
df = df.withColumn('year', F.year('created_utc'))

df = df.withColumn('has_media', F.col('media').isNotNull())

df = df.drop(*["media", "disable_comments", "distinguished"])
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 108, 9, Finished, Available)
In [3]:
PLOT_DIR = os.path.join("Users/sk2224/fall-2023-reddit-project-team-34/data", "plots")
CSV_DIR = os.path.join("Users/sk2224/fall-2023-reddit-project-team-34/data", "csv")
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 105, 8, Finished, Available)

Scatterplot for Engagement metrics for Reddit posts:¶

In [4]:
df_plotly = df.select(["subreddit", "num_comments", "score", "has_media", "post_length"])
df_plotly = df_plotly.filter(df_plotly.subreddit.isin('movies', 'anime', 'television'))
df_plotly_pd = df_plotly.toPandas()

color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the scatter plot with custom colors
fig = px.scatter(
    df_plotly_pd, 
    y='post_length', 
    x='score', 
    color='subreddit',
    color_discrete_map=color_map,  # Use the custom color map
    size='num_comments', 
    labels={'num_comments': 'Number of Comments', 'score': 'Score', 
            'subreddit': 'Subreddit', 'post_length': 'Post Length'},
    title='Engagement Dynamics of Reddit Posts Across Entertainment Subreddits'
)

# Update layout and axis limits
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
fig.update_xaxes(range=[0, 50000])
fig.update_yaxes(range=[0, 8000])

# Show the plot
fig.show()

fig.write_html(f"{PLOT_DIR}/engagement_eda.html")
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 48, 9, Finished, Available)

Engagement on the basis of media post¶

In [263]:
df_plotly_pd["has_media"] = df_plotly_pd["has_media"].astype('str')
In [264]:
color_map = {
    'False': '#FF4301',
    'True': '#ffe100' 
}

# Create the scatter plot with custom colors
fig = px.scatter(
    df_plotly_pd, 
    y='post_length', 
    x='score', 
    color='has_media',
    color_discrete_map=color_map,  # Use the custom color map
    size='num_comments', 
    size_max=50,  # Adjusting the maximum size of the bubbles
    labels={'num_comments': 'Number of Comments', 'score': 'Score', 
            'has_media': 'Has Media', 'post_length': 'Post Length'},
    title='Engagement Dynamics of Reddit Posts with and without Media'
)

# Update layout and axis limits
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
fig.update_xaxes(range=[0, 50000])
fig.update_yaxes(range=[0, 8000])
# fig.update_traces(marker=dict(size_max=30))
fig.update_layout(
    title_x=0.5)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/engagement_with_media.html")
In [267]:
# Group by 'subreddit' and 'has_media', and count the occurrences
grouped_data = df_plotly_pd.groupby(['subreddit', 'has_media']).size().reset_index(name='count')
total_counts = grouped_data.groupby('subreddit')['count'].transform('sum')
grouped_data['percentage'] = grouped_data['count'] / total_counts * 100
grouped_data['percentage'] = grouped_data['percentage'].round(2)
grouped_data['percentage_text'] = grouped_data['percentage'].round(2).astype(str) + '%'

color_map = {
    'False': '#FF4301',
    'True': '#ffe100' 
}

fig = px.bar(
    grouped_data, 
    x='subreddit', 
    y='percentage', 
    color='has_media', 
    color_discrete_map=color_map,  # Use the custom color map
    barmode='group',
    text='percentage_text',
    title='Percentage of Posts with and without Media per Subreddit',
    labels={'percentage': 'Percentage of Posts', 'subreddit': 'Subreddit'}
)

# Update layout and axis limits
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
fig.update_layout(
    title_x=0.5)


# Show the plot
fig.show()

fig.write_html("../../data/plots/percentage_of_posts_with_media_per_subreddit.html")

Distribution of Reddits posts over time from 2021¶

Count and Average Score of posts from 2021-2023¶

In [225]:
df_datetime_pd = pd.read_csv("../../data/csv/year_month_day_eda.csv")
df_datetime_pd.head()
Out[225]:
subreddit day month year count
0 movies 19 11 2022 320
1 television 16 6 2021 113
2 anime 31 3 2021 766
3 television 18 5 2021 135
4 television 19 7 2022 132
In [226]:
df_datetime_avg_score_pd = pd.read_csv("../../data/csv/year_month_day_avgscore_eda.csv")
df_datetime_avg_score_pd.head()
Out[226]:
subreddit year month average_score
0 movies 2021 5 92.467274
1 movies 2021 10 153.312225
2 television 2021 3 326.358652
3 television 2021 6 278.586502
4 anime 2021 9 71.181420
In [227]:
df_datetime_pd_ym = df_datetime_pd.groupby(["month", "year", "subreddit"], as_index=False)["count"].sum()
# Convert year, month, and day_of_month to a datetime column in Pandas
df_datetime_pd_ym['date'] = pd.to_datetime(df_datetime_pd_ym[['year', 'month']].assign(day=1))
df_datetime_pd_ym = df_datetime_pd_ym.sort_values(by="date")
df_datetime_pd_ym.head()
Out[227]:
month year subreddit count date
0 1 2021 anime 22775 2021-01-01
1 1 2021 movies 15673 2021-01-01
2 1 2021 television 3513 2021-01-01
11 2 2021 television 3629 2021-02-01
10 2 2021 movies 15617 2021-02-01
In [228]:
df_datetime_avg_score_pd['date'] = pd.to_datetime(df_datetime_avg_score_pd[['year', 'month']].assign(day=1))
df_datetime_avg_score_pd = df_datetime_avg_score_pd.sort_values(by="date")
df_datetime_avg_score_pd.head()
Out[228]:
subreddit year month average_score date
20 movies 2021 1 169.982262 2021-01-01
26 anime 2021 1 66.665379 2021-01-01
23 television 2021 1 376.816112 2021-01-01
24 anime 2021 2 67.384173 2021-02-01
21 television 2021 2 326.005511 2021-02-01
In [229]:
# Define the division factors in a dictionary
divisors = {
    'movies': 382085,
    'anime': 404298,
    'television': 89586
}

# Function to apply the custom division based on the subreddit
def custom_divide(row):
    return row['count'] / divisors[row['subreddit']]

# Apply the function to each row
df_datetime_pd_ym['normalized_count'] = df_datetime_pd_ym.apply(custom_divide, axis=1)
In [230]:
# Custom color map
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the time series plot using Plotly
fig = px.line(
    df_datetime_pd_ym,
    x='date',
    y='normalized_count',
    color='subreddit',
    color_discrete_map=color_map,  # Use the custom color map
    labels={'count': 'Post Count', 'date': 'Date', 'subreddit': 'Subreddit'},
    line_shape="spline",
    title='Number of posts across the years (2021-2023)',
    render_mode='svg'
)

# Improve the clarity of the plot
fig.update_traces(
    line=dict(width=2),  # Thinner line
    mode='lines+markers',  # Show markers as well as lines
    marker=dict(size=4, opacity=0.6),  # Smaller markers with some transparency
    opacity=0.7  # Lines are a bit transparent to reduce visual clutter
)

# Enhance the layout
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    )
)

# Add range slider for interactivity
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1M", step="month", stepmode="backward"),
                dict(count=6, label="6M", step="month", stepmode="backward"),
                dict(count=1, label="1Y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        )
    )
)

fig.update_yaxes(title_text='Normalized Post Count <br> (Post Count / Total Count)')
fig.update_xaxes(title_text = 'Date (2021-2023)')

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    # width=1000,   # Increasing width
    # height=600   # Increasing height
)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/time_series_eda.html")
In [231]:
# Custom color map
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the time series plot using Plotly
fig = px.line(
    df_datetime_avg_score_pd,
    x='date',
    y='average_score',
    color='subreddit',
    color_discrete_map=color_map,  # Use the custom color map
    labels={'average_score': 'Average Score', 'date': 'Date', 'subreddit': 'Subreddit'},
    line_shape="spline",
    title='Average score of posts across the years (2021-2023)',
    render_mode='svg'
)

# Improve the clarity of the plot
fig.update_traces(
    line=dict(width=2),  # Thinner line
    mode='lines+markers',  # Show markers as well as lines
    marker=dict(size=4, opacity=0.6),  # Smaller markers with some transparency
    opacity=0.7  # Lines are a bit transparent to reduce visual clutter
)

# Enhance the layout
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    )
)

# Add range slider for interactivity
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1M", step="month", stepmode="backward"),
                dict(count=6, label="6M", step="month", stepmode="backward"),
                dict(count=1, label="1Y", step="year", stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        )
    )
)

fig.update_yaxes(title_text='Average Score')
fig.update_xaxes(title_text = 'Date (2021-2023)')

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    # width=1000,   # Increasing width
    # height=600   # Increasing height
)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/time_series_score_eda.html")

Count and Average Score across different days of the month¶

In [233]:
day_of_month_pd = pd.read_csv(f"../../data/csv/day_of_month_avg_eda.csv")
day_of_month_count_pd = pd.read_csv("../../data/csv/daily_weekly_count_eda.csv")
In [234]:
day_of_month_pd = day_of_month_pd.sort_values(by='day_of_month')
day_of_month_count_pd = day_of_month_count_pd.sort_values(by='day_of_month')
In [235]:
day_of_month_pd_1 = day_of_month_pd[day_of_month_pd["subreddit"].isin(["anime", "movies", "television"])]
day_of_month_pd_2 = day_of_month_pd[day_of_month_pd["subreddit"].isin(["Animesuggest", "televisionsuggestions", "MovieSuggestions"])]
In [236]:
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the line chart 
fig = px.line(
    day_of_month_pd_1, 
    x='day_of_month', 
    y='average_score', 
    color='subreddit',
    color_discrete_map=color_map,
    labels={'average_score': 'Average Score', 'day_of_month': 'Day of the Month'},
    title='Average Score by Day of Month Across Subreddits',
    line_shape="spline",
)

fig.update_xaxes(range=[1, 31])  
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=800,   # Increasing width
    height=600   # Increasing height
)

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/avg_score_eda.html")
In [237]:
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}

# Create the line chart 
fig = px.line(
    day_of_month_count_pd, 
    x='day_of_month', 
    y='count', 
    color='subreddit',
    color_discrete_map=color_map,
    labels={'average_score': 'Average Score', 'day_of_month': 'Day of the Month'},
    title='Count of posts by Day of Month Across Subreddits',
    line_shape="spline",
)

fig.update_xaxes(range=[1, 31])  
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
    width=800,   # Increasing width
    height=600   # Increasing height
)

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)

# Show the plot
fig.show()

fig.write_html(f"../../data/plots/day_of_month_count_eda.html")

Count and Average Score of posts across different days of week and hours of day¶

In [238]:
df_daily_weekly = df.groupBy(["day_of_week_str", "hour_of_day", "subreddit"]).count().toPandas()
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 64, 9, Finished, Available)
In [10]:
df_daily_weekly.to_csv(f"{CSV_DIR}/daily_weekly_eda.csv", index=False)
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 64, 15, Finished, Available)
In [240]:
df_daily_weekly = pd.read_csv(f"../../data/csv/daily_weekly_eda.csv")
df_daily_weekly.head()
Out[240]:
day_of_week_str hour_of_day subreddit count
0 Friday 8 anime 1768
1 Friday 14 television 791
2 Tuesday 15 movies 2932
3 Monday 16 movies 3206
4 Monday 18 television 900
In [242]:
df_daily_weekly_avgscore = pd.read_csv("../../data/csv/daily_weekly_avgscore_eda.csv")
df_daily_weekly_avgscore = df_daily_weekly_avgscore.sort_values(by="hour_of_day")
df_daily_weekly_avgscore.head()
Out[242]:
subreddit hour_of_day day_of_week_str average_score
0 television 0 Wednesday 243.692586
91 movies 0 Friday 136.785370
109 television 0 Saturday 215.709350
139 television 0 Friday 233.622340
156 movies 0 Wednesday 97.904864
In [34]:
df_daily_weekly['normalized_count'] = df_daily_weekly.apply(custom_divide, axis=1)
In [244]:
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}
fig = px.sunburst(df_daily_weekly, 
                path=['subreddit', 'day_of_week_str', 'hour_of_day'], 
                values='count', 
                color='subreddit',
                color_discrete_map=color_map,
                title="Distribution of posts across Days of Week and Hours of Day")

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)

fig.show()
fig.write_html(f"../../data/plots/sunburst_eda.html")
In [243]:
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',  
    'television': '#ffe100' 
}
fig = px.sunburst(df_daily_weekly_avgscore, 
                path=['subreddit', 'day_of_week_str', 'hour_of_day'], 
                values='average_score', 
                color='subreddit',
                color_discrete_map=color_map,
                title="Average scores of posts across Days of Week and Hours of Day")

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)

fig.show()
fig.write_html(f"../../data/plots/sunburst_avgscore_eda.html")

Analysis of post counts for Authors with highest score¶

In [254]:
df_top_posts_scores = pd.read_csv(f"../../data/csv/top_author_score_postcount_eda.csv")
df_top_posts_scores.head()
Out[254]:
Unnamed: 0 subreddit title num_comments selftext author score rank count
0 0 anime "Berserk" creator Kentaro Miura dead at 54 1762 NaN enterthedragonpunch 33384 1 1
1 1 anime Who will be the first seed in Best Girl 8? 619 Hi everyone, we are currently trialing a new f... mpp00 31830 2 241
2 2 anime Best Girl 9 Prediction Tournament! 264 NaN mpp00 30302 3 241
3 3 anime The Devil is a Part-Timer Season 2 Announced! 2486 NaN Srikkk 30213 4 32
4 4 anime "Spice and Wolf" New Anime Announced 1897 NaN dorkmax_executives 29222 5 304
In [255]:
df_top_posts_scores_post_count = pd.read_csv(f"../../data/csv/top_author_score_postcount_eda.csv")
df_top_posts_scores_post_count
Out[255]:
Unnamed: 0 subreddit title num_comments selftext author score rank count
0 0 anime "Berserk" creator Kentaro Miura dead at 54 1762 NaN enterthedragonpunch 33384 1 1
1 1 anime Who will be the first seed in Best Girl 8? 619 Hi everyone, we are currently trialing a new f... mpp00 31830 2 241
2 2 anime Best Girl 9 Prediction Tournament! 264 NaN mpp00 30302 3 241
3 3 anime The Devil is a Part-Timer Season 2 Announced! 2486 NaN Srikkk 30213 4 32
4 4 anime "Spice and Wolf" New Anime Announced 1897 NaN dorkmax_executives 29222 5 304
5 5 anime 'Konosuba' Season 3 Announced 993 NaN RobotiSC 28555 6 519
6 6 anime "JoJo's Bizarre Adventure Part 6" Anime Announced 1334 NaN Lovro26 27942 7 1450
7 7 anime Konosuba | New Anime Key Visual (HQ) 762 NaN MarvelsGrantMan136 27634 8 3480
8 8 anime One Punch Man Season 3 Announced 1264 NaN Turbostrider27 27156 9 2235
9 9 anime Shingeki no Kyojin: The Final Season - Episode... 6434 *Shingeki no Kyojin: The Final Season*, episod... AutoLovepon 26579 10 6982
10 10 television Former 'Reading Rainbow' host LeVar Burton wan... 2903 NaN esporx 61389 1 141
11 11 television LeVar Burton wants Jeopardy producers to know ... 1513 NaN chanma50 45599 2 1662
12 12 television For the Love of God, Let LeVar Burton Host Jeo... 1602 NaN manskies 45499 3 2
13 13 television ‘Futurama’ Revival Ordered at Hulu With Multip... 2533 NaN chanma50 40529 4 1662
14 14 television ‘Mindhunter’ Director Urges Fans to Make Noise... 1149 NaN MarvelsGrantMan136 39701 5 3480
15 15 television Jessica Walter Dies: Emmy-Winning ‘Arrested De... 1746 NaN chanma50 39183 6 1662
16 16 television Biden Inauguration Captures Bigger Audience Th... 2038 NaN chanma50 38453 7 1662
17 17 television Terry Crews Receives A Star On The Hollywood W... 661 NaN Gato1980 35198 8 210
18 18 television Pedro Pascal To Star As Joel In ‘The Last Of U... 2671 NaN chanma50 34814 9 1662
19 19 television Conan O’Brien Deserved Better. One of the most... 1851 NaN Samoht99 34665 10 1180
20 20 movies Hi, I’m Keanu Reeves, AMA 33376 NaN lionsgate 282232 1 9
21 21 movies Hi, I’m Tobey Maguire, actor/executive produce... 17793 NaN officialtobeymaguire 192782 2 1
22 22 movies Hello, I’m Nicolas Cage and welcome to Ask Me ... 26670 NaN lionsgate 189542 3 9
23 23 movies Please Bring Back Voice Actors, Stop Celebrity... 5191 NaN fungobat 137551 4 82
24 24 movies Brendan Fraser Wins Academy Award for Best Act... 3290 NaN MarvelsGrantMan136 109148 5 3480
25 25 movies ‘Dune’ Sequel Greenlit By Legendary For Exclus... 6559 NaN CosmicBlazeKnight 108958 6 2
26 26 movies Guy On Doomed Planet Mostly Concerned With Ski... 5291 NaN uxhelpneeded 103672 7 1
27 27 movies Gilbert Gottfried, Comedian and ‘Aladdin’ Star... 4875 NaN chanma50 103500 8 1662
28 28 movies WillSmith Banned from Attending Oscars Ceremon... 10783 NaN MarvelsGrantMan136 101136 9 3480
29 29 movies Robbie Coltrane, Comic Performer Who Played Ha... 2467 NaN MarvelsGrantMan136 94867 10 3480
In [247]:
movies_df = df_top_posts_scores_post_count.sort_values(by='score', ascending=False)
movies_df =movies_df.head(15)
movies_df
Out[247]:
Unnamed: 0 subreddit title num_comments selftext author score rank count
20 20 movies Hi, I’m Keanu Reeves, AMA 33376 NaN lionsgate 282232 1 9
21 21 movies Hi, I’m Tobey Maguire, actor/executive produce... 17793 NaN officialtobeymaguire 192782 2 1
22 22 movies Hello, I’m Nicolas Cage and welcome to Ask Me ... 26670 NaN lionsgate 189542 3 9
23 23 movies Please Bring Back Voice Actors, Stop Celebrity... 5191 NaN fungobat 137551 4 82
24 24 movies Brendan Fraser Wins Academy Award for Best Act... 3290 NaN MarvelsGrantMan136 109148 5 3480
25 25 movies ‘Dune’ Sequel Greenlit By Legendary For Exclus... 6559 NaN CosmicBlazeKnight 108958 6 2
26 26 movies Guy On Doomed Planet Mostly Concerned With Ski... 5291 NaN uxhelpneeded 103672 7 1
27 27 movies Gilbert Gottfried, Comedian and ‘Aladdin’ Star... 4875 NaN chanma50 103500 8 1662
28 28 movies WillSmith Banned from Attending Oscars Ceremon... 10783 NaN MarvelsGrantMan136 101136 9 3480
29 29 movies Robbie Coltrane, Comic Performer Who Played Ha... 2467 NaN MarvelsGrantMan136 94867 10 3480
10 10 television Former 'Reading Rainbow' host LeVar Burton wan... 2903 NaN esporx 61389 1 141
11 11 television LeVar Burton wants Jeopardy producers to know ... 1513 NaN chanma50 45599 2 1662
12 12 television For the Love of God, Let LeVar Burton Host Jeo... 1602 NaN manskies 45499 3 2
13 13 television ‘Futurama’ Revival Ordered at Hulu With Multip... 2533 NaN chanma50 40529 4 1662
14 14 television ‘Mindhunter’ Director Urges Fans to Make Noise... 1149 NaN MarvelsGrantMan136 39701 5 3480
In [256]:
#creating new dataframes according to the subreddit category
movie_author_df = df_top_posts_scores_post_count[df_top_posts_scores_post_count['subreddit'] == 'movies'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
tv_author_df = df_top_posts_scores_post_count[df_top_posts_scores_post_count['subreddit'] == 'television'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
anime_author_df = df_top_posts_scores_post_count[df_top_posts_scores_post_count['subreddit'] == 'anime'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
In [259]:
# Add Annotations
movie_annotations = [
    dict(x="lionsgate",
         y=9,
         xref="x", yref="y",
         text="",
         ax=0, ay=-40, showarrow=True,
         arrowhead=1),
    dict(x="CosmicBlazeKnight",
         y=2,
         xref="x", yref="y",
         text="",
         ax=0, ay=-40, showarrow=True,
         arrowhead=1),
    dict(x="officialtobeymaguire",
         y=1,
         xref="x", yref="y",
         text="Due to the limited number of posts, bars for the pointed authors may be hard to spot",
         ax=0, ay=-50, showarrow=True,
         arrowhead=1),
    dict(x="uxhelpneeded",
         y=1,
         xref="x", yref="y",
         text="",
         ax=0, ay=-40, showarrow=True,
         arrowhead=1),
]
tv_annotations = [
    dict(x="manskies",
         y=2,
         xref="x", yref="y",
         text="Since the Post Count is low, we can barely see the bar",
         ax=0, ay=-40, showarrow=True,
         arrowhead=1),
]
anime_annotations = [
   dict(x="Srikkk",
         y=32,
         xref="x", yref="y",
         text="Since the Post Count is low, we can barely see the bars",
         ax=0, ay=-50, showarrow=True,
         arrowhead=1),
    dict(x="enterthedragonpunch",
         y=1,
         xref="x", yref="y",
         text="",
         ax=0, ay=-40, showarrow=True,
         arrowhead=1),
]

# Create figure
fig = go.Figure()

# Add surface trace
# Add bar trace for movie_author_df
fig.add_trace(go.Bar(x=movie_author_df['author'], y=movie_author_df['count'], name='Movies', visible=True, marker_color='#FF4301',hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Add bar trace for tv_author_df
fig.add_trace(go.Bar(x=tv_author_df['author'], y=tv_author_df['count'], name='TV Shows', visible=False, marker_color='#ffe100',hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Add bar trace for anime_author_df
fig.add_trace(go.Bar(x=anime_author_df['author'], y=anime_author_df['count'], name='Anime', visible=False,  marker_color='#ff9200',hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Update layout for better visualization
fig.update_layout(
    plot_bgcolor='white',  # Set background color to white
     xaxis=dict(title_text="Author"), # Set x-axis line color
     yaxis=dict(

        title_text="Post Count",
        # range=[0, max(movie_author_df['count'].max(), tv_author_df['count'].max(), anime_author_df['count'].max()) + 1000]
    ),
    annotations=movie_annotations,
    updatemenus=[
       
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Post count of authors with top scores for movies subreddit","annotations": movie_annotations}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Post count of authors with top scores for television subreddit","annotations": tv_annotations}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Post count of authors with top scores for anime subreddit","annotations": anime_annotations}]),
               #  dict(label="All",
               #       method="update",
               #       args=[{"visible": [True, True, True]},
               #             {"title": "Top 10 active authors across all 3 subreddits"}]),
            ]),
            x=0.9,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.25,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ])

# Set title
fig.update_layout(title_text="Post count of authors with top scores for movies subreddit")
fig.update_layout(
    title_x=0.5, # Centering the title
)

fig.show()

fig.write_html(f"../../data/plots/top10_authorscore_postcount_eda.html")

Top Authors with Top Comments¶

In [250]:
authors_with_top_comments_post_counts = pd.read_csv("../../data/csv/authors_with_top_comments_post_counts.csv")
In [251]:
#creating new dataframes for each subreddit category
movie_author_df = authors_with_top_comments_post_counts[authors_with_top_comments_post_counts['subreddit'] == 'movies'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
tv_author_df = authors_with_top_comments_post_counts[authors_with_top_comments_post_counts['subreddit'] == 'television'].sort_values(by="count", ascending=False)[["author", "count", "num_comments"]].drop_duplicates()
tv_author_df = tv_author_df.nlargest(10, 'num_comments')
tv_author_df.drop(columns=['num_comments'], axis=1, inplace=True)
tv_author_df = tv_author_df.sort_values(by = "count", ascending = False)
anime_author_df = authors_with_top_comments_post_counts[authors_with_top_comments_post_counts['subreddit'] == 'anime'].sort_values(by="count", ascending=False)[["author", "count"]].drop_duplicates()
In [253]:
# Add Annotations
movie_annotations = [
    dict(x="LETS_MAKE_IT_AWKWARD",
         y=1,
         xref="x", yref="y",
         text="Since the Post Count is low, we can barely see the bars",
         ax=0, ay=-50, showarrow=True,
         arrowhead=1),
    dict(x="officialtobeymaguire",
         y=1,
         xref="x", yref="y",
         text="",
         ax=0, ay=-40, showarrow=True,
         arrowhead=1)
]
tv_annotations = [
    dict(x="ewzetf",
         y=3,
         xref="x", yref="y",
         text="",
         ax=0, ay=-40, showarrow=True,
         arrowhead=1),
    dict(x="Midnight_Oil_",
         y=1,
         xref="x", yref="y",
         text="Since the Post Count is low, we can barely see the bars",
         ax=0, ay=-50, showarrow=True,
         arrowhead=1),
    dict(x="thetanhausergate",
         y=1,
         xref="x", yref="y",
         text="",
         ax=0, ay=-40, showarrow=True,
         arrowhead=1),
]
anime_annotations = [
    dict(x="AnimeMod",
         y=1,
         xref="x", yref="y",
         text="",
         ax=0, ay=0),
]

# Create figure
fig = go.Figure()

# Add surface trace
# Add bar trace for movie_author_df
fig.add_trace(go.Bar(x=movie_author_df['author'], y=movie_author_df['count'], name='Movies', visible=True, marker_color='#FF4301',hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Add bar trace for tv_author_df
fig.add_trace(go.Bar(x=tv_author_df['author'], y=tv_author_df['count'], name='TV Shows', visible=False, marker_color='#ffe100',hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Add bar trace for anime_author_df
fig.add_trace(go.Bar(x=anime_author_df['author'], y=anime_author_df['count'], name='Anime', visible=False,  marker_color='#ff9200',hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Update layout for better visualization
fig.update_layout(
    plot_bgcolor='white',  # Set background color to white
     xaxis=dict(title_text="Author"), # Set x-axis line color
     yaxis=dict(

        title_text="Post Count",
        # range=[0, max(movie_author_df['count'].max(), tv_author_df['count'].max(), anime_author_df['count'].max()) + 1000]
    ),
    annotations=movie_annotations,
    updatemenus=[
       
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Post counts of authors with top comments for movies subreddit","annotations": movie_annotations}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Post counts of authors with top comments for television subreddit","annotations": tv_annotations}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Post counts of authors with top comments for anime subreddit","annotations": anime_annotations}]),
            ]),
            x=0.9,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.25,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ])

# Set title
fig.update_layout(title_text="Post counts of authors with top comments for movies subreddit")
# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)

fig.show()

fig.write_html(f"../../data/plots/Top_10_authors_with_top_comments.html")

Table to display missing values in dataset¶

In [3]:
# Loading the dataset 
df = pd.read_csv("../../data/csv/num_missing_val.csv") 
df.head()
Out[3]:
Column Missing Values
0 subreddit 0
1 author 0
2 title 0
3 selftext 0
4 created_utc 0
In [4]:
#renaming the columns
df.rename(columns={'Column': 'Column Name'}, inplace=True)
In [183]:
# Sort the DataFrame by 'Missing Values' in descending order
df_sorted = df.sort_values(by='Missing Values', ascending=False)

#creating table
fig = go.Figure(data=[go.Table(
    header=dict(values=list(df_sorted.columns),
                fill_color='#FF4301',
                font=dict(color='white'),  # Set font color for header
                align='left'),
    cells=dict(values=[df_sorted['Column Name'], df_sorted['Missing Values']],
               fill_color='#ececec', #setting font color for rows
               align='left'))
])
fig.update_layout(title=dict(text="Distribution of missing values"))
# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)
# Exporting the figure to a html file
fig.write_html("../../data/plots/table_missing_values.html")
fig.show()

Lollipop plot to display the Subreddit count¶

In [14]:
# Loading the dataset 
df_subreddit = pd.read_csv("../../data/csv/subreddit_count.csv") 
df_subreddit
Out[14]:
Unnamed: 0 subreddit count
0 0 anime 404298
1 1 television 89586
2 2 televisionsuggestions 7991
3 3 movies 382085
4 4 Animesuggest 74101
5 5 MovieSuggestions 58907
In [15]:
#renaming the subreddits
df_subreddit =df_subreddit.replace("anime", "r/anime")
df_subreddit =df_subreddit.replace("television", "r/television")
df_subreddit =df_subreddit.replace("televisionsuggestions", "r/televisionsuggestions")
df_subreddit =df_subreddit.replace("movies", "r/movies")
df_subreddit =df_subreddit.replace("Animesuggest", "r/Animesuggest")
df_subreddit =df_subreddit.replace("MovieSuggestions", "r/MovieSuggestions")
In [37]:
# Reorder it based on the values:
ordered_df = df_subreddit.sort_values(by='count')
my_range=range(1,len(df_subreddit.index)+1)

# Horizontal version
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['count'], color='lightgrey')
plt.plot(ordered_df['count'], my_range, "D", markerfacecolor='#FF4301', markeredgecolor='#FF4301')
plt.yticks(my_range, ordered_df['subreddit'])
plt.gca().get_xaxis().set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x / 1000)}K'))
plt.xlim(0, ordered_df['count'].max() + 5000) 

# Annotating count numbers over the diamonds and pushing them more to the left
for i, count in zip(my_range, ordered_df['count']):
    plt.annotate(f'{int(count):,}', xy=(count + 5300, i), ha='left', va='center', fontsize=10, color='black')

# Adding  x-axis and y-axis labels
plt.xlabel('Count', fontsize=13)  
plt.ylabel('Subreddit', fontsize=13)  

# Setting background color to none
plt.gca().set_facecolor('none')

# Remove borders
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Add caption at the top
plt.text(0.5, 1.05, 'Subreddit Counts Analysis', ha='center', va='center', fontsize=16,transform=plt.gca().transAxes)

# Save the figure as a PNG file
plt.savefig("../../data/plots/subreddit_count_analysis.png", bbox_inches='tight', dpi=300)

plt.show()

Plot for authors with top post¶

In [19]:
# load dataset
author_df = pd.read_csv("../../data/csv/author_eda.csv")
author_df
Out[19]:
Unnamed: 0 author subreddit count rank
0 1 AutoLovepon anime 6982 2
1 2 Turbostrider27 anime 2215 3
2 3 Gvostfr anime 1677 4
3 4 Lovro26 anime 1450 5
4 5 inspyral anime 787 6
5 6 AnimeMod anime 755 7
6 7 SuperAlloyBerserker anime 595 8
7 8 Shimmering-Sky anime 557 9
8 9 RobotiSC anime 517 10
9 10 SorcererOfTheLake anime 441 11
10 12 wednesdaygiftinfo movies 24222 2
11 13 allthebestmovies movies 10353 3
12 14 Alternative-Bat-2458 movies 3538 4
13 15 Ornery-Control-9474 movies 3475 5
14 16 Sisiwakanamaru movies 1861 6
15 17 MovieBattleGame movies 1471 7
16 18 MarvelsGrantMan136 movies 1362 8
17 19 chanma50 movies 939 9
18 20 Samoht99 movies 842 10
19 21 indig0sixalpha movies 820 11
20 23 MarvelsGrantMan136 television 1960 2
21 24 Sisiwakanamaru television 1139 3
22 25 klutzysunshine television 966 4
23 26 chanma50 television 723 5
24 27 misana123 television 663 6
25 28 PetyrDayne television 657 7
26 29 Neo2199 television 602 8
27 30 indig0sixalpha television 564 9
28 31 GroundbreakingSet187 television 436 10
29 32 DemiFiendRSA television 403 11
In [30]:
#creating dataframes according to the subreddit category
movie_author_df = author_df[author_df['subreddit'] == 'movies']
tv_author_df = author_df[author_df['subreddit'] == 'television']
anime_author_df = author_df[author_df['subreddit'] == 'anime']
movie_author_df
Out[30]:
Unnamed: 0 author subreddit count rank
10 12 wednesdaygiftinfo movies 24222 2
11 13 allthebestmovies movies 10353 3
12 14 Alternative-Bat-2458 movies 3538 4
13 15 Ornery-Control-9474 movies 3475 5
14 16 Sisiwakanamaru movies 1861 6
15 17 MovieBattleGame movies 1471 7
16 18 MarvelsGrantMan136 movies 1362 8
17 19 chanma50 movies 939 9
18 20 Samoht99 movies 842 10
19 21 indig0sixalpha movies 820 11
In [36]:
# Create figure
fig = go.Figure()

# Add bar trace for movie_author_df
fig.add_trace(go.Bar(x=movie_author_df['author'], y=movie_author_df['count'], name='Movies', visible=True, marker_color='#FF4301'
                            ,hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Add bar trace for tv_author_df
fig.add_trace(go.Bar(x=tv_author_df['author'], y=tv_author_df['count'], name='TV Shows', visible=False, marker_color='#ffe100'
                     ,hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Add bar trace for anime_author_df
fig.add_trace(go.Bar(x=anime_author_df['author'], y=anime_author_df['count'], name='Anime', visible=False,  marker_color='#ff9200',hovertemplate = "Author: %{x} <br>Post Count: %{y}"))

# Update layout with customization
fig.update_layout(
    plot_bgcolor='white',  # Set background color to white
     xaxis=dict(title_text="Author"), # Set x-axis line color
     yaxis=dict(
         title_text="Post Count",
    ),
    updatemenus=[
       
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 active authors for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 active authors for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 active authors for anime subreddit"}]),
               
            ]),
            x=0.85,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.25,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ])

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)

# Set title
fig.update_layout(title_text="Top 10 active authors for movies subreddit")
fig.write_html("../../data/plots/Top_author_post.html")
fig.show()

Table for posts with top comments¶

In [37]:
#import data
anime_comments_df = pd.read_csv("../../data/csv/top_comments_anime.csv")
movies_comments_df = pd.read_csv("../../data/csv/top_comments_movies.csv")
tvshows_comments_df = pd.read_csv("../../data/csv/top_comments_tv_show.csv")
In [38]:
#remove rows with [deleted] author
movies_comments_df = movies_comments_df[movies_comments_df['author'] != '[deleted]']
# creating a new column "content"
movies_comments_df["selftext"].fillna(" ", inplace=True)
movies_comments_df["Content"] = "Title: " + movies_comments_df["title"] + "Body: " + movies_comments_df["selftext"]
#Selecting columns and display top 10 records
selected_columns = ["Content", "num_comments","author"]
movies_comments_df = movies_comments_df[selected_columns].head(10)
movies_comments_df
Out[38]:
Content num_comments author
0 Title: Name a single movie, where the sequel o... 35446 dpemerson76
1 Title: Hi, I’m Keanu Reeves, AMABody: 33376 lionsgate
2 Title: Official Discussion - Zack Snyder's Jus... 30350 LiteraryBoner
3 Title: 1 Teen Dead, Another Critically Injured... 28664 prsnreddit
4 Title: Hello, I’m Nicolas Cage and welcome to ... 26670 lionsgate
5 Title: Official Oscars Discussion Thread 2022B... 22097 LiteraryBoner
6 Title: Official Discussion - Spider-Man: No Wa... 21419 LiteraryBoner
8 Title: Official Discussion - Avatar: The Way o... 19888 LiteraryBoner
9 Title: Official Oscars Discussion Thread 2023B... 18380 LiteraryBoner
10 Title: Hi, I’m Tobey Maguire, actor/executive ... 17793 officialtobeymaguire
In [39]:
#remove rows with [deleted] author
anime_comments_df = anime_comments_df[anime_comments_df['author'] != '[deleted]']
# creating a new column content
anime_comments_df["selftext"].fillna(" ", inplace=True)
anime_comments_df["Content"] = "Title: " + anime_comments_df["title"] + "Body: " + anime_comments_df["selftext"]
#Select columns and display top 10 records
selected_columns = ["Content", "num_comments","author"]
anime_comments_df = anime_comments_df[selected_columns].head(10)
anime_comments_df
Out[39]:
Content num_comments author
0 Title: Casual Discussion Fridays - Week of Mar... 18528 AnimeMod
1 Title: Casual Discussion Fridays - Week of Apr... 18074 AnimeMod
2 Title: Casual Discussion Fridays - Week of Apr... 16970 AnimeMod
3 Title: Casual Discussion Fridays - Week of Mar... 16651 AnimeMod
4 Title: Casual Discussion Fridays - Week of Jun... 16258 AutoModerator
5 Title: Casual Discussion Fridays - Week of Apr... 16229 AnimeMod
6 Title: Casual Discussion Fridays - Week of Apr... 16104 AnimeMod
7 Title: Casual Discussion Fridays - Week of Apr... 16084 AnimeMod
8 Title: Casual Discussion Fridays - Week of Sep... 15984 AutoModerator
9 Title: Casual Discussion Fridays - Week of Jan... 15277 AnimeMod
In [40]:
#remove rows with [deleted] author
tvshows_comments_df = tvshows_comments_df[tvshows_comments_df['author'] != '[deleted]']
# creating a new column content
tvshows_comments_df["selftext"].fillna(" ", inplace=True)
tvshows_comments_df["Content"] = "Title: " + tvshows_comments_df["title"] + "Body: " + tvshows_comments_df["selftext"]
#Select columns and display top 10 records
selected_columns = ["Content", "num_comments","author"]
tvshows_comments_df = tvshows_comments_df[selected_columns].head(10)
tvshows_comments_df
Out[40]:
Content num_comments author
0 Title: Will Smith Slaps Chris Rock at The Osca... 9332 Midnight_Oil_
1 Title: Dave Chappelle Lands Emmy Nomination fo... 7589 Neo2199
2 Title: ‘House of the Dragon’ Star Steve Toussa... 6660 overvivideo
3 Title: GLAAD condemns Dave Chappelle, Netflix ... 6550 LarryPeru
4 Title: Dave Chappelle Calls Kids Who Dared Cri... 5976 inthetownwhere
5 Title: ‘Cowboy Bebop’ Canceled By Netflix Afte... 5974 MarvelsGrantMan136
6 Title: What color is an elf? Or a Sea Snake? A... 5828 ewzetf
7 Title: Gina Carano Star Wars: She is No Longer... 5745 thetanhausergate
8 Title: Netflix Co-CEO Ted Sarandos Defends Dav... 5740 Neo2199
9 Title: The Last of Us - Series Premiere Discus... 5721 NicholasCajun
In [186]:
selected_columns_movies = ['Content', 'num_comments', 'author']
data_values_movies = [movies_comments_df[col].tolist() for col in selected_columns_movies]

selected_columns_tv = ['Content', 'num_comments', 'author']
data_values_tv = [tvshows_comments_df[col].tolist() for col in selected_columns_tv]

selected_columns_anime = ['Content', 'num_comments', 'author']
data_values_anime = [anime_comments_df[col].tolist() for col in selected_columns_anime]

# Specify the new header names
header_values = ['Content', 'Number of Comments', 'Author']

header_color="#FF4301"
body_color="#ececec"
font_color = "white"
font_color_cell = "black"
# Create figure
fig = go.Figure()

# Add surface trace
# Add bar trace for movie_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_movies, fill_color=body_color, font=dict(color=font_color_cell)), 
                       columnwidth=[2.5, 0.25, 0.4]))

# Add bar trace for tv_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_tv, fill_color=body_color, font=dict(color=font_color_cell)),
                       visible=False, 
                       columnwidth=[2.5, 0.25, 0.4]))

# Add bar trace for anime_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_anime, fill_color=body_color, font=dict(color=font_color_cell)),
                       visible=False, 
                       columnwidth=[2.5, 0.25, 0.4]))

# Update layout for better visualization
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 posts having maximum comments for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 posts having maximum comments for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 posts having maximum comments for anime subreddit"}]),]),
            x=0.85,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.05,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ],
    margin=dict(l=10, r=10, t=15, b=20),  # Adjust margins
    height=1000  # Adjust height
    )

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)

# Set title
fig.update_layout(title_text="Top 10 post having maximum comments for movies subreddit")

# Export the figure to a html file
fig.write_html("../../data/plots/table_top_comments.html")
fig.show()

Table for posts with top score¶

In [62]:
#import data
scores_df = pd.read_csv("../../data/csv/top_author_score_postcount_eda.csv")
scores_df.head()
Out[62]:
Unnamed: 0 subreddit title num_comments selftext author score rank count
0 0 anime "Berserk" creator Kentaro Miura dead at 54 1762 NaN enterthedragonpunch 33384 1 1
1 1 anime Who will be the first seed in Best Girl 8? 619 Hi everyone, we are currently trialing a new f... mpp00 31830 2 241
2 2 anime Best Girl 9 Prediction Tournament! 264 NaN mpp00 30302 3 241
3 3 anime The Devil is a Part-Timer Season 2 Announced! 2486 NaN Srikkk 30213 4 32
4 4 anime "Spice and Wolf" New Anime Announced 1897 NaN dorkmax_executives 29222 5 304
In [63]:
# creating a new column "content"
scores_df["selftext"].fillna(" ", inplace=True)
scores_df["Content"] = "Title: " + scores_df["title"] + "Body: " + scores_df["selftext"]
#Select columns and display top 10 records
selected_columns = ["Content", "score","author","subreddit"]
scores_df = scores_df[selected_columns]
scores_df.head()
Out[63]:
Content score author subreddit
0 Title: "Berserk" creator Kentaro Miura dead at... 33384 enterthedragonpunch anime
1 Title: Who will be the first seed in Best Girl... 31830 mpp00 anime
2 Title: Best Girl 9 Prediction Tournament! Body: 30302 mpp00 anime
3 Title: The Devil is a Part-Timer Season 2 Anno... 30213 Srikkk anime
4 Title: "Spice and Wolf" New Anime AnnouncedBod... 29222 dorkmax_executives anime
In [64]:
#making dataframes according to each reddit category
movie_score_df = scores_df[scores_df['subreddit'] == 'movies']
tv_score_df = scores_df[scores_df['subreddit'] == 'television']
anime_score_df = scores_df[scores_df['subreddit'] == 'anime']
In [187]:
selected_columns_movies = ['Content', 'score', 'author']
data_values_movies = [movie_score_df[col].tolist() for col in selected_columns_movies]

selected_columns_tv = ['Content', 'score', 'author']
data_values_tv = [tv_score_df[col].tolist() for col in selected_columns_tv]

selected_columns_anime = ['Content', 'score', 'author']
data_values_anime = [anime_score_df[col].tolist() for col in selected_columns_anime]

# Specify the new header names
header_values = ['Content', 'Score', 'Author']

header_color="#FF4301"
body_color="#ececec"
font_color = "white"
font_color_cell="black"
# Create figure
fig = go.Figure()

# Add trace for movie_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_movies, fill_color=body_color,font=dict(color=font_color_cell)), 
                       columnwidth=[2, 0.25, 0.45]))

# Add  trace for tv_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_tv, fill_color=body_color,font=dict(color=font_color_cell)),
                       visible=False, 
                       columnwidth=[2, 0.25, 0.45]))

# Add  trace for anime_author_df
fig.add_trace(go.Table(header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)), 
                       cells=dict(values=data_values_anime, fill_color=body_color,font=dict(color=font_color_cell)),
                       visible=False, 
                       columnwidth=[2.5, 0.2, 0.45]))

# Update layout for better visualization
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=list([
                 dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 posts having maximum score for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 posts having maximum score for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 posts having maximum score for anime subreddit"}]),]),
            x=0.85,  # Adjusted the dropdown position to the top
            xanchor='left',  # Anchored the dropdown to the left
            y=1.15,  # Adjusted the dropdown position to the top
            yanchor='top'  # Anchored the dropdown to the top
        )
    ],
    margin=dict(l=20, r=20, t=15, b=20),  # Adjust margins
    height=350  # Adjust height
    )

# Adjusting the layout
fig.update_layout(
    title_x=0.5, # Centering the title
)

# Set title
fig.update_layout(title_text="Top 10 Post having maximum score for movies subreddit")

# Export the figure to a html file
fig.write_html("../../data/plots/table_top_score.html")

fig.show()

External Dataset Table¶

In [54]:
df_external_movies = pd.read_csv("../../data/csv/best_movies_netflix_ext.csv") 
df_external_movies.head()
Out[54]:
TITLE RELEASE_YEAR SCORE NUMBER_OF_VOTES DURATION MAIN_GENRE MAIN_PRODUCTION
0 David Attenborough: A Life on Our Planet 2020 9.0 31180 83 documentary GB
1 Inception 2010 8.8 2268288 148 scifi GB
2 Forrest Gump 1994 8.8 1994599 142 drama US
3 Anbe Sivam 2003 8.7 20595 160 comedy IN
4 Bo Burnham: Inside 2021 8.7 44074 87 comedy US
In [55]:
df_external_shows = pd.read_csv("../../data/csv/best_shows_netflix_ext.csv") 
df_external_movies.head()
Out[55]:
TITLE RELEASE_YEAR SCORE NUMBER_OF_VOTES DURATION MAIN_GENRE MAIN_PRODUCTION
0 David Attenborough: A Life on Our Planet 2020 9.0 31180 83 documentary GB
1 Inception 2010 8.8 2268288 148 scifi GB
2 Forrest Gump 1994 8.8 1994599 142 drama US
3 Anbe Sivam 2003 8.7 20595 160 comedy IN
4 Bo Burnham: Inside 2021 8.7 44074 87 comedy US