# Importing the libraries
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
from pyspark.sql.functions import length
from pyspark.sql import functions as F
# Location of the filtered Reddit submissions inside the workspace blob store.
datastore = 'azureml://datastores/workspaceblobstore/paths/'
submissions_path = 'filtered-submissions'
submissions_df = spark.read.parquet(f"{datastore}{submissions_path}")

# Build the analysis frame in a single pipeline: pick the columns of interest,
# derive length/time/media features, then drop the columns no longer needed.
df = (
    submissions_df.select(
        "subreddit",
        "author",
        "title",
        "selftext",
        "created_utc",
        "num_comments",
        "score",
        "over_18",
        "media",
        "pinned",
        "locked",
        "disable_comments",
        "domain",
        "hidden",
        "distinguished",
        "hide_score",
    )
    # Post length = characters in the title plus characters in the body.
    .withColumn('post_length', F.length(F.col('title')) + F.length(F.col('selftext')))
    .withColumn('created_utc', F.to_timestamp('created_utc'))
    # Time-based features derived from the creation timestamp.
    .withColumn('hour_of_day', F.hour('created_utc'))
    # dayofweek yields 1 (Sunday) through 7 (Saturday).
    .withColumn('day_of_week', F.dayofweek('created_utc'))
    # Spell out the numeric weekday as its English name.
    .withColumn(
        'day_of_week_str',
        F.expr(
            "\nCASE day_of_week\n"
            "WHEN 1 THEN 'Sunday'\n"
            "WHEN 2 THEN 'Monday'\n"
            "WHEN 3 THEN 'Tuesday'\n"
            "WHEN 4 THEN 'Wednesday'\n"
            "WHEN 5 THEN 'Thursday'\n"
            "WHEN 6 THEN 'Friday'\n"
            "WHEN 7 THEN 'Saturday'\n"
            "END\n"
        ),
    )
    .withColumn('day_of_month', F.dayofmonth('created_utc'))
    .withColumn('month', F.month('created_utc'))
    .withColumn('year', F.year('created_utc'))
    # A post "has media" whenever its media field is populated.
    .withColumn('has_media', F.col('media').isNotNull())
    .drop("media", "disable_comments", "distinguished")
)
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 108, 9, Finished, Available)
# Output folders for rendered plots and exported CSV summaries; both live
# under the project's data directory.
PLOT_DIR, CSV_DIR = (
    os.path.join("Users/sk2224/fall-2023-reddit-project-team-34/data", leaf)
    for leaf in ("plots", "csv")
)
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 105, 8, Finished, Available)
# Engagement columns for the three main subreddits, collected into pandas.
df_plotly = df.select(
    ["subreddit", "num_comments", "score", "has_media", "post_length"]
).filter(F.col("subreddit").isin('movies', 'anime', 'television'))
df_plotly_pd = df_plotly.toPandas()

# Subreddit -> marker colour.
color_map = {'movies': '#FF4301', 'anime': '#ff9200', 'television': '#ffe100'}

# Bubble chart: score on x, post length on y, bubble area scaled by comments.
fig = px.scatter(
    df_plotly_pd,
    x='score',
    y='post_length',
    size='num_comments',
    color='subreddit',
    color_discrete_map=color_map,
    title='Engagement Dynamics of Reddit Posts Across Entertainment Subreddits',
    labels={
        'num_comments': 'Number of Comments',
        'score': 'Score',
        'subreddit': 'Subreddit',
        'post_length': 'Post Length',
    },
)

# White canvas and fixed axis windows.
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
fig.update_xaxes(range=[0, 50000])
fig.update_yaxes(range=[0, 8000])

fig.show()
fig.write_html(f"{PLOT_DIR}/engagement_eda.html")
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 48, 9, Finished, Available)
# Plotly matches discrete colours on string values, so stringify the flag.
df_plotly_pd["has_media"] = df_plotly_pd["has_media"].astype(str)

# Stringified has_media flag -> marker colour.
color_map = {'False': '#FF4301', 'True': '#ffe100'}

# Same bubble chart as before, coloured by media presence instead of subreddit.
fig = px.scatter(
    df_plotly_pd,
    x='score',
    y='post_length',
    size='num_comments',
    size_max=50,  # cap on the largest bubble
    color='has_media',
    color_discrete_map=color_map,
    title='Engagement Dynamics of Reddit Posts with and without Media',
    labels={
        'num_comments': 'Number of Comments',
        'score': 'Score',
        'has_media': 'Has Media',
        'post_length': 'Post Length',
    },
)

# White canvas, centred title, fixed axis windows.
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white', title_x=0.5)
fig.update_xaxes(range=[0, 50000])
fig.update_yaxes(range=[0, 8000])

fig.show()
fig.write_html("../../data/plots/engagement_with_media.html")
# Number of posts for every (subreddit, has_media) combination.
grouped_data = (
    df_plotly_pd.groupby(['subreddit', 'has_media'])
    .size()
    .reset_index(name='count')
)

# Share of each combination within its subreddit, as a percentage rounded to
# two decimals, plus a ready-to-print "%" label for the bars.
total_counts = grouped_data.groupby('subreddit')['count'].transform('sum')
grouped_data['percentage'] = (grouped_data['count'] / total_counts * 100).round(2)
grouped_data['percentage_text'] = grouped_data['percentage'].astype(str) + '%'

# Stringified has_media flag -> bar colour.
color_map = {'False': '#FF4301', 'True': '#ffe100'}

# Side-by-side bars per subreddit: with media vs. without media.
fig = px.bar(
    grouped_data,
    x='subreddit',
    y='percentage',
    text='percentage_text',
    color='has_media',
    color_discrete_map=color_map,
    barmode='group',
    labels={'percentage': 'Percentage of Posts', 'subreddit': 'Subreddit'},
    title='Percentage of Posts with and without Media per Subreddit',
)

# White canvas and centred title.
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white', title_x=0.5)

fig.show()
fig.write_html("../../data/plots/percentage_of_posts_with_media_per_subreddit.html")
# Load the post counts per subreddit and calendar day (columns per the preview:
# subreddit, day, month, year, count) and show the first rows.
df_datetime_pd = pd.read_csv("../../data/csv/year_month_day_eda.csv")
df_datetime_pd.head()
subreddit | day | month | year | count | |
---|---|---|---|---|---|
0 | movies | 19 | 11 | 2022 | 320 |
1 | television | 16 | 6 | 2021 | 113 |
2 | anime | 31 | 3 | 2021 | 766 |
3 | television | 18 | 5 | 2021 | 135 |
4 | television | 19 | 7 | 2022 | 132 |
# Load the average post score per subreddit and month (columns per the
# preview: subreddit, year, month, average_score) and show the first rows.
df_datetime_avg_score_pd = pd.read_csv("../../data/csv/year_month_day_avgscore_eda.csv")
df_datetime_avg_score_pd.head()
subreddit | year | month | average_score | |
---|---|---|---|---|
0 | movies | 2021 | 5 | 92.467274 |
1 | movies | 2021 | 10 | 153.312225 |
2 | television | 2021 | 3 | 326.358652 |
3 | television | 2021 | 6 | 278.586502 |
4 | anime | 2021 | 9 | 71.181420 |
# Roll the daily counts up to one row per (month, year, subreddit), stamp each
# row with the first day of its month so it can sit on a time axis, and order
# the result chronologically.
df_datetime_pd_ym = (
    df_datetime_pd
    .groupby(["month", "year", "subreddit"], as_index=False)["count"]
    .sum()
    .assign(
        date=lambda monthly: pd.to_datetime(monthly[['year', 'month']].assign(day=1))
    )
    .sort_values(by="date")
)
df_datetime_pd_ym.head()
month | year | subreddit | count | date | |
---|---|---|---|---|---|
0 | 1 | 2021 | anime | 22775 | 2021-01-01 |
1 | 1 | 2021 | movies | 15673 | 2021-01-01 |
2 | 1 | 2021 | television | 3513 | 2021-01-01 |
11 | 2 | 2021 | television | 3629 | 2021-02-01 |
10 | 2 | 2021 | movies | 15617 | 2021-02-01 |
# Stamp every monthly average with the first day of its month and order the
# rows chronologically.
df_datetime_avg_score_pd = df_datetime_avg_score_pd.assign(
    date=lambda monthly: pd.to_datetime(monthly[['year', 'month']].assign(day=1))
).sort_values(by="date")
df_datetime_avg_score_pd.head()
subreddit | year | month | average_score | date | |
---|---|---|---|---|---|
20 | movies | 2021 | 1 | 169.982262 | 2021-01-01 |
26 | anime | 2021 | 1 | 66.665379 | 2021-01-01 |
23 | television | 2021 | 1 | 376.816112 | 2021-01-01 |
24 | anime | 2021 | 2 | 67.384173 | 2021-02-01 |
21 | television | 2021 | 2 | 326.005511 | 2021-02-01 |
# Per-subreddit denominators used to normalise post counts.
divisors = {
    'movies': 382085,
    'anime': 404298,
    'television': 89586,
}


def custom_divide(row):
    """Return the row's ``count`` divided by its subreddit's divisor.

    ``row`` must support lookup of ``'count'`` and ``'subreddit'`` (for
    example a DataFrame row passed by ``apply(..., axis=1)``). A ``KeyError``
    is raised when the subreddit has no entry in ``divisors``.
    """
    post_count = row['count']
    subreddit_total = divisors[row['subreddit']]
    return post_count / subreddit_total
# Normalise every monthly count by its subreddit's divisor (row-wise apply).
df_datetime_pd_ym['normalized_count'] = df_datetime_pd_ym.apply(custom_divide, axis=1)

# Subreddit -> line colour.
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',
    'television': '#ffe100',
}

# Monthly normalised post count, one spline per subreddit.
fig = px.line(
    df_datetime_pd_ym,
    x='date',
    y='normalized_count',
    color='subreddit',
    color_discrete_map=color_map,
    # The label must be keyed on the plotted column ('normalized_count'); the
    # previous 'count' key matched nothing, so hover text showed the raw name.
    labels={
        'normalized_count': 'Normalized Post Count',
        'date': 'Date',
        'subreddit': 'Subreddit',
    },
    line_shape="spline",
    title='Number of posts across the years (2021-2023)',
    render_mode='svg',
)

# Thin, slightly transparent lines with small markers to reduce clutter.
fig.update_traces(
    line=dict(width=2),
    mode='lines+markers',
    marker=dict(size=4, opacity=0.6),
    opacity=0.7,
)

# White canvas, legend pinned to the top-right corner, centred title, and a
# range selector/slider on the x-axis for interactive zooming.
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    title_x=0.5,
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    xaxis=dict(
        rangeselector=dict(
            buttons=[
                dict(count=1, label="1M", step="month", stepmode="backward"),
                dict(count=6, label="6M", step="month", stepmode="backward"),
                dict(count=1, label="1Y", step="year", stepmode="backward"),
                dict(step="all"),
            ]
        ),
        rangeslider=dict(visible=True),
    ),
)
fig.update_yaxes(title_text='Normalized Post Count <br> (Post Count / Total Count)')
fig.update_xaxes(title_text='Date (2021-2023)')

fig.show()
fig.write_html("../../data/plots/time_series_eda.html")
# Subreddit -> line colour.
color_map = {'movies': '#FF4301', 'anime': '#ff9200', 'television': '#ffe100'}

# Monthly average post score, one spline per subreddit.
fig = px.line(
    df_datetime_avg_score_pd,
    x='date',
    y='average_score',
    color='subreddit',
    color_discrete_map=color_map,
    line_shape="spline",
    render_mode='svg',
    title='Average score of posts across the years (2021-2023)',
    labels={'average_score': 'Average Score', 'date': 'Date', 'subreddit': 'Subreddit'},
)

# Thin, slightly transparent lines with small markers to reduce clutter.
fig.update_traces(
    mode='lines+markers',
    opacity=0.7,
    line={'width': 2},
    marker={'size': 4, 'opacity': 0.6},
)

# White canvas, legend pinned to the top-right corner, centred title, and a
# range selector/slider on the x-axis for interactive zooming.
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    title_x=0.5,
    legend={'yanchor': "top", 'y': 0.99, 'xanchor': "right", 'x': 0.99},
    xaxis={
        'rangeselector': {
            'buttons': [
                {'count': 1, 'label': "1M", 'step': "month", 'stepmode': "backward"},
                {'count': 6, 'label': "6M", 'step': "month", 'stepmode': "backward"},
                {'count': 1, 'label': "1Y", 'step': "year", 'stepmode': "backward"},
                {'step': "all"},
            ]
        },
        'rangeslider': {'visible': True},
    },
)
fig.update_yaxes(title_text='Average Score')
fig.update_xaxes(title_text='Date (2021-2023)')

fig.show()
fig.write_html("../../data/plots/time_series_score_eda.html")
# Per-day-of-month summaries (average score and post count), ordered by day.
day_of_month_pd = (
    pd.read_csv("../../data/csv/day_of_month_avg_eda.csv")
    .sort_values(by='day_of_month')
)
day_of_month_count_pd = (
    pd.read_csv("../../data/csv/daily_weekly_count_eda.csv")
    .sort_values(by='day_of_month')
)

# Split the average-score table into the main subreddits and their
# suggestion-oriented counterparts.
day_of_month_pd_1 = day_of_month_pd[
    day_of_month_pd["subreddit"].isin(["anime", "movies", "television"])
]
day_of_month_pd_2 = day_of_month_pd[
    day_of_month_pd["subreddit"].isin(
        ["Animesuggest", "televisionsuggestions", "MovieSuggestions"]
    )
]

# Subreddit -> line colour.
color_map = {'movies': '#FF4301', 'anime': '#ff9200', 'television': '#ffe100'}

# Average score against day of month for the main subreddits.
fig = px.line(
    day_of_month_pd_1,
    x='day_of_month',
    y='average_score',
    color='subreddit',
    color_discrete_map=color_map,
    line_shape="spline",
    title='Average Score by Day of Month Across Subreddits',
    labels={'average_score': 'Average Score', 'day_of_month': 'Day of the Month'},
)
fig.update_xaxes(range=[1, 31])

# White canvas, centred title, fixed 800x600 figure size.
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    title_x=0.5,
    width=800,
    height=600,
)

fig.show()
fig.write_html("../../data/plots/avg_score_eda.html")
# Subreddit -> line colour.
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',
    'television': '#ffe100',
}

# Post count against day of month, one spline per subreddit.
fig = px.line(
    day_of_month_count_pd,
    x='day_of_month',
    y='count',
    color='subreddit',
    color_discrete_map=color_map,
    # The y label must be keyed on the plotted column ('count'); the previous
    # 'average_score' key was a copy-paste leftover and left the axis
    # labelled with the raw column name.
    labels={'count': 'Post Count', 'day_of_month': 'Day of the Month'},
    title='Count of posts by Day of Month Across Subreddits',
    line_shape="spline",
)
fig.update_xaxes(range=[1, 31])

# White canvas, centred title, fixed 800x600 figure size.
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    title_x=0.5,
    width=800,
    height=600,
)

fig.show()
fig.write_html("../../data/plots/day_of_month_count_eda.html")
# Count posts per (weekday name, hour of day, subreddit) and collect the
# aggregated result into a pandas DataFrame.
df_daily_weekly = df.groupBy(["day_of_week_str", "hour_of_day", "subreddit"]).count().toPandas()
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 64, 9, Finished, Available)
# Write the weekday/hour counts to CSV_DIR, omitting the pandas index column.
df_daily_weekly.to_csv(f"{CSV_DIR}/daily_weekly_eda.csv", index=False)
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 64, 15, Finished, Available)
# Reload the weekday/hour counts from CSV (rebinding df_daily_weekly) and
# preview the first rows.
# NOTE(review): this reads from "../../data/csv" while the export uses
# CSV_DIR — confirm both resolve to the same folder.
df_daily_weekly = pd.read_csv(f"../../data/csv/daily_weekly_eda.csv")
df_daily_weekly.head()
day_of_week_str | hour_of_day | subreddit | count | |
---|---|---|---|---|
0 | Friday | 8 | anime | 1768 |
1 | Friday | 14 | television | 791 |
2 | Tuesday | 15 | movies | 2932 |
3 | Monday | 16 | movies | 3206 |
4 | Monday | 18 | television | 900 |
# Average score per (subreddit, hour of day, weekday), ordered by hour.
df_daily_weekly_avgscore = (
    pd.read_csv("../../data/csv/daily_weekly_avgscore_eda.csv")
    .sort_values(by="hour_of_day")
)
df_daily_weekly_avgscore.head()
subreddit | hour_of_day | day_of_week_str | average_score | |
---|---|---|---|---|
0 | television | 0 | Wednesday | 243.692586 |
91 | movies | 0 | Friday | 136.785370 |
109 | television | 0 | Saturday | 215.709350 |
139 | television | 0 | Friday | 233.622340 |
156 | movies | 0 | Wednesday | 97.904864 |
# Add each count as a fraction of its subreddit's divisor. The sunburst below
# is sized by the raw 'count' column, not by this normalised value.
df_daily_weekly['normalized_count'] = df_daily_weekly.apply(custom_divide, axis=1)

# Subreddit -> sector colour.
color_map = {'movies': '#FF4301', 'anime': '#ff9200', 'television': '#ffe100'}

# Hierarchy: subreddit -> weekday -> hour, with sectors sized by post count.
fig = px.sunburst(
    df_daily_weekly,
    path=['subreddit', 'day_of_week_str', 'hour_of_day'],
    values='count',
    color='subreddit',
    color_discrete_map=color_map,
    title="Distribution of posts across Days of Week and Hours of Day",
)
fig.update_layout(title_x=0.5)  # centre the title

fig.show()
fig.write_html("../../data/plots/sunburst_eda.html")
# Subreddit -> sector colour.
color_map = {'movies': '#FF4301', 'anime': '#ff9200', 'television': '#ffe100'}

# Hierarchy: subreddit -> weekday -> hour, with sectors sized by average score.
# NOTE(review): parent sectors presumably aggregate their children's values,
# i.e. they would show sums of averages — confirm that is the intended reading.
fig = px.sunburst(
    df_daily_weekly_avgscore,
    path=['subreddit', 'day_of_week_str', 'hour_of_day'],
    values='average_score',
    color='subreddit',
    color_discrete_map=color_map,
    title="Average scores of posts across Days of Week and Hours of Day",
)
fig.update_layout(title_x=0.5)  # centre the title

fig.show()
fig.write_html("../../data/plots/sunburst_avgscore_eda.html")
# Load the top-scoring posts per subreddit together with each author's post
# count (columns per the preview: subreddit, title, num_comments, selftext,
# author, score, rank, count) and show the first rows.
df_top_posts_scores = pd.read_csv(f"../../data/csv/top_author_score_postcount_eda.csv")
df_top_posts_scores.head()
Unnamed: 0 | subreddit | title | num_comments | selftext | author | score | rank | count | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | anime | "Berserk" creator Kentaro Miura dead at 54 | 1762 | NaN | enterthedragonpunch | 33384 | 1 | 1 |
1 | 1 | anime | Who will be the first seed in Best Girl 8? | 619 | Hi everyone, we are currently trialing a new f... | mpp00 | 31830 | 2 | 241 |
2 | 2 | anime | Best Girl 9 Prediction Tournament! | 264 | NaN | mpp00 | 30302 | 3 | 241 |
3 | 3 | anime | The Devil is a Part-Timer Season 2 Announced! | 2486 | NaN | Srikkk | 30213 | 4 | 32 |
4 | 4 | anime | "Spice and Wolf" New Anime Announced | 1897 | NaN | dorkmax_executives | 29222 | 5 | 304 |
# Load the same CSV file again under a second name and display the full
# table (ten ranked posts for each of anime, television and movies).
df_top_posts_scores_post_count = pd.read_csv(f"../../data/csv/top_author_score_postcount_eda.csv")
df_top_posts_scores_post_count
Unnamed: 0 | subreddit | title | num_comments | selftext | author | score | rank | count | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | anime | "Berserk" creator Kentaro Miura dead at 54 | 1762 | NaN | enterthedragonpunch | 33384 | 1 | 1 |
1 | 1 | anime | Who will be the first seed in Best Girl 8? | 619 | Hi everyone, we are currently trialing a new f... | mpp00 | 31830 | 2 | 241 |
2 | 2 | anime | Best Girl 9 Prediction Tournament! | 264 | NaN | mpp00 | 30302 | 3 | 241 |
3 | 3 | anime | The Devil is a Part-Timer Season 2 Announced! | 2486 | NaN | Srikkk | 30213 | 4 | 32 |
4 | 4 | anime | "Spice and Wolf" New Anime Announced | 1897 | NaN | dorkmax_executives | 29222 | 5 | 304 |
5 | 5 | anime | 'Konosuba' Season 3 Announced | 993 | NaN | RobotiSC | 28555 | 6 | 519 |
6 | 6 | anime | "JoJo's Bizarre Adventure Part 6" Anime Announced | 1334 | NaN | Lovro26 | 27942 | 7 | 1450 |
7 | 7 | anime | Konosuba | New Anime Key Visual (HQ) | 762 | NaN | MarvelsGrantMan136 | 27634 | 8 | 3480 |
8 | 8 | anime | One Punch Man Season 3 Announced | 1264 | NaN | Turbostrider27 | 27156 | 9 | 2235 |
9 | 9 | anime | Shingeki no Kyojin: The Final Season - Episode... | 6434 | *Shingeki no Kyojin: The Final Season*, episod... | AutoLovepon | 26579 | 10 | 6982 |
10 | 10 | television | Former 'Reading Rainbow' host LeVar Burton wan... | 2903 | NaN | esporx | 61389 | 1 | 141 |
11 | 11 | television | LeVar Burton wants Jeopardy producers to know ... | 1513 | NaN | chanma50 | 45599 | 2 | 1662 |
12 | 12 | television | For the Love of God, Let LeVar Burton Host Jeo... | 1602 | NaN | manskies | 45499 | 3 | 2 |
13 | 13 | television | ‘Futurama’ Revival Ordered at Hulu With Multip... | 2533 | NaN | chanma50 | 40529 | 4 | 1662 |
14 | 14 | television | ‘Mindhunter’ Director Urges Fans to Make Noise... | 1149 | NaN | MarvelsGrantMan136 | 39701 | 5 | 3480 |
15 | 15 | television | Jessica Walter Dies: Emmy-Winning ‘Arrested De... | 1746 | NaN | chanma50 | 39183 | 6 | 1662 |
16 | 16 | television | Biden Inauguration Captures Bigger Audience Th... | 2038 | NaN | chanma50 | 38453 | 7 | 1662 |
17 | 17 | television | Terry Crews Receives A Star On The Hollywood W... | 661 | NaN | Gato1980 | 35198 | 8 | 210 |
18 | 18 | television | Pedro Pascal To Star As Joel In ‘The Last Of U... | 2671 | NaN | chanma50 | 34814 | 9 | 1662 |
19 | 19 | television | Conan O’Brien Deserved Better. One of the most... | 1851 | NaN | Samoht99 | 34665 | 10 | 1180 |
20 | 20 | movies | Hi, I’m Keanu Reeves, AMA | 33376 | NaN | lionsgate | 282232 | 1 | 9 |
21 | 21 | movies | Hi, I’m Tobey Maguire, actor/executive produce... | 17793 | NaN | officialtobeymaguire | 192782 | 2 | 1 |
22 | 22 | movies | Hello, I’m Nicolas Cage and welcome to Ask Me ... | 26670 | NaN | lionsgate | 189542 | 3 | 9 |
23 | 23 | movies | Please Bring Back Voice Actors, Stop Celebrity... | 5191 | NaN | fungobat | 137551 | 4 | 82 |
24 | 24 | movies | Brendan Fraser Wins Academy Award for Best Act... | 3290 | NaN | MarvelsGrantMan136 | 109148 | 5 | 3480 |
25 | 25 | movies | ‘Dune’ Sequel Greenlit By Legendary For Exclus... | 6559 | NaN | CosmicBlazeKnight | 108958 | 6 | 2 |
26 | 26 | movies | Guy On Doomed Planet Mostly Concerned With Ski... | 5291 | NaN | uxhelpneeded | 103672 | 7 | 1 |
27 | 27 | movies | Gilbert Gottfried, Comedian and ‘Aladdin’ Star... | 4875 | NaN | chanma50 | 103500 | 8 | 1662 |
28 | 28 | movies | WillSmith Banned from Attending Oscars Ceremon... | 10783 | NaN | MarvelsGrantMan136 | 101136 | 9 | 3480 |
29 | 29 | movies | Robbie Coltrane, Comic Performer Who Played Ha... | 2467 | NaN | MarvelsGrantMan136 | 94867 | 10 | 3480 |
# The fifteen highest-scoring posts in the table.
# NOTE(review): despite its name, this frame is not restricted to r/movies —
# the displayed result also contains television rows.
movies_df = (
    df_top_posts_scores_post_count
    .sort_values(by='score', ascending=False)
    .head(15)
)
movies_df
Unnamed: 0 | subreddit | title | num_comments | selftext | author | score | rank | count | |
---|---|---|---|---|---|---|---|---|---|
20 | 20 | movies | Hi, I’m Keanu Reeves, AMA | 33376 | NaN | lionsgate | 282232 | 1 | 9 |
21 | 21 | movies | Hi, I’m Tobey Maguire, actor/executive produce... | 17793 | NaN | officialtobeymaguire | 192782 | 2 | 1 |
22 | 22 | movies | Hello, I’m Nicolas Cage and welcome to Ask Me ... | 26670 | NaN | lionsgate | 189542 | 3 | 9 |
23 | 23 | movies | Please Bring Back Voice Actors, Stop Celebrity... | 5191 | NaN | fungobat | 137551 | 4 | 82 |
24 | 24 | movies | Brendan Fraser Wins Academy Award for Best Act... | 3290 | NaN | MarvelsGrantMan136 | 109148 | 5 | 3480 |
25 | 25 | movies | ‘Dune’ Sequel Greenlit By Legendary For Exclus... | 6559 | NaN | CosmicBlazeKnight | 108958 | 6 | 2 |
26 | 26 | movies | Guy On Doomed Planet Mostly Concerned With Ski... | 5291 | NaN | uxhelpneeded | 103672 | 7 | 1 |
27 | 27 | movies | Gilbert Gottfried, Comedian and ‘Aladdin’ Star... | 4875 | NaN | chanma50 | 103500 | 8 | 1662 |
28 | 28 | movies | WillSmith Banned from Attending Oscars Ceremon... | 10783 | NaN | MarvelsGrantMan136 | 101136 | 9 | 3480 |
29 | 29 | movies | Robbie Coltrane, Comic Performer Who Played Ha... | 2467 | NaN | MarvelsGrantMan136 | 94867 | 10 | 3480 |
10 | 10 | television | Former 'Reading Rainbow' host LeVar Burton wan... | 2903 | NaN | esporx | 61389 | 1 | 141 |
11 | 11 | television | LeVar Burton wants Jeopardy producers to know ... | 1513 | NaN | chanma50 | 45599 | 2 | 1662 |
12 | 12 | television | For the Love of God, Let LeVar Burton Host Jeo... | 1602 | NaN | manskies | 45499 | 3 | 2 |
13 | 13 | television | ‘Futurama’ Revival Ordered at Hulu With Multip... | 2533 | NaN | chanma50 | 40529 | 4 | 1662 |
14 | 14 | television | ‘Mindhunter’ Director Urges Fans to Make Noise... | 1149 | NaN | MarvelsGrantMan136 | 39701 | 5 | 3480 |
# One (author, count) table per subreddit, most prolific author first.
movie_author_df, tv_author_df, anime_author_df = (
    df_top_posts_scores_post_count[
        df_top_posts_scores_post_count['subreddit'] == subreddit_name
    ]
    .sort_values(by="count", ascending=False)[["author", "count"]]
    .drop_duplicates()
    for subreddit_name in ('movies', 'television', 'anime')
)

# Arrows that point at bars too short to notice; x is the author, y the count.
movie_annotations = [
    {"x": "lionsgate", "y": 9, "xref": "x", "yref": "y", "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
    {"x": "CosmicBlazeKnight", "y": 2, "xref": "x", "yref": "y", "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
    {"x": "officialtobeymaguire", "y": 1, "xref": "x", "yref": "y",
     "text": "Due to the limited number of posts, bars for the pointed authors may be hard to spot",
     "ax": 0, "ay": -50, "showarrow": True, "arrowhead": 1},
    {"x": "uxhelpneeded", "y": 1, "xref": "x", "yref": "y", "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]
tv_annotations = [
    {"x": "manskies", "y": 2, "xref": "x", "yref": "y",
     "text": "Since the Post Count is low, we can barely see the bar",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]
anime_annotations = [
    {"x": "Srikkk", "y": 32, "xref": "x", "yref": "y",
     "text": "Since the Post Count is low, we can barely see the bars",
     "ax": 0, "ay": -50, "showarrow": True, "arrowhead": 1},
    {"x": "enterthedragonpunch", "y": 1, "xref": "x", "yref": "y", "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]

# One bar trace per subreddit; only the movies trace is visible initially.
fig = go.Figure()
fig.add_traces([
    go.Bar(
        x=author_counts['author'],
        y=author_counts['count'],
        name=trace_name,
        visible=shown,
        marker_color=bar_color,
        hovertemplate="Author: %{x} <br>Post Count: %{y}",
    )
    for author_counts, trace_name, shown, bar_color in (
        (movie_author_df, 'Movies', True, '#FF4301'),
        (tv_author_df, 'TV Shows', False, '#ffe100'),
        (anime_author_df, 'Anime', False, '#ff9200'),
    )
])

# White plot area, axis titles, the movies annotations/title as the initial
# state, and a dropdown that switches the visible trace together with the
# matching title and annotations.
fig.update_layout(
    plot_bgcolor='white',
    xaxis={"title_text": "Author"},
    yaxis={"title_text": "Post Count"},
    annotations=movie_annotations,
    title_text="Post count of authors with top scores for movies subreddit",
    title_x=0.5,
    updatemenus=[
        {
            "active": 0,
            "buttons": [
                {
                    "label": button_label,
                    "method": "update",
                    "args": [
                        {"visible": [slot == position for slot in range(3)]},
                        {
                            "title": f"Post count of authors with top scores for {subreddit_name} subreddit",
                            "annotations": notes,
                        },
                    ],
                }
                for position, (button_label, subreddit_name, notes) in enumerate((
                    ("Movie", "movies", movie_annotations),
                    ("Television", "television", tv_annotations),
                    ("Anime", "anime", anime_annotations),
                ))
            ],
            # Dropdown sits above the plot, near the right edge.
            "x": 0.9,
            "xanchor": 'left',
            "y": 1.25,
            "yanchor": 'top',
        }
    ],
)

fig.show()
fig.write_html("../../data/plots/top10_authorscore_postcount_eda.html")
authors_with_top_comments_post_counts = pd.read_csv(
    "../../data/csv/authors_with_top_comments_post_counts.csv"
)

# (author, count) tables for movies and anime, most prolific author first.
movie_author_df, anime_author_df = (
    authors_with_top_comments_post_counts[
        authors_with_top_comments_post_counts['subreddit'] == subreddit_name
    ]
    .sort_values(by="count", ascending=False)[["author", "count"]]
    .drop_duplicates()
    for subreddit_name in ('movies', 'anime')
)

# Television is additionally cut down to the ten rows with the most comments
# before being re-ordered by post count.
tv_author_df = (
    authors_with_top_comments_post_counts[
        authors_with_top_comments_post_counts['subreddit'] == 'television'
    ]
    .sort_values(by="count", ascending=False)[["author", "count", "num_comments"]]
    .drop_duplicates()
    .nlargest(10, 'num_comments')
    .drop(columns=['num_comments'])
    .sort_values(by="count", ascending=False)
)

# Arrows that point at bars too short to notice; x is the author, y the count.
movie_annotations = [
    {"x": "LETS_MAKE_IT_AWKWARD", "y": 1, "xref": "x", "yref": "y",
     "text": "Since the Post Count is low, we can barely see the bars",
     "ax": 0, "ay": -50, "showarrow": True, "arrowhead": 1},
    {"x": "officialtobeymaguire", "y": 1, "xref": "x", "yref": "y", "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]
tv_annotations = [
    {"x": "ewzetf", "y": 3, "xref": "x", "yref": "y", "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
    {"x": "Midnight_Oil_", "y": 1, "xref": "x", "yref": "y",
     "text": "Since the Post Count is low, we can barely see the bars",
     "ax": 0, "ay": -50, "showarrow": True, "arrowhead": 1},
    {"x": "thetanhausergate", "y": 1, "xref": "x", "yref": "y", "text": "",
     "ax": 0, "ay": -40, "showarrow": True, "arrowhead": 1},
]
# Empty-text annotation with a zero offset on both axes.
anime_annotations = [
    {"x": "AnimeMod", "y": 1, "xref": "x", "yref": "y", "text": "", "ax": 0, "ay": 0},
]

# One bar trace per subreddit; only the movies trace is visible initially.
fig = go.Figure()
fig.add_traces([
    go.Bar(
        x=author_counts['author'],
        y=author_counts['count'],
        name=trace_name,
        visible=shown,
        marker_color=bar_color,
        hovertemplate="Author: %{x} <br>Post Count: %{y}",
    )
    for author_counts, trace_name, shown, bar_color in (
        (movie_author_df, 'Movies', True, '#FF4301'),
        (tv_author_df, 'TV Shows', False, '#ffe100'),
        (anime_author_df, 'Anime', False, '#ff9200'),
    )
])

# White plot area, axis titles, the movies annotations/title as the initial
# state, and a dropdown that switches the visible trace together with the
# matching title and annotations.
fig.update_layout(
    plot_bgcolor='white',
    xaxis={"title_text": "Author"},
    yaxis={"title_text": "Post Count"},
    annotations=movie_annotations,
    title_text="Post counts of authors with top comments for movies subreddit",
    title_x=0.5,
    updatemenus=[
        {
            "active": 0,
            "buttons": [
                {
                    "label": button_label,
                    "method": "update",
                    "args": [
                        {"visible": [slot == position for slot in range(3)]},
                        {
                            "title": f"Post counts of authors with top comments for {subreddit_name} subreddit",
                            "annotations": notes,
                        },
                    ],
                }
                for position, (button_label, subreddit_name, notes) in enumerate((
                    ("Movie", "movies", movie_annotations),
                    ("Television", "television", tv_annotations),
                    ("Anime", "anime", anime_annotations),
                ))
            ],
            # Dropdown sits above the plot, near the right edge.
            "x": 0.9,
            "xanchor": 'left',
            "y": 1.25,
            "yanchor": 'top',
        }
    ],
)

fig.show()
fig.write_html("../../data/plots/Top_10_authors_with_top_comments.html")
# Load the per-column missing-value counts (columns per the preview: Column,
# Missing Values) and show the first rows.
# NOTE: this rebinds `df`, which earlier in the file held the Spark DataFrame
# of submissions; from here on `df` is this small pandas table.
df = pd.read_csv("../../data/csv/num_missing_val.csv")
df.head()
Column | Missing Values | |
---|---|---|
0 | subreddit | 0 |
1 | author | 0 |
2 | title | 0 |
3 | selftext | 0 |
4 | created_utc | 0 |
# Give the column-name column a more descriptive header (modifies df in place).
df.rename(columns={'Column': 'Column Name'}, inplace=True)

# Columns with the most missing values come first.
df_sorted = df.sort_values(by='Missing Values', ascending=False)

# Two-column table: orange header with white text, light-grey body cells.
fig = go.Figure(
    data=[
        go.Table(
            header={
                'values': list(df_sorted.columns),
                'fill_color': '#FF4301',
                'font': {'color': 'white'},
                'align': 'left',
            },
            cells={
                'values': [df_sorted['Column Name'], df_sorted['Missing Values']],
                'fill_color': '#ececec',
                'align': 'left',
            },
        )
    ]
)

# Title text, centred horizontally.
fig.update_layout(title={'text': "Distribution of missing values", 'x': 0.5})

# Export to HTML, then display.
fig.write_html("../../data/plots/table_missing_values.html")
fig.show()
# Load the total number of posts per subreddit (six subreddits per the
# displayed output) and display the table.
df_subreddit = pd.read_csv("../../data/csv/subreddit_count.csv")
df_subreddit
Unnamed: 0 | subreddit | count | |
---|---|---|---|
0 | 0 | anime | 404298 |
1 | 1 | television | 89586 |
2 | 2 | televisionsuggestions | 7991 |
3 | 3 | movies | 382085 |
4 | 4 | Animesuggest | 74101 |
5 | 5 | MovieSuggestions | 58907 |
# Prefix every subreddit name with "r/" for display. A single mapping-based
# replace() is equivalent to replacing the names one after another here,
# because no prefixed name equals any of the original names.
subreddit_labels = {
    name: f"r/{name}"
    for name in (
        "anime",
        "television",
        "televisionsuggestions",
        "movies",
        "Animesuggest",
        "MovieSuggestions",
    )
}
df_subreddit = df_subreddit.replace(subreddit_labels)
# Sort ascending so the largest subreddit ends up at the top of the chart.
ordered_df = df_subreddit.sort_values(by='count')
my_range = range(1, len(df_subreddit.index) + 1)
# Horizontal lollipop chart: a grey stem from 0 to the count, capped by a diamond.
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['count'], color='lightgrey')
plt.plot(ordered_df['count'], my_range, "D", markerfacecolor='#FF4301', markeredgecolor='#FF4301')
plt.yticks(my_range, ordered_df['subreddit'])
# Show x tick labels in thousands, e.g. 100000 -> "100K".
plt.gca().get_xaxis().set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x / 1000)}K'))
plt.xlim(0, ordered_df['count'].max() + 5000)
# Write each count just to the right of its diamond.
# annotation_clip=False is needed because the anchor point (count + 5300) of the
# largest subreddit lies beyond the x-limit set above (max + 5000); with the
# default setting matplotlib does not draw an annotation whose data-coordinate
# anchor is outside the axes, so that label would be silently dropped.
for i, count in zip(my_range, ordered_df['count']):
    plt.annotate(
        f'{int(count):,}',
        xy=(count + 5300, i),
        ha='left',
        va='center',
        fontsize=10,
        color='black',
        annotation_clip=False,
    )
# Axis labels.
plt.xlabel('Count', fontsize=13)
plt.ylabel('Subreddit', fontsize=13)
# Transparent axes background.
plt.gca().set_facecolor('none')
# Hide the top and right borders.
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# Caption centred just above the axes (axes-fraction coordinates).
plt.text(0.5, 1.05, 'Subreddit Counts Analysis', ha='center', va='center', fontsize=16, transform=plt.gca().transAxes)
# Save as PNG; the tight bounding box includes the labels drawn outside the axes.
plt.savefig("../../data/plots/subreddit_count_analysis.png", bbox_inches='tight', dpi=300)
plt.show()
# Load the per-subreddit author post counts (columns 'author', 'subreddit',
# 'count', 'rank', plus the saved index column) and display the whole frame.
author_df = pd.read_csv("../../data/csv/author_eda.csv")
author_df
Unnamed: 0 | author | subreddit | count | rank | |
---|---|---|---|---|---|
0 | 1 | AutoLovepon | anime | 6982 | 2 |
1 | 2 | Turbostrider27 | anime | 2215 | 3 |
2 | 3 | Gvostfr | anime | 1677 | 4 |
3 | 4 | Lovro26 | anime | 1450 | 5 |
4 | 5 | inspyral | anime | 787 | 6 |
5 | 6 | AnimeMod | anime | 755 | 7 |
6 | 7 | SuperAlloyBerserker | anime | 595 | 8 |
7 | 8 | Shimmering-Sky | anime | 557 | 9 |
8 | 9 | RobotiSC | anime | 517 | 10 |
9 | 10 | SorcererOfTheLake | anime | 441 | 11 |
10 | 12 | wednesdaygiftinfo | movies | 24222 | 2 |
11 | 13 | allthebestmovies | movies | 10353 | 3 |
12 | 14 | Alternative-Bat-2458 | movies | 3538 | 4 |
13 | 15 | Ornery-Control-9474 | movies | 3475 | 5 |
14 | 16 | Sisiwakanamaru | movies | 1861 | 6 |
15 | 17 | MovieBattleGame | movies | 1471 | 7 |
16 | 18 | MarvelsGrantMan136 | movies | 1362 | 8 |
17 | 19 | chanma50 | movies | 939 | 9 |
18 | 20 | Samoht99 | movies | 842 | 10 |
19 | 21 | indig0sixalpha | movies | 820 | 11 |
20 | 23 | MarvelsGrantMan136 | television | 1960 | 2 |
21 | 24 | Sisiwakanamaru | television | 1139 | 3 |
22 | 25 | klutzysunshine | television | 966 | 4 |
23 | 26 | chanma50 | television | 723 | 5 |
24 | 27 | misana123 | television | 663 | 6 |
25 | 28 | PetyrDayne | television | 657 | 7 |
26 | 29 | Neo2199 | television | 602 | 8 |
27 | 30 | indig0sixalpha | television | 564 | 9 |
28 | 31 | GroundbreakingSet187 | television | 436 | 10 |
29 | 32 | DemiFiendRSA | television | 403 | 11 |
# One frame of author post counts per subreddit.
movie_author_df = author_df.loc[author_df['subreddit'].eq('movies')]
tv_author_df = author_df.loc[author_df['subreddit'].eq('television')]
anime_author_df = author_df.loc[author_df['subreddit'].eq('anime')]
# Display the movies subset.
movie_author_df
Unnamed: 0 | author | subreddit | count | rank | |
---|---|---|---|---|---|
10 | 12 | wednesdaygiftinfo | movies | 24222 | 2 |
11 | 13 | allthebestmovies | movies | 10353 | 3 |
12 | 14 | Alternative-Bat-2458 | movies | 3538 | 4 |
13 | 15 | Ornery-Control-9474 | movies | 3475 | 5 |
14 | 16 | Sisiwakanamaru | movies | 1861 | 6 |
15 | 17 | MovieBattleGame | movies | 1471 | 7 |
16 | 18 | MarvelsGrantMan136 | movies | 1362 | 8 |
17 | 19 | chanma50 | movies | 939 | 9 |
18 | 20 | Samoht99 | movies | 842 | 10 |
19 | 21 | indig0sixalpha | movies | 820 | 11 |
# One bar trace per subreddit, in the order the dropdown masks below expect
# (movies, television, anime). Only the movies trace is visible initially.
fig = go.Figure(
    data=[
        go.Bar(
            x=frame['author'],
            y=frame['count'],
            name=legend_name,
            visible=shown,
            marker_color=bar_color,
            hovertemplate="Author: %{x} <br>Post Count: %{y}",
        )
        for frame, legend_name, bar_color, shown in (
            (movie_author_df, 'Movies', '#FF4301', True),
            (tv_author_df, 'TV Shows', '#ffe100', False),
            (anime_author_df, 'Anime', '#ff9200', False),
        )
    ]
)
# Layout: white plot background, axis titles, centred initial title, and a
# dropdown whose entries switch the visible trace and the figure title together.
fig.update_layout(
    plot_bgcolor='white',
    xaxis={"title_text": "Author"},
    yaxis={"title_text": "Post Count"},
    updatemenus=[
        {
            "active": 0,
            "buttons": [
                {
                    "label": button_label,
                    "method": "update",
                    "args": [{"visible": trace_mask}, {"title": figure_title}],
                }
                for button_label, trace_mask, figure_title in (
                    ("Movie", [True, False, False], "Top 10 active authors for movies subreddit"),
                    ("Television", [False, True, False], "Top 10 active authors for television subreddit"),
                    ("Anime", [False, False, True], "Top 10 active authors for anime subreddit"),
                )
            ],
            # Dropdown sits above the plot area, near its right edge.
            "x": 0.85,
            "xanchor": 'left',
            "y": 1.25,
            "yanchor": 'top',
        }
    ],
    title_x=0.5,
    title_text="Top 10 active authors for movies subreddit",
)
# Save the interactive chart as standalone HTML, then render it.
fig.write_html("../../data/plots/Top_author_post.html")
fig.show()
# Read the most-commented posts of each subreddit, one CSV per subreddit.
anime_comments_df, movies_comments_df, tvshows_comments_df = map(
    pd.read_csv,
    (
        "../../data/csv/top_comments_anime.csv",
        "../../data/csv/top_comments_movies.csv",
        "../../data/csv/top_comments_tv_show.csv",
    ),
)
# Drop posts whose author is shown as '[deleted]'. The explicit copy() makes
# the filtered result an independent frame, so the column assignments below
# act on it directly instead of on a possible view of the loaded CSV frame.
movies_comments_df = movies_comments_df[movies_comments_df['author'] != '[deleted]'].copy()
# Replace missing selftext (NaN) with a single space so the concatenation
# below does not produce NaN. The result is assigned back to the column:
# calling fillna(inplace=True) on a selected column is chained assignment,
# which warns in recent pandas and has no effect under Copy-on-Write.
movies_comments_df["selftext"] = movies_comments_df["selftext"].fillna(" ")
# Combine title and body into one display column.
# NOTE(review): there is no separator before "Body:", so the title runs
# straight into the label (e.g. "...AMABody:"); left unchanged here.
movies_comments_df["Content"] = "Title: " + movies_comments_df["title"] + "Body: " + movies_comments_df["selftext"]
# Keep only the display columns and the first 10 rows, then show them.
selected_columns = ["Content", "num_comments", "author"]
movies_comments_df = movies_comments_df[selected_columns].head(10)
movies_comments_df
Content | num_comments | author | |
---|---|---|---|
0 | Title: Name a single movie, where the sequel o... | 35446 | dpemerson76 |
1 | Title: Hi, I’m Keanu Reeves, AMABody: | 33376 | lionsgate |
2 | Title: Official Discussion - Zack Snyder's Jus... | 30350 | LiteraryBoner |
3 | Title: 1 Teen Dead, Another Critically Injured... | 28664 | prsnreddit |
4 | Title: Hello, I’m Nicolas Cage and welcome to ... | 26670 | lionsgate |
5 | Title: Official Oscars Discussion Thread 2022B... | 22097 | LiteraryBoner |
6 | Title: Official Discussion - Spider-Man: No Wa... | 21419 | LiteraryBoner |
8 | Title: Official Discussion - Avatar: The Way o... | 19888 | LiteraryBoner |
9 | Title: Official Oscars Discussion Thread 2023B... | 18380 | LiteraryBoner |
10 | Title: Hi, I’m Tobey Maguire, actor/executive ... | 17793 | officialtobeymaguire |
# Drop posts whose author is shown as '[deleted]'. The explicit copy() makes
# the filtered result an independent frame, so the column assignments below
# act on it directly instead of on a possible view of the loaded CSV frame.
anime_comments_df = anime_comments_df[anime_comments_df['author'] != '[deleted]'].copy()
# Replace missing selftext (NaN) with a single space so the concatenation
# below does not produce NaN. The result is assigned back to the column:
# calling fillna(inplace=True) on a selected column is chained assignment,
# which warns in recent pandas and has no effect under Copy-on-Write.
anime_comments_df["selftext"] = anime_comments_df["selftext"].fillna(" ")
# Combine title and body into one display column.
# NOTE(review): there is no separator before "Body:", so the title runs
# straight into the label; left unchanged here.
anime_comments_df["Content"] = "Title: " + anime_comments_df["title"] + "Body: " + anime_comments_df["selftext"]
# Keep only the display columns and the first 10 rows, then show them.
selected_columns = ["Content", "num_comments", "author"]
anime_comments_df = anime_comments_df[selected_columns].head(10)
anime_comments_df
Content | num_comments | author | |
---|---|---|---|
0 | Title: Casual Discussion Fridays - Week of Mar... | 18528 | AnimeMod |
1 | Title: Casual Discussion Fridays - Week of Apr... | 18074 | AnimeMod |
2 | Title: Casual Discussion Fridays - Week of Apr... | 16970 | AnimeMod |
3 | Title: Casual Discussion Fridays - Week of Mar... | 16651 | AnimeMod |
4 | Title: Casual Discussion Fridays - Week of Jun... | 16258 | AutoModerator |
5 | Title: Casual Discussion Fridays - Week of Apr... | 16229 | AnimeMod |
6 | Title: Casual Discussion Fridays - Week of Apr... | 16104 | AnimeMod |
7 | Title: Casual Discussion Fridays - Week of Apr... | 16084 | AnimeMod |
8 | Title: Casual Discussion Fridays - Week of Sep... | 15984 | AutoModerator |
9 | Title: Casual Discussion Fridays - Week of Jan... | 15277 | AnimeMod |
# Drop posts whose author is shown as '[deleted]'. The explicit copy() makes
# the filtered result an independent frame, so the column assignments below
# act on it directly instead of on a possible view of the loaded CSV frame.
tvshows_comments_df = tvshows_comments_df[tvshows_comments_df['author'] != '[deleted]'].copy()
# Replace missing selftext (NaN) with a single space so the concatenation
# below does not produce NaN. The result is assigned back to the column:
# calling fillna(inplace=True) on a selected column is chained assignment,
# which warns in recent pandas and has no effect under Copy-on-Write.
tvshows_comments_df["selftext"] = tvshows_comments_df["selftext"].fillna(" ")
# Combine title and body into one display column.
# NOTE(review): there is no separator before "Body:", so the title runs
# straight into the label; left unchanged here.
tvshows_comments_df["Content"] = "Title: " + tvshows_comments_df["title"] + "Body: " + tvshows_comments_df["selftext"]
# Keep only the display columns and the first 10 rows, then show them.
selected_columns = ["Content", "num_comments", "author"]
tvshows_comments_df = tvshows_comments_df[selected_columns].head(10)
tvshows_comments_df
Content | num_comments | author | |
---|---|---|---|
0 | Title: Will Smith Slaps Chris Rock at The Osca... | 9332 | Midnight_Oil_ |
1 | Title: Dave Chappelle Lands Emmy Nomination fo... | 7589 | Neo2199 |
2 | Title: ‘House of the Dragon’ Star Steve Toussa... | 6660 | overvivideo |
3 | Title: GLAAD condemns Dave Chappelle, Netflix ... | 6550 | LarryPeru |
4 | Title: Dave Chappelle Calls Kids Who Dared Cri... | 5976 | inthetownwhere |
5 | Title: ‘Cowboy Bebop’ Canceled By Netflix Afte... | 5974 | MarvelsGrantMan136 |
6 | Title: What color is an elf? Or a Sea Snake? A... | 5828 | ewzetf |
7 | Title: Gina Carano Star Wars: She is No Longer... | 5745 | thetanhausergate |
8 | Title: Netflix Co-CEO Ted Sarandos Defends Dav... | 5740 | Neo2199 |
9 | Title: The Last of Us - Series Premiere Discus... | 5721 | NicholasCajun |
# Columns shown in each table, in display order.
selected_columns_movies = ['Content', 'num_comments', 'author']
selected_columns_tv = ['Content', 'num_comments', 'author']
selected_columns_anime = ['Content', 'num_comments', 'author']
# go.Table expects one list of cell values per column, so turn every
# selected column into a plain Python list.
data_values_movies = [values.tolist() for _, values in movies_comments_df[selected_columns_movies].items()]
data_values_tv = [values.tolist() for _, values in tvshows_comments_df[selected_columns_tv].items()]
data_values_anime = [values.tolist() for _, values in anime_comments_df[selected_columns_anime].items()]
# Column captions and colour palette shared by the three tables.
header_values = ['Content', 'Number of Comments', 'Author']
header_color = "#FF4301"
body_color = "#ececec"
font_color = "white"
font_color_cell = "black"
# One table trace per subreddit, in the order the dropdown masks below expect
# (movies, television, anime). Only the movies table is visible initially.
fig = go.Figure(
    data=[
        go.Table(
            header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)),
            cells=dict(values=table_values, fill_color=body_color, font=dict(color=font_color_cell)),
            visible=(position == 0),
            columnwidth=[2.5, 0.25, 0.4],  # relative widths: wide content column
        )
        for position, table_values in enumerate(
            (data_values_movies, data_values_tv, data_values_anime)
        )
    ]
)
# Dropdown entries switch the visible table and the figure title together.
# The initial title uses exactly the wording of the "Movie" entry, so picking
# "Movie" again after another entry restores the same text.
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=[
                dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 posts having maximum comments for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 posts having maximum comments for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 posts having maximum comments for anime subreddit"}]),
            ],
            # Dropdown sits just above the table, near its right edge.
            x=0.85,
            xanchor='left',
            y=1.05,
            yanchor='top',
        )
    ],
    margin=dict(l=10, r=10, t=15, b=20),
    height=1000,
    title_x=0.5,  # centre the title
    title_text="Top 10 posts having maximum comments for movies subreddit",
)
# Save the interactive table as standalone HTML, then render it.
fig.write_html("../../data/plots/table_top_comments.html")
fig.show()
# Load the top-scoring posts (with per-post 'score', 'rank' and the author's
# post 'count') and preview the first five rows.
scores_df = pd.read_csv("../../data/csv/top_author_score_postcount_eda.csv")
scores_df.head()
Unnamed: 0 | subreddit | title | num_comments | selftext | author | score | rank | count | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | anime | "Berserk" creator Kentaro Miura dead at 54 | 1762 | NaN | enterthedragonpunch | 33384 | 1 | 1 |
1 | 1 | anime | Who will be the first seed in Best Girl 8? | 619 | Hi everyone, we are currently trialing a new f... | mpp00 | 31830 | 2 | 241 |
2 | 2 | anime | Best Girl 9 Prediction Tournament! | 264 | NaN | mpp00 | 30302 | 3 | 241 |
3 | 3 | anime | The Devil is a Part-Timer Season 2 Announced! | 2486 | NaN | Srikkk | 30213 | 4 | 32 |
4 | 4 | anime | "Spice and Wolf" New Anime Announced | 1897 | NaN | dorkmax_executives | 29222 | 5 | 304 |
# Replace missing selftext (NaN) with a single space so the concatenation
# below does not produce NaN. The result is assigned back to the column:
# calling fillna(inplace=True) on a selected column is chained assignment,
# which warns in recent pandas and has no effect under Copy-on-Write.
scores_df["selftext"] = scores_df["selftext"].fillna(" ")
# Combine title and body into one display column.
# NOTE(review): there is no separator before "Body:", so the title runs
# straight into the label; left unchanged here.
scores_df["Content"] = "Title: " + scores_df["title"] + "Body: " + scores_df["selftext"]
# Keep only the display columns and preview the first five rows.
selected_columns = ["Content", "score", "author", "subreddit"]
scores_df = scores_df[selected_columns]
scores_df.head()
Content | score | author | subreddit | |
---|---|---|---|---|
0 | Title: "Berserk" creator Kentaro Miura dead at... | 33384 | enterthedragonpunch | anime |
1 | Title: Who will be the first seed in Best Girl... | 31830 | mpp00 | anime |
2 | Title: Best Girl 9 Prediction Tournament! Body: | 30302 | mpp00 | anime |
3 | Title: The Devil is a Part-Timer Season 2 Anno... | 30213 | Srikkk | anime |
4 | Title: "Spice and Wolf" New Anime AnnouncedBod... | 29222 | dorkmax_executives | anime |
# One frame of top-scoring posts per subreddit.
movie_score_df = scores_df.loc[scores_df['subreddit'].eq('movies')]
tv_score_df = scores_df.loc[scores_df['subreddit'].eq('television')]
anime_score_df = scores_df.loc[scores_df['subreddit'].eq('anime')]
# Columns shown in each table, in display order.
selected_columns_movies = ['Content', 'score', 'author']
selected_columns_tv = ['Content', 'score', 'author']
selected_columns_anime = ['Content', 'score', 'author']
# go.Table expects one list of cell values per column, so turn every
# selected column into a plain Python list.
data_values_movies = [values.tolist() for _, values in movie_score_df[selected_columns_movies].items()]
data_values_tv = [values.tolist() for _, values in tv_score_df[selected_columns_tv].items()]
data_values_anime = [values.tolist() for _, values in anime_score_df[selected_columns_anime].items()]
# Column captions and colour palette shared by the three tables.
header_values = ['Content', 'Score', 'Author']
header_color = "#FF4301"
body_color = "#ececec"
font_color = "white"
font_color_cell = "black"
# One table trace per subreddit, in the order the dropdown masks below expect
# (movies, television, anime). Only the movies table is visible initially.
# The anime table keeps its own, slightly different relative column widths.
fig = go.Figure(
    data=[
        go.Table(
            header=dict(values=header_values, fill_color=header_color, font=dict(color=font_color)),
            cells=dict(values=table_values, fill_color=body_color, font=dict(color=font_color_cell)),
            visible=(position == 0),
            columnwidth=column_widths,
        )
        for position, (table_values, column_widths) in enumerate(
            (
                (data_values_movies, [2, 0.25, 0.45]),
                (data_values_tv, [2, 0.25, 0.45]),
                (data_values_anime, [2.5, 0.2, 0.45]),
            )
        )
    ]
)
# Dropdown entries switch the visible table and the figure title together.
# The initial title uses exactly the wording of the "Movie" entry, so picking
# "Movie" again after another entry restores the same text.
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=[
                dict(label="Movie",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "Top 10 posts having maximum score for movies subreddit"}]),
                dict(label="Television",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "Top 10 posts having maximum score for television subreddit"}]),
                dict(label="Anime",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "Top 10 posts having maximum score for anime subreddit"}]),
            ],
            # Dropdown sits above the table, near its right edge.
            x=0.85,
            xanchor='left',
            y=1.15,
            yanchor='top',
        )
    ],
    margin=dict(l=20, r=20, t=15, b=20),
    height=350,
    title_x=0.5,  # centre the title
    title_text="Top 10 posts having maximum score for movies subreddit",
)
# Save the interactive table as standalone HTML, then render it.
fig.write_html("../../data/plots/table_top_score.html")
fig.show()
# Load the external Netflix movies dataset (title, release year, score,
# vote count, duration, main genre, main production country) and preview
# the first five rows.
df_external_movies = pd.read_csv("../../data/csv/best_movies_netflix_ext.csv")
df_external_movies.head()
TITLE | RELEASE_YEAR | SCORE | NUMBER_OF_VOTES | DURATION | MAIN_GENRE | MAIN_PRODUCTION | |
---|---|---|---|---|---|---|---|
0 | David Attenborough: A Life on Our Planet | 2020 | 9.0 | 31180 | 83 | documentary | GB |
1 | Inception | 2010 | 8.8 | 2268288 | 148 | scifi | GB |
2 | Forrest Gump | 1994 | 8.8 | 1994599 | 142 | drama | US |
3 | Anbe Sivam | 2003 | 8.7 | 20595 | 160 | comedy | IN |
4 | Bo Burnham: Inside | 2021 | 8.7 | 44074 | 87 | comedy | US |
# Load the external Netflix shows dataset and preview the first five rows.
# The preview must use the frame loaded on this line; previewing
# df_external_movies here would only repeat the movies table.
df_external_shows = pd.read_csv("../../data/csv/best_shows_netflix_ext.csv")
df_external_shows.head()
TITLE | RELEASE_YEAR | SCORE | NUMBER_OF_VOTES | DURATION | MAIN_GENRE | MAIN_PRODUCTION | |
---|---|---|---|---|---|---|---|
0 | David Attenborough: A Life on Our Planet | 2020 | 9.0 | 31180 | 83 | documentary | GB |
1 | Inception | 2010 | 8.8 | 2268288 | 148 | scifi | GB |
2 | Forrest Gump | 1994 | 8.8 | 1994599 | 142 | drama | US |
3 | Anbe Sivam | 2003 | 8.7 | 20595 | 160 | comedy | IN |
4 | Bo Burnham: Inside | 2021 | 8.7 | 44074 | 87 | comedy | US |