Setup¶
In [ ]:
# Setup - Run only once per Kernel App
# Install a JDK via conda (Spark requires a Java runtime on the kernel image).
%conda install openjdk -y
# install PySpark (pinned to 3.3.0 to match the hadoop-aws 3.2.x connector used later)
%pip install pyspark==3.3.0
# restart kernel
# Injects JS that restarts the Jupyter kernel so the freshly installed
# packages are picked up. Only works inside classic Jupyter Notebook.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
Collecting package metadata (current_repodata.json): done Solving environment: done ==> WARNING: A newer version of conda exists. <== current version: 23.3.1 latest version: 23.9.0 Please update conda by running $ conda update -n base -c defaults conda Or to minimize the number of packages updated during conda update use conda install conda=23.9.0 # All requested packages already installed. Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: pyspark==3.3.0 in /opt/conda/lib/python3.10/site-packages (3.3.0) Requirement already satisfied: py4j==0.10.9.5 in /opt/conda/lib/python3.10/site-packages (from pyspark==3.3.0) (0.10.9.5) WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv [notice] A new release of pip is available: 23.2.1 -> 23.3.1 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages.
Out[ ]:
Create Spark Session¶
In [ ]:
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# NOTE: Hadoop/S3A options set through the Spark builder must be prefixed with
# "spark.hadoop." — otherwise Spark drops them with
# "Warning: Ignoring non-Spark config property", which is exactly what the
# previous bare "fs.s3a.aws.credentials.provider" key triggered.
spark = (
    SparkSession.builder.appName("PySparkApp")
    # Pull in the hadoop-aws connector so s3a:// paths can be read.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    # Authenticate to S3 via the container's task role credentials.
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)
print(spark.version)
Warning: Ignoring non-Spark config property: fs.s3a.aws.credentials.provider
:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2/cache The jars for the packages stored in: /root/.ivy2/jars org.apache.hadoop#hadoop-aws added as a dependency :: resolving dependencies :: org.apache.spark#spark-submit-parent-ecf5cc64-d6f9-4902-9d53-c90167f0db69;1.0 confs: [default] found org.apache.hadoop#hadoop-aws;3.2.2 in central found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central :: resolution report :: resolve 244ms :: artifacts dl 12ms :: modules in use: com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default] org.apache.hadoop#hadoop-aws;3.2.2 from central in [default] --------------------------------------------------------------------- | | modules || artifacts | | conf | number| search|dwnlded|evicted|| number|dwnlded| --------------------------------------------------------------------- | default | 2 | 0 | 0 | 0 || 2 | 0 | --------------------------------------------------------------------- :: retrieving :: org.apache.spark#spark-submit-parent-ecf5cc64-d6f9-4902-9d53-c90167f0db69 confs: [default] 0 artifacts copied, 2 already retrieved (0kB/9ms)
23/11/04 16:42:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
3.3.0
Check Cross Account S3 Access¶
In [ ]:
# Verify cross-account read access to teammate bucket 224518912016.
! aws s3 ls s3://sagemaker-us-east-1-224518912016/project/
PRE cleaned/ PRE comments/ PRE submissions/
In [ ]:
# Verify cross-account read access to teammate bucket 711387073580.
! aws s3 ls s3://sagemaker-us-east-1-711387073580/project/
PRE cleaned/ PRE comments/ PRE submissions/
In [ ]:
# Verify cross-account read access to teammate bucket 433974840707.
! aws s3 ls s3://sagemaker-us-east-1-433974840707/project/
PRE cleaned/ PRE comments/ PRE submissions/
Read in data¶
Comments¶
In [ ]:
import sagemaker

# NOTE(review): `session` is not used below — presumably created to trigger
# SageMaker SDK config loading; confirm before removing.
session = sagemaker.Session()

# Create or retrieve a Spark session
spark = SparkSession.builder.appName("ReadS3Parquet").getOrCreate()

# S3 prefixes holding cleaned comment data, one per team account bucket.
# (Plain strings: the previous f-strings had no placeholders.)
s3_directory1 = "s3a://sagemaker-us-east-1-395393721134/project/cleaned/comments/"
s3_directory2 = "s3a://sagemaker-us-east-1-433974840707/project/cleaned/comments/"
s3_directory3 = "s3a://sagemaker-us-east-1-711387073580/project/cleaned/comments/"
s3_directory4 = "s3a://sagemaker-us-east-1-224518912016/project/cleaned/comments/"

# Read all the Parquet files under each prefix into a DataFrame
df_comments1 = spark.read.parquet(s3_directory1)
df_comments2 = spark.read.parquet(s3_directory2)
df_comments3 = spark.read.parquet(s3_directory3)
df_comments4 = spark.read.parquet(s3_directory4)

# Stack the four sources. NOTE: union() resolves columns by POSITION, so all
# four inputs must share the same schema/column order (they are outputs of the
# same cleaning pipeline).
df_comments = df_comments1.union(df_comments2).union(df_comments3).union(df_comments4)
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml 23/11/04 16:42:32 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect. 23/11/04 16:42:32 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
In [ ]:
# Sanity check: per-subreddit row counts, ensuring all needed subreddits
# survived the union. show() triggers a full Spark job over the data.
df_comments.groupBy('subreddit').count().show()
# Inspect the available column names to plan downstream selection.
print(df_comments.columns)
[Stage 4:========================================================>(58 + 1) / 59]
+------------+-------+ | subreddit| count| +------------+-------+ | socialism| 371369| | Liberal| 96396| | Economics|1428423| | Libertarian|2706903| | centrist| 921871| |changemyview|3909587| |Ask_Politics| 60149| | finance| 137118| |Conservative|5231661| +------------+-------+ ['author', 'author_cakeday', 'author_flair_css_class', 'body', 'can_gild', 'controversiality', 'created_utc', 'distinguished', 'edited', 'gilded', 'id', 'is_submitter', 'link_id', 'parent_id', 'permalink', 'retrieved_on', 'score', 'stickied', 'subreddit', 'subreddit_id']
Submissions¶
In [ ]:
# S3 prefixes holding cleaned submission data, one per team account bucket.
# (Plain strings: the previous f-strings had no placeholders.)
s3_directory1 = "s3a://sagemaker-us-east-1-395393721134/project/cleaned/submissions/"
s3_directory2 = "s3a://sagemaker-us-east-1-433974840707/project/cleaned/submissions/"
s3_directory3 = "s3a://sagemaker-us-east-1-711387073580/project/cleaned/submissions/"
s3_directory4 = "s3a://sagemaker-us-east-1-224518912016/project/cleaned/submissions/"

# Read all the Parquet files under each prefix into a DataFrame
df_submissions1 = spark.read.parquet(s3_directory1)
df_submissions2 = spark.read.parquet(s3_directory2)
df_submissions3 = spark.read.parquet(s3_directory3)
df_submissions4 = spark.read.parquet(s3_directory4)

# Stack the four sources. NOTE: union() resolves columns by POSITION, so all
# four inputs must share the same schema/column order.
df_submissions = df_submissions1.union(df_submissions2).union(df_submissions3).union(df_submissions4)
23/11/04 16:43:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
In [ ]:
# Sanity check: per-subreddit row counts, ensuring all needed subreddits
# survived the union. show() triggers a full Spark job over the data.
df_submissions.groupBy('subreddit').count().show()
# Inspect the available column names to plan downstream selection.
print(df_submissions.columns)
[Stage 11:====================================================> (15 + 1) / 16]
+------------+------+ | subreddit| count| +------------+------+ | socialism| 40094| | Liberal| 11086| | Economics| 40604| | Libertarian| 51153| | centrist| 13594| |changemyview| 64632| |Ask_Politics| 5903| | finance| 28904| |Conservative|343938| +------------+------+ ['author', 'title', 'selftext', 'subreddit', 'score', 'num_comments', 'permalink', 'created_utc', 'url', 'domain', 'is_video', 'is_self', 'is_reddit_media_domain', 'spoiler', 'over_18', 'stickied', 'thumbnail', 'media', 'secure_media', 'gilded', 'archived', 'distinguished', 'crosspost_parent', 'crosspost_parent_list']
Data Prep¶
In [ ]:
# necessary imports
from pyspark.sql.functions import col, year, month, concat, date_format, avg, min, max, countDistinct
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
import pandas as pd
import numpy as np
In [ ]:
# Columns retained from each dataset for the analysis below.
comms_cols = ['subreddit', 'author', 'body', 'created_utc', 'score', 'controversiality', 'gilded', 'distinguished']
subs_cols = ['subreddit', 'author', 'title', 'selftext', 'created_utc', 'score', 'num_comments', 'gilded', 'distinguished']

def _project_with_date(df, cols):
    """Select the target columns and reformat created_utc to a yyyy-MM-dd string."""
    return df.select(cols).withColumn('created_utc', date_format('created_utc', 'yyyy-MM-dd'))

def _with_year_month(df):
    """Add integer year and month columns derived from the formatted date."""
    return df.withColumn('year', year('created_utc')).withColumn('month', month('created_utc'))

comms = _project_with_date(df_comments, comms_cols)
subs = _project_with_date(df_submissions, subs_cols)
comms_month_year = _with_year_month(comms)
subs_month_year = _with_year_month(subs)
In [ ]:
# Build a monthly activity table per subreddit: raw comment/submission counts
# plus distinct-author counts and the percentage of distinct authors.

# get count of comments + posts based on subreddit, year, month
comms_counts = comms_month_year.groupBy('subreddit', 'year', 'month').count().withColumnRenamed('count', 'comments_count')
subs_counts = subs_month_year.groupBy('subreddit', 'year', 'month').count().withColumnRenamed('count', 'submissions_count')
# combine comments + posts dfs together (outer join keeps months present in
# only one table; the resulting nulls are zeroed by fillna)
combined_counts = comms_counts.join(subs_counts, ['subreddit', 'year', 'month'], 'outer').fillna(0)
# group by subreddit, year, month + get counts of distinct authors
comms_distinct_authors = comms_month_year.groupBy('subreddit', 'year', 'month').agg(countDistinct('author').alias('distinct_authors_comments'))
subs_distinct_authors = subs_month_year.groupBy('subreddit', 'year', 'month').agg(countDistinct('author').alias('distinct_authors_submissions'))
# merge distinct author dfs with combined_counts df
combined_counts_distinct = combined_counts.join(comms_distinct_authors, ['subreddit', 'year', 'month'], 'left') \
.join(subs_distinct_authors, ['subreddit', 'year', 'month'], 'left') \
.fillna(0)
# get percentages for distinct authors for comments + submissions
# (division by a zero count yields null in Spark with ANSI mode off, which the
# trailing fillna(0) zeroes out — TODO confirm ANSI mode is off on this cluster)
combined_counts_distinct_perc = combined_counts_distinct.withColumn('percent_distinct_authors_comments',
col('distinct_authors_comments') / col('comments_count') * 100) \
.withColumn('percent_distinct_authors_submissions',
col('distinct_authors_submissions') / col('submissions_count') * 100) \
.fillna(0)
# sanity check
combined_counts_distinct_perc.show()
# collect the full result to the driver as a pandas df and persist to CSV
activity_df = combined_counts_distinct_perc.toPandas()
activity_df.to_csv('../../data/csv/activity_df.csv', index=False)
# sanity check: subreddits present, rows per subreddit, and date coverage
print(activity_df['subreddit'].unique())
print(activity_df.groupby('subreddit').count())
print(activity_df.groupby('subreddit')[['year', 'month']].min())
print(activity_df.groupby('subreddit')[['year', 'month']].max())
+------------+----+-----+--------------+-----------------+-------------------------+----------------------------+---------------------------------+------------------------------------+ | subreddit|year|month|comments_count|submissions_count|distinct_authors_comments|distinct_authors_submissions|percent_distinct_authors_comments|percent_distinct_authors_submissions| +------------+----+-----+--------------+-----------------+-------------------------+----------------------------+---------------------------------+------------------------------------+ | Libertarian|2022| 12| 12659| 552| 3562| 259| 28.138083576901806| 46.92028985507246| | finance|2021| 2| 4943| 985| 2312| 362| 46.77321464697552| 36.751269035533| |Conservative|2022| 2| 203812| 14116| 27225| 2761| 13.357898455439326| 19.55936525928025| | socialism|2021| 7| 15061| 1746| 3911| 788| 25.967731226346192| 45.131729667812145| | socialism|2022| 4| 19905| 1382| 5480| 676| 27.530771163024365| 48.914616497829236| | centrist|2021| 2| 26016| 677| 2335| 231| 8.975246002460025| 34.12112259970458| | socialism|2021| 2| 12190| 1954| 3727| 939| 30.574241181296145| 48.05527123848516| | socialism|2021| 9| 13754| 1601| 4026| 638| 29.27148465900829| 39.85009369144285| | Libertarian|2023| 3| 19069| 702| 4695| 350| 24.62111280088101| 49.85754985754986| | centrist|2021| 7| 36968| 465| 2723| 182| 7.365829906946549| 39.13978494623656| |Conservative|2022| 9| 131923| 9907| 19807| 2168| 15.014061232688766| 21.883516705359845| | Economics|2022| 9| 71902| 6675| 11607| 505| 16.142805485243805| 7.5655430711610485| | Liberal|2021| 10| 2364| 344| 595| 148| 25.16920473773266| 43.02325581395349| |changemyview|2022| 8| 148335| 2084| 15983| 1099| 10.774935113088617| 52.73512476007678| |Conservative|2022| 3| 197741| 12029| 25880| 2444| 13.087827006033145| 20.317565882450744| | finance|2022| 5| 5337| 869| 2653| 299| 49.70957466741615| 34.407364787111625| |Conservative|2023| 2| 100924| 7158| 17065| 1676| 16.908763029606437| 23.414361553506566| | 
Economics|2021| 3| 52554| 1431| 8718| 541| 16.588651672565362| 37.80573025856045| | Economics|2021| 8| 30593| 828| 5197| 376| 16.987546170692642| 45.410628019323674| | centrist|2021| 3| 24945| 525| 2324| 204| 9.316496291842052| 38.857142857142854| +------------+----+-----+--------------+-----------------+-------------------------+----------------------------+---------------------------------+------------------------------------+ only showing top 20 rows
[Stage 38:====================================================> (15 + 1) / 16]
['Libertarian' 'finance' 'Conservative' 'socialism' 'centrist' 'Economics' 'Liberal' 'changemyview' 'Ask_Politics'] year month comments_count submissions_count \ subreddit Ask_Politics 27 27 27 27 Conservative 27 27 27 27 Economics 27 27 27 27 Liberal 27 27 27 27 Libertarian 27 27 27 27 centrist 27 27 27 27 changemyview 27 27 27 27 finance 27 27 27 27 socialism 27 27 27 27 distinct_authors_comments distinct_authors_submissions \ subreddit Ask_Politics 27 27 Conservative 27 27 Economics 27 27 Liberal 27 27 Libertarian 27 27 centrist 27 27 changemyview 27 27 finance 27 27 socialism 27 27 percent_distinct_authors_comments \ subreddit Ask_Politics 27 Conservative 27 Economics 27 Liberal 27 Libertarian 27 centrist 27 changemyview 27 finance 27 socialism 27 percent_distinct_authors_submissions subreddit Ask_Politics 27 Conservative 27 Economics 27 Liberal 27 Libertarian 27 centrist 27 changemyview 27 finance 27 socialism 27 year month subreddit Ask_Politics 2021 1 Conservative 2021 1 Economics 2021 1 Liberal 2021 1 Libertarian 2021 1 centrist 2021 1 changemyview 2021 1 finance 2021 1 socialism 2021 1 year month subreddit Ask_Politics 2023 12 Conservative 2023 12 Economics 2023 12 Liberal 2023 12 Libertarian 2023 12 centrist 2023 12 changemyview 2023 12 finance 2023 12 socialism 2023 12
In [ ]:
# Build a monthly moderation table: counts of deleted/removed content.
# comms : body = deleted / removed
# subs : title, selftext = deleted / removed
# filter for deleted/removed comments + posts (Reddit replaces the text of
# deleted/removed content with the literal markers '[deleted]' / '[removed]')
deleted_removed_comms = comms_month_year.filter(col('body').isin('[deleted]', '[removed]'))
deleted_removed_subs = subs_month_year.filter((col('title').isin('[deleted]', '[removed]')) | (col('selftext').isin('[deleted]', '[removed]')))
# get count of deleted/removed comments + posts based on subreddit, year, month
deleted_removed_comms_counts = deleted_removed_comms.groupBy('subreddit', 'year', 'month').count().withColumnRenamed('count', 'deleted_removed_comments_count')
deleted_removed_subs_counts = deleted_removed_subs.groupBy('subreddit', 'year', 'month').count().withColumnRenamed('count', 'deleted_removed_submissions_count')
# combine comments + posts dfs together (outer join keeps months present in
# only one table; nulls become 0)
deleted_removed_combined = deleted_removed_comms_counts.join(deleted_removed_subs_counts, ['subreddit', 'year', 'month'], 'outer').fillna(0)
# sanity check
deleted_removed_combined.show()
# collect to the driver as a pandas df and persist to CSV
moderation_df = deleted_removed_combined.toPandas()
moderation_df.to_csv('../../data/csv/moderation_df.csv', index=False)
# sanity check: subreddits present, rows per subreddit, and date coverage
print(moderation_df['subreddit'].unique())
print(moderation_df.groupby('subreddit').count())
print(moderation_df.groupby('subreddit')[['year', 'month']].min())
print(moderation_df.groupby('subreddit')[['year', 'month']].max())
+------------+----+-----+------------------------------+---------------------------------+ | subreddit|year|month|deleted_removed_comments_count|deleted_removed_submissions_count| +------------+----+-----+------------------------------+---------------------------------+ |Ask_Politics|2021| 1| 894| 673| |Ask_Politics|2021| 2| 849| 241| |Ask_Politics|2021| 3| 1007| 202| |Ask_Politics|2021| 4| 833| 200| |Ask_Politics|2021| 5| 1244| 245| |Ask_Politics|2021| 6| 963| 156| |Ask_Politics|2021| 7| 784| 124| |Ask_Politics|2021| 8| 303| 107| |Ask_Politics|2021| 9| 297| 105| |Ask_Politics|2021| 10| 232| 83| |Ask_Politics|2021| 11| 87| 65| |Ask_Politics|2021| 12| 356| 59| |Ask_Politics|2022| 1| 373| 81| |Ask_Politics|2022| 2| 241| 93| |Ask_Politics|2022| 3| 159| 107| |Ask_Politics|2022| 4| 142| 74| |Ask_Politics|2022| 5| 199| 69| |Ask_Politics|2022| 6| 209| 100| |Ask_Politics|2022| 7| 145| 109| |Ask_Politics|2022| 8| 110| 96| +------------+----+-----+------------------------------+---------------------------------+ only showing top 20 rows
[Stage 62:====================================================> (15 + 1) / 16]
['Ask_Politics' 'Conservative' 'Economics' 'Liberal' 'Libertarian' 'centrist' 'changemyview' 'finance' 'socialism'] year month deleted_removed_comments_count \ subreddit Ask_Politics 27 27 27 Conservative 27 27 27 Economics 27 27 27 Liberal 27 27 27 Libertarian 27 27 27 centrist 27 27 27 changemyview 27 27 27 finance 27 27 27 socialism 27 27 27 deleted_removed_submissions_count subreddit Ask_Politics 27 Conservative 27 Economics 27 Liberal 27 Libertarian 27 centrist 27 changemyview 27 finance 27 socialism 27 year month subreddit Ask_Politics 2021 1 Conservative 2021 1 Economics 2021 1 Liberal 2021 1 Libertarian 2021 1 centrist 2021 1 changemyview 2021 1 finance 2021 1 socialism 2021 1 year month subreddit Ask_Politics 2023 12 Conservative 2023 12 Economics 2023 12 Liberal 2023 12 Libertarian 2023 12 centrist 2023 12 changemyview 2023 12 finance 2023 12 socialism 2023 12
In [ ]:
# Build a monthly reputation table per subreddit: normalized average scores
# plus controversiality / gilded / distinguished counts.

# Average score per (subreddit, year, month) for comments and submissions.
comms_score = comms_month_year.groupby('subreddit', 'year', 'month').agg({'score': 'avg'}).withColumnRenamed('avg(score)', 'avg_comments_score')
subs_score = subs_month_year.groupby('subreddit', 'year', 'month').agg({'score': 'avg'}).withColumnRenamed('avg(score)', 'avg_submissions_score')

# Global min + max of the monthly averages, used for min-max normalization.
min_max_comments = comms_score.agg(F.min('avg_comments_score'), F.max('avg_comments_score')).collect()
min_comments, max_comments = min_max_comments[0]
min_max_submissions = subs_score.agg(F.min('avg_submissions_score'), F.max('avg_submissions_score')).collect()
min_submissions, max_submissions = min_max_submissions[0]

# Min-max normalize the comment averages. NOTE: rows whose average is exactly
# 0 are mapped straight to 0 rather than min-max scaled.
comms_score_norm = comms_score.withColumn(
    'normalized_comments_score',
    F.when(
        F.col('avg_comments_score') != 0,
        (F.col('avg_comments_score') - min_comments) / (max_comments - min_comments)
    ).otherwise(0)
)
# Same normalization for submissions.
subs_score_norm = subs_score.withColumn(
    'normalized_submissions_score',
    F.when(
        F.col('avg_submissions_score') != 0,
        (F.col('avg_submissions_score') - min_submissions) / (max_submissions - min_submissions)
    ).otherwise(0)
)

# Combine comments + posts scores (outer join keeps months present in only
# one table; nulls become 0).
combined_scores = comms_score_norm.join(subs_score_norm, ['subreddit', 'year', 'month'], 'outer').fillna(0)
# sanity check
combined_scores.show()

# (A redundant `reputation_df = combined_scores.toPandas()` used to run here:
# an expensive full collect whose result was overwritten below without ever
# being used. It has been removed.)

# Count of comments flagged controversial per group.
controversial_comms = comms_month_year.filter(col('controversiality') == 1) \
    .groupby('subreddit', 'year', 'month').agg({'controversiality': 'count'}) \
    .withColumnRenamed('count(controversiality)', 'controversial_comments')
# Counts of non-null gilded / distinguished values per group.
# NOTE(review): agg 'count' counts NON-NULL rows, not rows with gilded > 0 —
# if 'gilded' is never null this is just the row count; confirm intent.
gilded_comms = comms_month_year.groupby('subreddit', 'year', 'month').agg({
    'gilded': 'count',
}).withColumnRenamed('count(gilded)', 'total_gilded_comments')
distinguished_comms = comms_month_year.groupby('subreddit', 'year', 'month').agg({
    'distinguished': 'count'
}).withColumnRenamed('count(distinguished)', 'total_distinguished_comments')
gilded_subs = subs_month_year.groupby('subreddit', 'year', 'month').agg({
    'gilded': 'count',
}).withColumnRenamed('count(gilded)', 'total_gilded_submissions')
distinguished_subs = subs_month_year.groupby('subreddit', 'year', 'month').agg({
    'distinguished': 'count'
}).withColumnRenamed('count(distinguished)', 'total_distinguished_submissions')

# Attach the extra metrics; left joins keep every (subreddit, year, month)
# row of combined_scores, zeroing groups that lack a metric.
combined_scores = combined_scores.join(controversial_comms, ['subreddit', 'year', 'month'], 'left') \
    .join(gilded_subs, ['subreddit', 'year', 'month'], 'left') \
    .join(distinguished_subs, ['subreddit', 'year', 'month'], 'left') \
    .join(gilded_comms, ['subreddit', 'year', 'month'], 'left') \
    .join(distinguished_comms, ['subreddit', 'year', 'month'], 'left').fillna(0)

# Collect once to the driver as a pandas df and persist to CSV.
reputation_df = combined_scores.toPandas()
reputation_df.to_csv('../../data/csv/reputation_df.csv', index=False)

# sanity check: subreddits present, rows per subreddit, and date coverage
print(reputation_df['subreddit'].unique())
print(reputation_df.groupby('subreddit').count())
print(reputation_df.groupby('subreddit')[['year', 'month']].min())
print(reputation_df.groupby('subreddit')[['year', 'month']].max())
+------------+----+-----+------------------+-------------------------+---------------------+----------------------------+ | subreddit|year|month|avg_comments_score|normalized_comments_score|avg_submissions_score|normalized_submissions_score| +------------+----+-----+------------------+-------------------------+---------------------+----------------------------+ |Ask_Politics|2021| 1| 3.847379859937213| 0.16884459957467995| 9.099318403115872| 0.016698467115947583| |Ask_Politics|2021| 2|3.6339940689870454| 0.15044033339653748| 13.191397849462366| 0.02764815226969936| |Ask_Politics|2021| 3| 3.483548387096774| 0.13746457543940474| 14.133165829145728| 0.03016815780737888| |Ask_Politics|2021| 4|3.7325046824450876| 0.15893675456582382| 14.079691516709511| 0.030025069948829922| |Ask_Politics|2021| 5|3.1032566754128723| 0.10466487562805961| 13.223587223587224| 0.027734285375516306| |Ask_Politics|2021| 6|3.5455030691219642| 0.1428080917714742| 12.449056603773585| 0.02566177761018651| |Ask_Politics|2021| 7|3.6076481835564054| 0.14816803266833398| 13.873170731707317| 0.029472456637400044| |Ask_Politics|2021| 8| 3.292490118577075| 0.12098603104469982| 9.151351351351352| 0.01683769813730731| |Ask_Politics|2021| 9|3.1264367816091956| 0.10666413172929819| 9.375| 0.017436142600537656| |Ask_Politics|2021| 10| 3.392670157068063| 0.12962643810233407| 8.446969696969697| 0.014952896670306456| |Ask_Politics|2021| 11|3.1106194690265485| 0.10529990766499814| 7.675925925925926| 0.012889719099248152| |Ask_Politics|2021| 12|2.7448618001417433| 0.0737537514524466| 7.495798319327731| 0.012407729292938294| |Ask_Politics|2022| 1|2.7338516746411483| 0.07280414145514821| 8.064516129032258| 0.013929518163180233| |Ask_Politics|2022| 2| 2.785239852398524| 0.07723630958426843| 7.189542483660131| 0.011588242520741495| |Ask_Politics|2022| 3| 3.028787878787879| 0.09824203205937793| 6.516129032258065| 0.009786306520271775| |Ask_Politics|2022| 4| 2.815165876777251| 0.07981739296319078| 5.975806451612903| 
0.008340498290715174| |Ask_Politics|2022| 5|3.5857946554149085| 0.14628319231857723| 10.53968253968254| 0.020552628309284563| |Ask_Politics|2022| 6|2.5606375606375607| 0.05786463378812828| 5.919540229885057| 0.00818993977375843| |Ask_Politics|2022| 7|2.4002100840336134| 0.04402795811933323| 4.290697674418604| 0.003831443468581875| |Ask_Politics|2022| 8|2.7697777777777777| 0.07590272436418706| 7.178807947019868| 0.01155951878797349| +------------+----+-----+------------------+-------------------------+---------------------+----------------------------+ only showing top 20 rows
[Stage 94:=======================================================>(58 + 1) / 59]
['Libertarian' 'finance' 'Conservative' 'socialism' 'centrist' 'Economics' 'Liberal' 'changemyview' 'Ask_Politics'] year month avg_comments_score normalized_comments_score \ subreddit Ask_Politics 27 27 27 27 Conservative 27 27 27 27 Economics 27 27 27 27 Liberal 27 27 27 27 Libertarian 27 27 27 27 centrist 27 27 27 27 changemyview 27 27 27 27 finance 27 27 27 27 socialism 27 27 27 27 avg_submissions_score normalized_submissions_score \ subreddit Ask_Politics 27 27 Conservative 27 27 Economics 27 27 Liberal 27 27 Libertarian 27 27 centrist 27 27 changemyview 27 27 finance 27 27 socialism 27 27 controversial_comments total_gilded_submissions \ subreddit Ask_Politics 27 27 Conservative 27 27 Economics 27 27 Liberal 27 27 Libertarian 27 27 centrist 27 27 changemyview 27 27 finance 27 27 socialism 27 27 total_distinguished_submissions total_gilded_comments \ subreddit Ask_Politics 27 27 Conservative 27 27 Economics 27 27 Liberal 27 27 Libertarian 27 27 centrist 27 27 changemyview 27 27 finance 27 27 socialism 27 27 total_distinguished_comments subreddit Ask_Politics 27 Conservative 27 Economics 27 Liberal 27 Libertarian 27 centrist 27 changemyview 27 finance 27 socialism 27 year month subreddit Ask_Politics 2021 1 Conservative 2021 1 Economics 2021 1 Liberal 2021 1 Libertarian 2021 1 centrist 2021 1 changemyview 2021 1 finance 2021 1 socialism 2021 1 year month subreddit Ask_Politics 2023 12 Conservative 2023 12 Economics 2023 12 Liberal 2023 12 Libertarian 2023 12 centrist 2023 12 changemyview 2023 12 finance 2023 12 socialism 2023 12
In [ ]:
# Build a per-(author, subreddit) reputation table, mirroring the monthly
# reputation cell above but keyed on author instead of month.

# get avg score based on author + subreddit
comms_score = comms.groupby('author', 'subreddit').agg({'score': 'avg'}).withColumnRenamed('avg(score)', 'avg_comments_score')
subs_score = subs.groupby('author', 'subreddit').agg({'score': 'avg'}).withColumnRenamed('avg(score)', 'avg_submissions_score')
# get min + max for comments scores (used for min-max normalization)
min_max_comments = comms_score.agg(F.min('avg_comments_score'), F.max('avg_comments_score')).collect()
min_comments, max_comments = min_max_comments[0]
# get min + max for submissions scores
min_max_submissions = subs_score.agg(F.min('avg_submissions_score'), F.max('avg_submissions_score')).collect()
min_submissions, max_submissions = min_max_submissions[0]
# get normalized scores for comments (min-max; NOTE: averages of exactly 0 are
# mapped straight to 0 rather than scaled)
comms_score_norm = comms_score.withColumn(
'normalized_comments_score',
F.when(
F.col('avg_comments_score') != 0,
(F.col('avg_comments_score') - min_comments) / (max_comments - min_comments)
).otherwise(0)  # averages of exactly 0 bypass the scaling
)
# get normalized scores for submissions
subs_score_norm = subs_score.withColumn(
'normalized_submissions_score',
F.when(
F.col('avg_submissions_score') != 0,
(F.col('avg_submissions_score') - min_submissions) / (max_submissions - min_submissions)
).otherwise(0)  # averages of exactly 0 bypass the scaling
)
# combine comments + posts dfs together (outer join keeps authors who only
# commented or only posted; nulls become 0)
combined_scores = comms_score_norm.join(subs_score_norm, ['author', 'subreddit'], 'outer').fillna(0)
# sanity check
combined_scores.show()
# get controversial comments (count of comments flagged controversial)
controversial_comms = comms.filter(col('controversiality') == 1) \
.groupby('author', 'subreddit').agg({'controversiality': 'count'}) \
.withColumnRenamed('count(controversiality)', 'controversial_comments')
# counts of non-null gilded / distinguished values per (author, subreddit)
# NOTE(review): agg 'count' counts NON-NULL rows, not rows with gilded > 0 —
# if 'gilded' is never null this is just the row count; confirm intent.
gilded_comms = comms.groupby('author', 'subreddit').agg({'gilded': 'count'}).withColumnRenamed('count(gilded)', 'total_gilded_comments')
# get distinguished comments
distinguished_comms = comms.groupby('author', 'subreddit').agg({'distinguished': 'count'}).withColumnRenamed('count(distinguished)', 'total_distinguished_comments')
# get gilded posts
gilded_subs = subs.groupby('author', 'subreddit').agg({'gilded': 'count'}).withColumnRenamed('count(gilded)', 'total_gilded_submissions')
# get distinguished posts
distinguished_subs = subs.groupby('author', 'subreddit').agg({'distinguished': 'count' }).withColumnRenamed('count(distinguished)', 'total_distinguished_submissions')
# merge additional information into combined_scores df; left joins keep every
# (author, subreddit) row, zeroing pairs that lack a metric
combined_scores = combined_scores.join(controversial_comms, ['author', 'subreddit'], 'left') \
.join(gilded_subs, ['author', 'subreddit'], 'left') \
.join(distinguished_subs, ['author', 'subreddit'], 'left') \
.join(gilded_comms, ['author', 'subreddit'], 'left') \
.join(distinguished_comms, ['author', 'subreddit'], 'left').fillna(0)
# collect to the driver as a pandas df and persist to CSV
users_df = combined_scores.toPandas()
users_df.to_csv('../../data/csv/users_df.csv', index=False)
# sanity check: first rows and overall shape
print(users_df.head(10))
print(users_df.shape)
+--------------------+------------+-------------------+-------------------------+---------------------+----------------------------+ | author| subreddit| avg_comments_score|normalized_comments_score|avg_submissions_score|normalized_submissions_score| +--------------------+------------+-------------------+-------------------------+---------------------+----------------------------+ |----------_______---|Conservative| -6.964285714285714| 0.04727596195940722| 0.0| 0.0| |----------_______---| Economics| 2.5| 0.04851013887983308| 0.0| 0.0| | -------7654321|Conservative| 11.8| 0.04972289235182891| 0.0| 0.0| | -------I-------| Economics| 9.625| 0.04943926452370086| 0.0| 0.0| | ------sb|Conservative| 6.4363636363636365| 0.04902345486553611| 0.0| 0.0| | -----Nice-----| Economics| 1.5384615384615385| 0.04838475080373757| 0.0| 0.0| | ----Lu|changemyview| 2.6| 0.04852317923974702| 0.0| 0.0| | ----The_Truth-----| Economics| 1.0| 0.04831453348112408| 0.0| 0.0| | ----dotcom|changemyview| 7.0| 0.0490969550759601| 0.0| 0.0| | ---AI---|Conservative| 16.0| 0.050270587468214126| 0.0| 0.0| | ---Default---| Economics| 12.5| 0.04981417487122645| 0.0| 0.0| | ---Default---|changemyview| 1.0| 0.04831453348112408| 0.0| 0.0| | ---Keith---|changemyview| 1.0| 0.04831453348112408| 0.0| 0.0| | ---LEGO--| centrist| 5.675675675675675| 0.048924258417640434| 0.0| 0.0| | ---M0NK---|Conservative| 5.0| 0.048836147877681424| 0.0| 0.0| | ---Mr_Castle|Conservative| 21.0| 0.0509226054639108| 0.0| 0.0| | ---SG---|changemyview|0.49122807017543857| 0.048248187790333893| 5.0| 4.517323937299544E-5| | ---Sanguine---| finance| 8.0| 0.049227358675099434| 0.0| 0.0| | ---YNWA---|Conservative| 1.2727272727272727| 0.04835009809907117| 0.0| 0.0| | ---YNWA---| Libertarian| 1.0| 0.04831453348112408| 0.0| 0.0| +--------------------+------------+-------------------+-------------------------+---------------------+----------------------------+ only showing top 20 rows
author subreddit avg_comments_score \ 0 ----------_______--- Conservative -6.964286 1 ----------_______--- Economics 2.500000 2 -------7654321 Conservative 11.800000 3 -------I------- Economics 9.625000 4 ------sb Conservative 6.436364 5 -----Nice----- Economics 1.538462 6 ----Lu changemyview 2.600000 7 ----The_Truth----- Economics 1.000000 8 ----dotcom changemyview 7.000000 9 ---AI--- Conservative 16.000000 normalized_comments_score avg_submissions_score \ 0 0.047276 0.0 1 0.048510 0.0 2 0.049723 0.0 3 0.049439 0.0 4 0.049023 0.0 5 0.048385 0.0 6 0.048523 0.0 7 0.048315 0.0 8 0.049097 0.0 9 0.050271 0.0 normalized_submissions_score controversial_comments \ 0 0.0 6 1 0.0 0 2 0.0 2 3 0.0 0 4 0.0 12 5 0.0 0 6 0.0 0 7 0.0 0 8 0.0 0 9 0.0 0 total_gilded_submissions total_distinguished_submissions \ 0 0 0 1 0 0 2 0 0 3 0 0 4 0 0 5 0 0 6 0 0 7 0 0 8 0 0 9 0 0 total_gilded_comments total_distinguished_comments 0 28 0 1 2 0 2 10 0 3 8 0 4 55 0 5 26 0 6 5 0 7 1 0 8 1 0 9 3 0 (933193, 11)
In [ ]:
# Drop rows whose author account was deleted or removed.
ghost_authors = ['[deleted]', '[removed]']
author_is_real = ~users_df['author'].isin(ghost_authors)
filtered_users_df = users_df[author_is_real]

# Persist the filtered table for downstream analysis.
filtered_users_df.to_csv('../../data/csv/filtered_users_df.csv', index=False)

# Quick sanity check: first rows and overall shape.
print(filtered_users_df.head(10))
print(filtered_users_df.shape)
author subreddit avg_comments_score \ 0 ----------_______--- Conservative -6.964286 1 ----------_______--- Economics 2.500000 2 -------7654321 Conservative 11.800000 3 -------I------- Economics 9.625000 4 ------sb Conservative 6.436364 5 -----Nice----- Economics 1.538462 6 ----Lu changemyview 2.600000 7 ----The_Truth----- Economics 1.000000 8 ----dotcom changemyview 7.000000 9 ---AI--- Conservative 16.000000 normalized_comments_score avg_submissions_score \ 0 0.047276 0.0 1 0.048510 0.0 2 0.049723 0.0 3 0.049439 0.0 4 0.049023 0.0 5 0.048385 0.0 6 0.048523 0.0 7 0.048315 0.0 8 0.049097 0.0 9 0.050271 0.0 normalized_submissions_score controversial_comments \ 0 0.0 6 1 0.0 0 2 0.0 2 3 0.0 0 4 0.0 12 5 0.0 0 6 0.0 0 7 0.0 0 8 0.0 0 9 0.0 0 total_gilded_submissions total_distinguished_submissions \ 0 0 0 1 0 0 2 0 0 3 0 0 4 0 0 5 0 0 6 0 0 7 0 0 8 0 0 9 0 0 total_gilded_comments total_distinguished_comments 0 28 0 1 2 0 2 10 0 3 8 0 4 55 0 5 26 0 6 5 0 7 1 0 8 1 0 9 3 0 (933184, 11)
Activity Visualizations¶
Determine which subreddits are the most active (comments + posts)¶
In [ ]:
# Plot monthly comment and post volume per subreddit.
subreddits = activity_df['subreddit'].unique()
# Display-name overrides for subreddits whose raw names aren't nicely cased
label_d = {
    'Ask_Politics': 'AskPolitics',
    'centrist': 'Centrist',
    'changemyview': 'ChangeMyView',
    'finance': 'Finance',
    'socialism': 'Socialism'
}
fig_comments = px.line(title='Comments Over Time for Each Subreddit', labels={'x': 'Date', 'y': 'Count'})
fig_submissions = px.line(title='Posts Over Time for Each Subreddit', labels={'x': 'Date', 'y': 'Count'})
for subreddit in subreddits:
    # One chronologically sorted slice serves both the comments and posts traces
    # (the original computed the identical slice twice)
    data = activity_df[activity_df['subreddit'] == subreddit].sort_values(by=['year', 'month'])
    dates = data['year'].astype(str) + '-' + data['month'].astype(str)
    # Fall back to the raw subreddit name when no pretty label is defined
    label = label_d.get(subreddit, subreddit)
    fig_comments.add_scatter(x=dates,
                             y=data['comments_count'],
                             mode='lines+markers',
                             name=label)
    fig_submissions.add_scatter(x=dates,
                                y=data['submissions_count'],
                                mode='lines+markers',
                                name=label)
# Shared styling for both figures (small fonts, rotated ticks, visible grid);
# update_layout merges nested dicts, so one call per figure replaces the
# original run of nine sequential calls
for fig, legend_title in ((fig_comments, 'Subreddits - Comments'),
                          (fig_submissions, 'Subreddits - Submissions')):
    fig.update_layout(
        xaxis_tickangle=-45,
        xaxis=dict(tickfont=dict(size=8), range=[-0.5, None], showgrid=True),
        yaxis=dict(tickfont=dict(size=8), range=[0, None], showgrid=True),
        showlegend=True,
        legend=dict(font=dict(size=8)),
        xaxis_title='Date',
        yaxis_title='Count',
        legend_title_text=legend_title,
        margin=dict(l=0, r=0, t=30, b=0),
    )
fig_comments.show();
fig_submissions.show();
Distinct Author Percentage for Posts + Comments¶
In [ ]:
# Plot monthly distinct-author percentages for comments and posts per subreddit.
fig_comments = px.line(title='Distinct Authors in Comments Over Time for Each Subreddit', labels={'x': 'Date', 'y': 'Percentage of Distinct Authors'})
fig_submissions = px.line(title='Distinct Authors in Posts Over Time for Each Subreddit', labels={'x': 'Date', 'y': 'Percentage of Distinct Authors'})
for subreddit in subreddits:
    # One chronologically sorted slice serves both traces
    # (the original computed the identical slice twice)
    data = activity_df[activity_df['subreddit'] == subreddit].sort_values(by=['year', 'month'])
    dates = data['year'].astype(str) + '-' + data['month'].astype(str)
    # Fall back to the raw subreddit name when no pretty label is defined
    label = label_d.get(subreddit, subreddit)
    fig_comments.add_scatter(x=dates,
                             y=data['percent_distinct_authors_comments'],
                             mode='lines+markers',
                             name=label)
    fig_submissions.add_scatter(x=dates,
                                y=data['percent_distinct_authors_submissions'],
                                mode='lines+markers',
                                name=label)
# Shared styling; percentages are bounded, so pin the y-axis to [0, 100]
for fig, legend_title in ((fig_comments, 'Subreddits - Comments'),
                          (fig_submissions, 'Subreddits - Submissions')):
    fig.update_layout(
        xaxis_tickangle=-45,
        xaxis=dict(tickfont=dict(size=8), range=[-0.5, None], showgrid=False),
        yaxis=dict(tickfont=dict(size=8), range=[0, 100], showgrid=False),
        showlegend=True,
        legend=dict(font=dict(size=8)),
        xaxis_title='Date',
        yaxis_title='Percentage',
        legend_title_text=legend_title,
        margin=dict(l=0, r=0, t=30, b=0),
    )
fig_comments.show();
fig_submissions.show();
Moderation Visualizations¶
Determine which subreddits have the most deleted / removed comments + posts¶
In [ ]:
# Plot monthly deleted/removed comment and post counts per subreddit.
fig_comments = px.line(title='Comment Moderation Over Time for Each Subreddit', labels={'x': 'Date', 'y': 'Count'})
fig_submissions = px.line(title='Post Moderation Over Time for Each Subreddit', labels={'x': 'Date', 'y': 'Count'})
for subreddit in subreddits:
    # One chronologically sorted slice serves both traces
    # (the original computed the identical slice twice)
    data = moderation_df[moderation_df['subreddit'] == subreddit].sort_values(by=['year', 'month'])
    dates = data['year'].astype(str) + '-' + data['month'].astype(str)
    # Fall back to the raw subreddit name when no pretty label is defined
    label = label_d.get(subreddit, subreddit)
    fig_comments.add_scatter(x=dates,
                             y=data['deleted_removed_comments_count'],
                             mode='lines+markers',
                             name=label)
    fig_submissions.add_scatter(x=dates,
                                y=data['deleted_removed_submissions_count'],
                                mode='lines+markers',
                                name=label)
# Shared styling for both figures
for fig, legend_title in ((fig_comments, 'Subreddits - Comments'),
                          (fig_submissions, 'Subreddits - Submissions')):
    fig.update_layout(
        xaxis_tickangle=-45,
        xaxis=dict(tickfont=dict(size=8), range=[-0.5, None], showgrid=False),
        yaxis=dict(tickfont=dict(size=8), range=[0, None], showgrid=False),
        showlegend=True,
        legend=dict(font=dict(size=8)),
        xaxis_title='Date',
        yaxis_title='Count',
        legend_title_text=legend_title,
        margin=dict(l=0, r=0, t=30, b=0),
    )
fig_comments.show();
fig_submissions.show();
Reputation / Public Opinion Visualizations¶
Determine likes / dislikes on comments + posts for each subreddit¶
In [ ]:
# Plot min-max-normalized average comment and post scores per subreddit.
# (Original carried a stale "# bubble chart" note; these are line charts.)
fig_comments = px.line(title='Average Comment Scores (Min-Max Normalized) Over Time for Each Subreddit', labels={'x': 'Date', 'y': 'Score'})
fig_submissions = px.line(title='Average Post Scores (Min-Max Normalized) Over Time for Each Subreddit', labels={'x': 'Date', 'y': 'Score'})
for subreddit in subreddits:
    # One chronologically sorted slice serves both traces
    # (the original computed the identical slice twice)
    data = reputation_df[reputation_df['subreddit'] == subreddit].sort_values(by=['year', 'month'])
    dates = data['year'].astype(str) + '-' + data['month'].astype(str)
    # Fall back to the raw subreddit name when no pretty label is defined
    label = label_d.get(subreddit, subreddit)
    fig_comments.add_scatter(x=dates,
                             y=data['normalized_comments_score'],
                             mode='lines+markers',
                             name=label)
    fig_submissions.add_scatter(x=dates,
                                y=data['normalized_submissions_score'],
                                mode='lines+markers',
                                name=label)
# Shared styling for both figures
for fig, legend_title in ((fig_comments, 'Subreddits - Comments'),
                          (fig_submissions, 'Subreddits - Submissions')):
    fig.update_layout(
        xaxis_tickangle=-45,
        xaxis=dict(tickfont=dict(size=8), range=[-0.5, None], showgrid=False),
        yaxis=dict(tickfont=dict(size=8), range=[0, None], showgrid=False),
        showlegend=True,
        legend=dict(font=dict(size=8)),
        xaxis_title='Date',
        yaxis_title='Score',
        legend_title_text=legend_title,
        margin=dict(l=0, r=0, t=30, b=0),
    )
fig_comments.show();
fig_submissions.show();
Determine Most Controversial Subreddits¶
In [ ]:
from tabulate import tabulate
# Aggregate award / controversy counters per subreddit
agg_spec = {
    'total_distinguished_submissions': 'sum',
    'total_gilded_submissions': 'sum',
    'total_distinguished_comments': 'sum',
    'total_gilded_comments': 'sum',
    'controversial_comments': 'sum'
}
grouped_df = reputation_df.groupby('subreddit').agg(agg_spec).reset_index()
# Human-readable column headers
grouped_df.columns = ['Subreddit', 'Distinguished Submissions', 'Gilded Submissions', 'Distinguished Comments', 'Gilded Comments', 'Controversial Comments']
# Swap raw subreddit names for display labels, most controversial first
grouped_df['Subreddit'] = grouped_df['Subreddit'].replace(label_d)
grouped_df = grouped_df.sort_values(by='Controversial Comments', ascending=False)
# formatting for numerics (add commas every 3 digits)
def add_commas(val):
    """Return *val* formatted with thousands separators when numeric, unchanged otherwise."""
    return f'{val:,}' if isinstance(val, (int, float)) else val
# formatting for numerics (add commas every 3 digits)
numeric_cols = ['Distinguished Submissions', 'Gilded Submissions', 'Distinguished Comments', 'Gilded Comments', 'Controversial Comments']
reputation_df_str = grouped_df.copy()
# apply the formatter one column at a time
for col in numeric_cols:
    reputation_df_str[col] = reputation_df_str[col].map(add_commas)
# get pretty table
table = tabulate(reputation_df_str, headers='keys', tablefmt='pretty', showindex=False)
print("Grouped Data:")
print(table)
Grouped Data: +--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+ | Subreddit | Distinguished Submissions | Gilded Submissions | Distinguished Comments | Gilded Comments | Controversial Comments | +--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+ | Conservative | 16 | 343,938 | 15,474 | 5,231,661 | 441,601 | | Libertarian | 34 | 51,153 | 20,053 | 2,706,903 | 198,658 | | ChangeMyView | 40 | 64,632 | 149,978 | 3,909,587 | 167,041 | | Economics | 20 | 40,604 | 37,890 | 1,428,423 | 100,945 | | Centrist | 8 | 13,594 | 4,122 | 921,871 | 100,272 | | Socialism | 14 | 40,094 | 17,093 | 371,369 | 11,285 | | Finance | 0 | 28,904 | 336 | 137,118 | 8,853 | | Liberal | 0 | 11,086 | 209 | 96,396 | 1,962 | | AskPolitics | 11 | 5,903 | 5,747 | 60,149 | 1,154 | +--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+
In [ ]:
# create plotly datatable
header_spec = dict(
    values=list(reputation_df_str.columns),
    fill_color='lightgreen',
    align='center',
    font=dict(size=14, color='black')
)
cell_spec = dict(
    values=[reputation_df_str[col] for col in reputation_df_str.columns],
    align='center'
)
fig = go.Figure(data=go.Table(header=header_spec, cells=cell_spec))
fig.show();
# save as html
pio.write_html(fig, file='../../data/plots/controversial_subreddit_raw.html')
In [ ]:
# use z-score normalization since the difference b/w min and max is quite large
def z_score_normalize(column):
    """Standardize *column* to zero mean and unit (sample) standard deviation."""
    mu, sigma = column.mean(), column.std()
    return (column - mu) / sigma
normalized_df = grouped_df.copy()
# normalize every column except the subreddit label
numeric_columns = normalized_df.columns.difference(['Subreddit'])
normalized_df[numeric_columns] = normalized_df[numeric_columns].apply(z_score_normalize, axis=0)
# formatting for numerics (up to 4 decimal points)
def limit_decimals(val):
    """Format numeric values with exactly four decimal places; pass anything else through."""
    return f'{val:.4f}' if isinstance(val, (int, float)) else val
# apply the 4-decimal formatter cell-by-cell (string cells pass through unchanged)
normalized_df_str = normalized_df.applymap(limit_decimals)
# get pretty table
normalized_table = tabulate(normalized_df_str, headers='keys', tablefmt='pretty', showindex=False)
print("Normalized Data:")
print(normalized_table)
Normalized Data: +--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+ | Subreddit | Distinguished Submissions | Gilded Submissions | Distinguished Comments | Gilded Comments | Controversial Comments | +--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+ | Conservative | 0.0081 | 2.6210 | -0.2621 | 1.8970 | 2.2798 | | Libertarian | 1.3135 | -0.1465 | -0.1653 | 0.5592 | 0.5858 | | ChangeMyView | 1.7487 | -0.0191 | 2.5800 | 1.1965 | 0.3654 | | Economics | 0.2982 | -0.2463 | 0.2116 | -0.1182 | -0.0955 | | Centrist | -0.5722 | -0.5016 | -0.5020 | -0.3866 | -0.1002 | | Socialism | -0.1370 | -0.2511 | -0.2279 | -0.6783 | -0.7207 | | Finance | -1.1524 | -0.3569 | -0.5820 | -0.8024 | -0.7376 | | Liberal | -1.1524 | -0.5253 | -0.5846 | -0.8240 | -0.7857 | | AskPolitics | -0.3546 | -0.5743 | -0.4676 | -0.8432 | -0.7913 | +--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+
In [ ]:
# create plotly datatable
header_spec = dict(
    values=list(normalized_df_str.columns),
    fill_color='lightgreen',
    align='center',
    font=dict(size=14, color='black')
)
cell_spec = dict(
    values=[normalized_df_str[col] for col in normalized_df_str.columns],
    align='center'
)
fig = go.Figure(data=go.Table(header=header_spec, cells=cell_spec))
fig.show();
# save as html
pio.write_html(fig, file='../../data/plots/controversial_subreddit_normalized.html')
Determine Most Controversial Users¶
In [ ]:
# Rank individual users by controversial comment count (deleted/removed accounts excluded).
filtered_users_df = users_df[~users_df['author'].isin(['[deleted]', '[removed]'])]
# .copy() gives an independent frame so the header rename and formatting below
# cannot trigger pandas chained-assignment warnings on a view of users_df
filtered_users_df = filtered_users_df[['author', 'subreddit',
                                       'total_distinguished_submissions', 'total_gilded_submissions',
                                       'total_distinguished_comments', 'total_gilded_comments',
                                       'controversial_comments']].copy()
# rename
filtered_users_df.columns = ['User', 'Subreddit', 'Distinguished Submissions', 'Gilded Submissions', 'Distinguished Comments', 'Gilded Comments', 'Controversial Comments']
filtered_users_df = filtered_users_df.sort_values(by='Controversial Comments', ascending=False)
top_20_users = filtered_users_df.head(20)
# formatting for numerics (add commas every 3 digits)
numeric_cols = ['Distinguished Submissions', 'Gilded Submissions', 'Distinguished Comments', 'Gilded Comments', 'Controversial Comments']
top_20_users_str = top_20_users.copy()
top_20_users_str[numeric_cols] = top_20_users_str[numeric_cols].applymap(add_commas)
# format subreddit col
top_20_users_str['Subreddit'] = top_20_users_str['Subreddit'].replace(label_d)
# get pretty table; header fixed — this is a per-user table, not grouped data
table = tabulate(top_20_users_str, headers='keys', tablefmt='pretty', showindex=False)
print("Top 20 Users by Controversial Comments:")
print(table)
Grouped Data: +----------------------+--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+ | User | Subreddit | Distinguished Submissions | Gilded Submissions | Distinguished Comments | Gilded Comments | Controversial Comments | +----------------------+--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+ | Saanvik | Centrist | 0 | 33 | 0 | 10,676 | 1,389 | | wingman43487 | Libertarian | 0 | 1 | 0 | 12,475 | 1,111 | | incendiaryblizzard | Centrist | 0 | 24 | 0 | 5,519 | 1,100 | | bhknb | Libertarian | 0 | 11 | 0 | 6,754 | 872 | | cstar1996 | Centrist | 0 | 13 | 0 | 6,178 | 868 | | quit_lying_already | Centrist | 0 | 111 | 0 | 6,714 | 836 | | nekomancey | Conservative | 0 | 37 | 0 | 8,443 | 816 | | abqguardian | Centrist | 0 | 21 | 0 | 4,119 | 800 | | Fuzzy_Yogurt_Bucket | Centrist | 0 | 5 | 0 | 3,150 | 787 | | SteelmanINC | Centrist | 0 | 2 | 0 | 4,448 | 779 | | ultimis | Conservative | 0 | 58 | 37 | 8,699 | 777 | | JFMV763 | Libertarian | 0 | 347 | 0 | 3,401 | 758 | | dazedANDconfused2020 | Conservative | 0 | 0 | 0 | 7,800 | 739 | | SgtFraggleRock | Conservative | 0 | 14 | 0 | 12,823 | 706 | | PB_Mack | Conservative | 0 | 1 | 0 | 10,339 | 697 | | iushciuweiush | Libertarian | 0 | 249 | 0 | 4,521 | 690 | | twilightknock | Centrist | 0 | 68 | 0 | 4,679 | 690 | | johntwit | Libertarian | 0 | 1,553 | 0 | 4,614 | 685 | | unkorrupted | Centrist | 0 | 3 | 0 | 4,623 | 667 | | Nikkolios | Conservative | 0 | 9 | 0 | 7,340 | 665 | +----------------------+--------------+---------------------------+--------------------+------------------------+-----------------+------------------------+
In [ ]:
# create plotly datatable
header_spec = dict(
    values=list(top_20_users_str.columns),
    fill_color='lightgreen',
    align='center',
    font=dict(size=14, color='black')
)
cell_spec = dict(
    values=[top_20_users_str[col] for col in top_20_users_str.columns],
    align='center'
)
fig = go.Figure(data=go.Table(header=header_spec, cells=cell_spec))
fig.show();
# save as html
pio.write_html(fig, file='../../data/plots/controversial_user.html')
In [ ]: