# Setup - Run only once per Kernel App
%conda install openjdk -y
# install PySpark
%pip install pyspark==3.4.0
# install spark-nlp
%pip install spark-nlp==5.1.3
# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
Collecting package metadata (current_repodata.json): done
Solving environment: done
# All requested packages already installed.
Requirement already satisfied: pyspark==3.4.0 in /opt/conda/lib/python3.10/site-packages (3.4.0)
Requirement already satisfied: py4j==0.10.9.7 in /opt/conda/lib/python3.10/site-packages (from pyspark==3.4.0) (0.10.9.7)
Requirement already satisfied: spark-nlp==5.1.3 in /opt/conda/lib/python3.10/site-packages (5.1.3)
Note: you may need to restart the kernel to use updated packages.
import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.functions import date_format, avg, to_date, count
from pyspark.ml.feature import Word2Vec, PCA, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.sql.types import StringType
from itertools import chain
import matplotlib.pyplot as plt
# Import pyspark and build Spark session
spark = SparkSession.builder \
    .appName("Spark NLP") \
    .master("local[*]") \
    .config("spark.driver.memory", "16G") \
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.jars.packages",
            "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.ContainerCredentialsProvider") \
    .getOrCreate()
:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2/cache
23/12/05 23:36:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/05 23:36:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
print(f"Spark version: {spark.version}")
print(f"sparknlp version: {sparknlp.version()}")
Spark version: 3.4.0
sparknlp version: 5.1.3
import sagemaker
sess = sagemaker.Session()
bucket = sess.default_bucket()
print(bucket)
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker-us-east-1-433974840707
# S3 directory path
s3_directory_subs = f"s3a://{bucket}/project/nlp_cleaned_submissions/"
# Read all the Parquet files in the directory into a DataFrame
submissions_cleaned = spark.read.parquet(s3_directory_subs)
print("Number of records for all subreddit submissions:", submissions_cleaned.count())
Number of records for all subreddit submissions: 599908
submissions_cleaned.columns
['author', 'title', 'selftext', 'subreddit', 'score', 'num_comments', 'permalink', 'created_utc', 'url', 'domain', 'is_video', 'is_self', 'is_reddit_media_domain', 'spoiler', 'over_18', 'stickied', 'thumbnail', 'media', 'secure_media', 'gilded', 'archived', 'distinguished', 'cleaned_body', 'cleaned_title']
submissions_cleaned.show(5)
[submissions_cleaned.show(5): a 24-column table too wide to render here. The first five rows are posts from r/socialism, r/Economics, and r/Liberal, showing metadata (score, num_comments, created_utc, domain, flags such as is_video/over_18) plus the tokenized cleaned_body and cleaned_title arrays. Only showing top 5 rows.]
submissions_cleaned.printSchema()
root
 |-- distinguished: string (nullable = false)
 |-- subreddit: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- num_comments: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- over_18: boolean (nullable = true)
 |-- cleaned_body: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- spoiler: boolean (nullable = true)
 |-- is_video: boolean (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
from pyspark.sql.types import IntegerType
submissions_cleaned = submissions_cleaned.withColumn('year', F.year('created_utc')) \
    .withColumn('month', F.month('created_utc')) \
    .withColumn('day', F.dayofmonth('created_utc')) \
    .withColumn('distinguished', F.when(submissions_cleaned['distinguished'] == 'moderator', 'yes')
                                  .otherwise(submissions_cleaned['distinguished'])) \
    .withColumn('is_video', F.col('is_video').cast("string")) \
    .withColumn('spoiler', F.col('spoiler').cast("string")) \
    .withColumn('over_18', F.col('over_18').cast("string")) \
    .withColumn('num_comments', F.col('num_comments').cast(IntegerType()))
submissions_cleaned = submissions_cleaned.fillna({'distinguished': "no"})
# select required cols for clustering
submissions_cleaned = submissions_cleaned.select("distinguished", "subreddit", "year", "month", "day", "num_comments", "score", "over_18", "cleaned_body", "spoiler", "is_video", "gilded")
### WORD2VEC ###
from pyspark.ml.feature import Tokenizer, Word2Vec
# Step 3: Create and Train Word2Vec Model
word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol="cleaned_body", outputCol="embeddings")
model = word2Vec.fit(submissions_cleaned)
# Step 4: Transform the Data
df_with_embeddings = model.transform(submissions_cleaned)
df_with_embeddings.columns
['distinguished', 'subreddit', 'year', 'month', 'day', 'num_comments', 'score', 'over_18', 'cleaned_body', 'spoiler', 'is_video', 'gilded', 'embeddings']
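# Optional sanity check (illustrative sketch, not part of the original run): Word2VecModel
# exposes findSynonyms(), which returns the tokens closest to a query word by cosine similarity.
# The query word "economy" is only an example and assumes it occurs somewhere in cleaned_body.
model.findSynonyms("economy", 5).show(truncate=False)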
subreddit_col = df_with_embeddings.select("subreddit")
subreddit_col_pd = subreddit_col.toPandas()
# Select all columns except "subreddit"
pca_df = df_with_embeddings.drop("subreddit")
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
distinguished_indexer = StringIndexer(inputCol="distinguished", outputCol="distinguished_ix")
#subreddit_indexer = StringIndexer(inputCol="subreddit", outputCol="subreddit_ix")
video_indexer = StringIndexer(inputCol="is_video", outputCol="is_video_index")
spoiler_indexer = StringIndexer(inputCol="spoiler", outputCol="spoiler_index")
over_18_indexer = StringIndexer(inputCol="over_18", outputCol="over_18_index")
#subreddit_encoder = OneHotEncoder(inputCol="subreddit_ix", outputCol="subreddit_vec")
pipeline = Pipeline(stages=[distinguished_indexer, video_indexer, spoiler_indexer, over_18_indexer])
pca_df_encoded = pipeline.fit(pca_df).transform(pca_df)
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
vectorAssembler_features = VectorAssembler(
    inputCols=['distinguished_ix', 'year', 'month', 'day', 'score', 'gilded',
               'num_comments', 'over_18_index', 'spoiler_index', 'is_video_index'],
    outputCol='features')
scaler = StandardScaler(
inputCol = 'features',
outputCol = 'scaledFeatures',
withMean = True,
withStd = True
)
n_components = 2
pca = PCA(
k = n_components,
inputCol = 'scaledFeatures',
outputCol = 'pcaFeatures'
)
# Define the Pipeline
pipeline = Pipeline(stages=[vectorAssembler_features, scaler, pca])
# Fit and Transform pca_df_encoded using pipeline
pca_model = pipeline.fit(pca_df_encoded).transform(pca_df_encoded)
23/12/06 00:04:43 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
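# Illustrative alternative (a sketch, not the original run): keeping the fitted PipelineModel
# separate from the transform makes the PCA stage inspectable, e.g. its explained variance.
# The names pipeline_model / pca_stage below are hypothetical.
pipeline_model = pipeline.fit(pca_df_encoded)            # fit once and keep the model
pca_stage = pipeline_model.stages[-1]                    # PCAModel is the last pipeline stage
print("Explained variance per component:", pca_stage.explainedVariance)
pca_model = pipeline_model.transform(pca_df_encoded)     # same transformed DataFrame as above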
# save pca df in s3
pca_model.write.parquet(f"s3a://{bucket}/project/pca_subs/", mode='overwrite')
pca_model.columns
['distinguished', 'year', 'month', 'day', 'num_comments', 'score', 'over_18', 'cleaned_body', 'spoiler', 'is_video', 'gilded', 'embeddings', 'distinguished_ix', 'is_video_index', 'spoiler_index', 'over_18_index', 'features', 'scaledFeatures', 'pcaFeatures']
# Collect the 2-D PCA features to the driver as a NumPy array (see the lighter-weight sketch below)
X_pca = pca_model.rdd.map(lambda row: row.pcaFeatures).collect()
X_pca = np.array(X_pca)
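# Lighter-weight alternative (a sketch, assuming Spark >= 3.0): convert the vector column to a
# plain array inside Spark with pyspark.ml.functions.vector_to_array, then collect via pandas,
# so no pyspark Row/Vector objects have to be materialised on the driver. pca_pdf is hypothetical.
from pyspark.ml.functions import vector_to_array
pca_pdf = pca_model.select(vector_to_array("pcaFeatures").alias("pca")).toPandas()
X_pca = np.vstack(pca_pdf["pca"].to_numpy())             # shape (n_rows, 2)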
# change default style figure and font size
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12
def plot_pca(X_pca, y):
"""a scatter plot of the 2-dimensional submissions data"""
markers = ['s', 'x', 'o', '^', 'v', '<', '>', 'p', '*', 'h', '+', 'D']
colors = list(plt.rcParams['axes.prop_cycle'])
target = np.unique(y)
for idx, (t, m) in enumerate(zip(target, markers)):
subset = X_pca[y == t]
plt.scatter(subset[:, 0], subset[:, 1], s = 50,
c = colors[idx]['color'], label = t, marker = m)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc = 'lower left')
plt.tight_layout()
plt.show()
plot_pca(X_pca, subreddit_col_pd)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[51], line 1
----> 1 plot_pca(X_pca, subreddit_col_pd)

Cell In[50], line 13, in plot_pca(X_pca, y)
     11 target = np.unique(y)
     12 for idx, (t, m) in enumerate(zip(target, markers)):
---> 13     subset = X_pca[y == t]
     14     plt.scatter(subset[:, 0], subset[:, 1], s = 50,
     15                 c = colors[idx]['color'], label = t, marker = m)

IndexError: boolean index did not match indexed array along dimension 1; dimension is 2 but corresponding boolean dimension is 1
subreddit_col_pd.shape
(599908, 1)
X_pca
array([[-0.29978992, -0.51705833],
       [-0.30828099,  0.3426218 ],
       [-0.31157503, -0.07915274],
       ...,
       [-0.33580681,  1.0316967 ],
       [-0.33292898, -0.0131314 ],
       [ 3.34573769,  1.12503749]])
subreddit_col_pd
|        | subreddit    |
|--------|--------------|
| 0      | socialism    |
| 1      | socialism    |
| 2      | Economics    |
| 3      | Liberal      |
| 4      | socialism    |
| ...    | ...          |
| 599903 | Conservative |
| 599904 | Conservative |
| 599905 | Conservative |
| 599906 | finance      |
| 599907 | Conservative |

599908 rows × 1 columns
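# Why plot_pca failed above (a minimal fix sketch, not part of the original run): subreddit_col_pd
# is a (599908, 1) DataFrame, so `y == t` yields a 2-D boolean frame that cannot mask the rows of
# X_pca. Passing the labels as a 1-D array makes the boolean indexing line up:
y_labels = subreddit_col_pd["subreddit"].to_numpy()      # 1-D array of subreddit names
plot_pca(X_pca, y_labels)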
import seaborn as sns
# Concatenate the two DataFrames
result_df = pd.concat([subreddit_col_pd, pd.DataFrame(X_pca, columns=['PCA_Feature_1', 'PCA_Feature_2'])], axis=1)
# Plot using Seaborn
sns.scatterplot(x='PCA_Feature_1', y='PCA_Feature_2', hue='subreddit', data=result_df, palette='bright')
# save fig
plt.savefig("../../data/plots/pca_all_subreddits.png")
plt.savefig("../../website-source/pca_all_subreddits.png")
# Show the plot
plt.show()
/opt/conda/lib/python3.10/site-packages/seaborn/_core.py:1225: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
  if pd.api.types.is_categorical_dtype(vector):
result_df.to_csv("../../data/csv/pca_df.csv", index=False)
# Grouping subreddits as political or economics
result_df = pd.concat([subreddit_col_pd, pd.DataFrame(X_pca, columns=['PCA_Feature_1', 'PCA_Feature_2'])], axis=1)
def rename_subreddit(subreddit):
if subreddit in ("Economics", "finance"):
return "economics"
else:
return "political"
result_df["subreddit"] = result_df["subreddit"].apply(rename_subreddit)
# Plot using Seaborn with a categorical color palette
sns.scatterplot(x='PCA_Feature_1', y='PCA_Feature_2', hue='subreddit', data=result_df, palette='bright')
# Show the plot
plt.show()
/opt/conda/lib/python3.10/site-packages/seaborn/_core.py:1225: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
  if pd.api.types.is_categorical_dtype(vector):
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from statistics import mode
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.metrics import accuracy_score
import scipy.cluster.hierarchy as sch
import sklearn.cluster as cluster
sns.set_theme(style="whitegrid", palette='Set2')
import warnings
warnings.filterwarnings("ignore")
pca_df = pd.read_csv("../../data/csv/pca_df.csv")
pca_df.shape # get the number of rows and columns
(599908, 4)
pca_df.head()
|   | Unnamed: 0 | subreddit | PCA_Feature_1 | PCA_Feature_2 |
|---|------------|-----------|---------------|---------------|
| 0 | 0          | socialism | -0.299790     | -0.517058     |
| 1 | 1          | socialism | -0.308281     | 0.342622      |
| 2 | 2          | Economics | -0.311575     | -0.079153     |
| 3 | 3          | Liberal   | -0.281342     | -0.429890     |
| 4 | 4          | socialism | 3.307645      | -0.147045     |
pca_df.drop("Unnamed: 0", axis=1, inplace=True)
pca_df.head()
|   | subreddit | PCA_Feature_1 | PCA_Feature_2 |
|---|-----------|---------------|---------------|
| 0 | socialism | -0.299790     | -0.517058     |
| 1 | socialism | -0.308281     | 0.342622      |
| 2 | Economics | -0.311575     | -0.079153     |
| 3 | Liberal   | -0.281342     | -0.429890     |
| 4 | socialism | 3.307645      | -0.147045     |
# separate the features from the target
X = pca_df.drop("subreddit", axis=1)
y = pca_df["subreddit"]
# For k-means clustering we use the elbow method to pick a reasonable number of clusters.
# For k = 1..11 we fit a model and record two curves: the distortion (mean Euclidean distance
# of each sample to its nearest cluster centre, via cdist) and the inertia_ attribute
# (sum of squared distances of samples to their closest cluster centre).
distortions = []
inertias = []
max_k = 12
for k in range(1, max_k):
    kmeansmodel = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeansmodel.fit(X)
    distortions.append(sum(np.min(cdist(X, kmeansmodel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
    inertias.append(kmeansmodel.inertia_)
evaluation = pd.DataFrame({"Cluster": np.arange(1, max_k), "Distortion": distortions, "Inertia": inertias})
evaluation
|    | Cluster | Distortion | Inertia      |
|----|---------|------------|--------------|
| 0  | 1       | 0.806508   | 2.318779e+06 |
| 1  | 2       | 0.720983   | 1.608663e+06 |
| 2  | 3       | 0.661291   | 1.154088e+06 |
| 3  | 4       | 0.611819   | 7.414638e+05 |
| 4  | 5       | 0.576421   | 5.787194e+05 |
| 5  | 6       | 0.451568   | 4.715226e+05 |
| 6  | 7       | 0.431004   | 3.836171e+05 |
| 7  | 8       | 0.415867   | 3.163443e+05 |
| 8  | 9       | 0.407123   | 2.576952e+05 |
| 9  | 10      | 0.382387   | 2.146869e+05 |
| 10 | 11      | 0.310653   | 1.823192e+05 |
# plot distortion and inertia for kmeans
fig, ax = plt.subplots(1, 2)
# increase the size of the figure
fig.set_size_inches(12, 4)
# template theme
sns.set_theme(style="darkgrid")
ax[0].plot(evaluation.Cluster, evaluation.Distortion, 'bx-')
ax[0].set_xlabel('Number of Clusters (k)')
ax[0].set_ylabel('Distortion')
# vertical line to indicate the optimal number of clusters at 6 in figure 1
ax[0].axvline(x=6, linestyle='--', c='red')
ax[1].plot(evaluation.Cluster, evaluation.Inertia, 'bx-')
ax[1].set_xlabel('Number of Clusters (k)')
ax[1].set_ylabel('Inertia')
# vertical line to indicate the optimal number of clusters at 4
plt.axvline(x=4, linestyle='--', c='red')
fig.suptitle('The Elbow Method showing the optimal k (Distortion and Inertia)', fontsize=14, fontweight='bold')
plt.savefig("../../data/plots/elbow_method.png")
plt.show()
# Plotting clusters for k = 5, a compromise between the inertia elbow (~4) and the distortion elbow (~6) above
bestK = KMeans(n_clusters=5, init='k-means++', random_state=42)
labels5 = bestK.fit_predict(X)
pca_df['kmeans_labels'] = labels5
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(1,2, figsize=(10,5))
sns.scatterplot(x="PCA_Feature_1", y="PCA_Feature_2", hue="subreddit", data=pca_df, ax=ax[0]).set(title='PCA Features By Subreddit')
sns.scatterplot(x="PCA_Feature_1", y="PCA_Feature_2", hue="kmeans_labels", data=pca_df, ax=ax[1]).set(title='K-Means Clustering Plot (5 Clusters)')
plt.savefig("../../data/plots/kmeans_clusters_elbow.png")
plt.savefig("../../website-source/kmeans_clusters_elbow.png")
plt.show()
def maximize_silhouette(X,algo="birch",nmin=2, nmax=20,i_plot=False):
# PARAM
i_print=False
#FORCE CONTIGUOUS
X=np.ascontiguousarray(X)
# LOOP OVER HYPER-PARAM
params=[]; sil_scores=[]
sil_max=-10
for param in range(nmin,nmax+1):
if(algo=="kmeans"):
print("Working on",param,"clusters")
model = KMeans(n_clusters=param, init='random')
labels=model.fit_predict(X)
try:
sil_scores.append(silhouette_score(X,labels))
params.append(param)
except:
continue
if(i_print): print(param,sil_scores[-1])
if(sil_scores[-1]>sil_max):
opt_param=param
sil_max=sil_scores[-1]
opt_labels=labels
print("OPTIMAL NUMBER OF CLUSTERS =",opt_param)
if(i_plot):
fig, ax = plt.subplots()
ax.plot(params, sil_scores, "-o")
ax.set(xlabel='Number Of Clusters (k)', ylabel='Silhouette Score', title='Silhouette Score By Number Of Clusters')
plt.axvline(x=opt_param, color='red', linestyle='--')
#plt.text(opt_param+0.2, sil_max+0.01, 'k = %d' % opt_param)
plt.savefig("../../data/plots/silhouette_score_tuning.png")
plt.savefig("../../website-source/silhouette_score_tuning.png")
plt.show()
return opt_labels
k_means_opt_labels=maximize_silhouette(X,algo="kmeans",nmin=4, nmax=10, i_plot=True)
Working on 4 clusters
Working on 5 clusters
Working on 6 clusters
Working on 7 clusters
Working on 8 clusters
Working on 9 clusters
Working on 10 clusters
OPTIMAL NUMBER OF CLUSTERS = 4
range_n_clusters = [4, 6, 8, 9, 10]
for n_clusters in range_n_clusters:
# Create a subplot with 1 row and 2 columns
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
# The 1st subplot is the silhouette plot
# The silhouette coefficient can range from -1, 1 but in this example all
# lie within [-0.1, 1]
ax1.set_xlim([-0.1, 1])
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
# Initialize the clusterer with n_clusters value and a random generator
# seed of 10 for reproducibility.
clusterer = KMeans(n_clusters=n_clusters, random_state=10)
cluster_labels = clusterer.fit_predict(X)
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(X, cluster_labels)
print(
"For n_clusters =",
n_clusters,
"The average silhouette_score is :",
silhouette_avg,
)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in range(n_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax1.fill_betweenx(
np.arange(y_lower, y_upper),
0,
ith_cluster_silhouette_values,
facecolor=color,
edgecolor=color,
alpha=0.7,
)
# Label the silhouette plots with their cluster numbers at the middle
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax1.set_title("Silhouette plot for clusters")
ax1.set_xlabel("Silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([]) # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# 2nd Plot showing the actual clusters formed
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(
X['PCA_Feature_1'], X['PCA_Feature_2'], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
)
# Labeling the clusters
centers = clusterer.cluster_centers_
# Draw white circles at cluster centers
ax2.scatter(
centers[:, 0],
centers[:, 1],
marker="o",
c="white",
alpha=1,
s=200,
edgecolor="k",
)
for i, c in enumerate(centers):
ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
ax2.set_title("Visualization of the Clustered Continuous Features")
ax2.set_xlabel("PCA_Feature_1")
ax2.set_ylabel("PCA_Feature_2")
plt.suptitle(
"Silhouette analysis for KMeans clustering on PCA Features with n_clusters = %d"
% n_clusters,
fontsize=14,
fontweight="bold",
)
plt.savefig("../../data/plots/silhouette_tuning.png")
plt.savefig("../../website-source/silhouette_tuning.png")
plt.show()
For n_clusters = 4 The average silhouette_score is : 0.8757106308706852
For n_clusters = 6 The average silhouette_score is : 0.48703280632198676
For n_clusters = 8 The average silhouette_score is : 0.49958818589536474
For n_clusters = 9 The average silhouette_score is : 0.5040308016434782
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
.../code/ml/clustering_subreddits.ipynb Cell 49 line <cell line: 3>()
     19 cluster_labels = clusterer.fit_predict(X)
---> 24 silhouette_avg = silhouette_score(X, cluster_labels)

File .../sklearn/metrics/cluster/_unsupervised.py:130, in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
--> 130 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))

File .../sklearn/metrics/cluster/_unsupervised.py:282, in silhouette_samples(X, labels, metric, **kwds)
--> 282 results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds))

KeyboardInterrupt:
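# Note (illustrative sketch, not part of the original run): silhouette_samples computes all pairwise
# distances, which is quadratic in the ~600k rows and is why the loop above was interrupted.
# silhouette_score accepts a sample_size argument (used below for k = 2) that keeps the average
# score tractable; the same idea could finish the per-k comparison. The sample size is arbitrary.
for n_clusters in [4, 6, 8, 9, 10]:
    labels = KMeans(n_clusters=n_clusters, random_state=10).fit_predict(X)
    print(n_clusters, silhouette_score(X, labels, sample_size=100000, random_state=10))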
def rename_subreddit(subreddit):
if subreddit in ("Economics", "finance"):
return "economics"
else:
return "political"
pca_df["subreddit"] = pca_df["subreddit"].apply(rename_subreddit)
X = pca_df.drop("subreddit", axis=1)
y = pca_df["subreddit"]
bestK = KMeans(n_clusters=2, init='k-means++', random_state=42)
labels2 = bestK.fit_predict(X)
pca_df['kmeans_labels'] = labels2
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(1,2, figsize=(10,5))
sns.scatterplot(x="PCA_Feature_1", y="PCA_Feature_2", hue="subreddit", data=pca_df, ax=ax[0]).set(title='PCA Features By Subreddit')
sns.scatterplot(x="PCA_Feature_1", y="PCA_Feature_2", hue="kmeans_labels", data=pca_df, ax=ax[1]).set(title='K-Means Clustering Plot (2 Clusters)')
plt.savefig("../../data/plots/kmeans_two_clusters.png")
plt.savefig("../../website-source/kmeans_two_clusters.png")
plt.show()
# silhouette score
silhouette_avg = silhouette_score(X, labels2, sample_size=300000)
print(
"For n_clusters =",
2,
"The average silhouette_score is :",
silhouette_avg
)
For n_clusters = 2 The average silhouette_score is : 0.9129267595873652
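# Optional follow-up (illustrative sketch): with the subreddits collapsed to two groups and k = 2
# clusters, a contingency table shows how well the clusters line up with the economics/political split.
print(pd.crosstab(pca_df["subreddit"], pca_df["kmeans_labels"]))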