Clustering Wikipedia
Hi! In this article, I’ll walk through a simple clustering example using Wikipedia articles.
You can access the full code here: https://drive.google.com/drive/folders/1FKAqwAvaSmEt0jzL3lHu5qQGEcw4FQGS?usp=sharing
# Bring in the building blocks: a clusterer, a dimensionality reducer,
# and the helper that chains estimators together.
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Dimensionality reduction step: keep 50 components.
svd = TruncatedSVD(n_components=50)

# Clustering step: partition the reduced data into 6 clusters.
kmeans = KMeans(n_clusters=6)

# Chain the two steps so the SVD output is fed straight into KMeans.
pipeline = make_pipeline(svd, kmeans)
import pandas as pd
from scipy.sparse import csc_matrix

# Load the vectors. The file's first column has no header, so read it as
# the index instead of loading it as "Unnamed: 0" and dropping it afterwards.
documents = pd.read_csv("wikipedia-vectors.txt", index_col=0)

# The remaining column headers are used as the article titles.
titles = documents.columns

# Build a sparse matrix and transpose it so that each row of `articles`
# lines up with one entry of `titles`.
articles = csc_matrix(documents.values).T
# Train the whole pipeline (SVD followed by KMeans) on the article matrix.
pipeline.fit(articles)

# Ask the fitted pipeline for the cluster label of every article.
labels = pipeline.predict(articles)

# Put each label next to the title of the article it belongs to.
df = pd.DataFrame({'label': labels, 'article': titles})

# Print the table ordered by cluster label so that members of the same
# cluster appear together.
print(df.sort_values(by="label"))