{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Code source: Sebastian Curi and Andreas Krause.\n", "\n", "# Python Notebook Commands\n", "%matplotlib inline\n", "%reload_ext autoreload\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", "# Numerical Libraries\n", "import numpy as np\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "from matplotlib import rcParams\n", "rcParams['figure.figsize'] = (10, 5) # Change this if figures look ugly. \n", "import time \n", "import copy \n", "\n", "# IPython Libraries\n", "import IPython\n", "import ipywidgets\n", "from ipywidgets import interact, interactive, interact_manual\n", "\n", "\n", "# SKLEARN Libraries\n", "from sklearn import datasets\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def distance(x, mu):\n", " \"\"\"Distance function for k-means algorithm.\"\"\"\n", " return np.linalg.norm(x - mu, 2, axis=1) ** 2 \n", "\n", "class LloydsAlgorithm(object):\n", " \"\"\"Implementation of Lloyd's Algorithm. \n", " \n", " The algorithm has two phases, implemented as independent methods:\n", " - update_means(), where the mean estimates are updated. \n", " - assign_centers(), where each point of the data set is assigned to a center. \n", " \n", " There is also a method, run()', that alternates between the two phases. \n", " \n", " Finally, there is a method restart()', that restarts the algorithm.\n", " See `self.restart()' for more details on the restart.\n", " \n", " The algorithm has two properties: \n", " - self.means: nd.ndarray of shape [num_centers x dimension]\n", " self.means[i] contains the coordinate of the i-th mean.\n", " - self.centers: nd.ndarray of shape [num_points] \n", " self.centers[j] contains the mean associated to data point j.\n", " \"\"\"\n", "\n", " def __init__(self, num_centers, X):\n", " self.X = X\n", " self.num_points, self.dimension = X.shape\n", "\n", " self.num_centers = num_centers\n", " self.restart()\n", " \n", " def restart(self):\n", " \"\"\"Restart the algorithm.\n", " \n", " - Sample each mean uniformly at random from [min_domain, max_domain], coordinatewise.\n", " - Set the centers to None. \n", " \"\"\"\n", " min_, max_ = np.min(self.X, axis=0), np.max(self.X, axis=0) \n", " self.means = np.random.uniform(low=min_, high=max_, \n", " size=(self.num_centers, self.dimension))\n", " self.centers = None\n", " \n", " def update_means(self):\n", " \"\"\"Update the center means. \n", " \n", " - Set means to average of the data points assigned to it.\n", " \"\"\"\n", " for k in range(self.num_centers):\n", " idx = np.where(self.centers == k)[0]\n", " if len(idx) > 0:\n", " self.means[k] = self.X[idx].mean(axis=0)\n", " \n", " def assign_centers(self):\n", " \"\"\"Assign point centers.\n", " \n", " - Set center associated to each point by selecting the\n", " center that minimizes the distance to each point.\n", " \"\"\"\n", " if self.centers is None:\n", " self.centers = np.zeros(self.num_points, dtype=np.int)\n", " \n", " for i, x in enumerate(self.X):\n", " d = distance(x, self.means) \n", " self.centers[i] = np.argmin(d)\n", " \n", " def run(self, max_iter=100):\n", " \"\"\"Run Lloyd's algorithm until convergence or until max-iter.\"\"\"\n", " converged = False \n", " old_means = copy.deepcopy(self.means)\n", "\n", " i = 0 \n", " while not converged:\n", " i += 1\n", " self.assign_centers() \n", " self.update_means()\n", "\n", " if i > max_iter:\n", " break \n", " converged = np.all(self.means == old_means)\n", " old_means = copy.deepcopy(self.means)\n", " \n", " \n", "class KMeans(LloydsAlgorithm):\n", " \"\"\"Lloyd's Algorithm with K-Means initialization.\"\"\"\n", "\n", " def restart(self):\n", " \"\"\"Restart the algorithm.\n", " \n", " - Sample each mean uniformly at random from the data set.\n", " - Set the centers to None. \n", " \"\"\"\n", " idx = np.random.choice(self.num_points, self.num_centers, replace=False)\n", " self.means = self.X[idx]\n", " self.centers = None\n", " \n", "class KMeansPP(LloydsAlgorithm):\n", " \"\"\"Lloyd's Algorithm with K-Means++ initialization.\"\"\"\n", "\n", " def restart(self): \n", " \"\"\"Restart the algorithm.\n", " \n", " - for k = 1, sample a point uniformly at random from the data set and \n", " add it to the mean set. \n", " - for k = 2:K, \n", " sample a point with probability proportional to dist(x, mean_set) \n", " and add it to the mean set. \n", " \n", " The dist(x, mean_set) = min_{i} dist(x, mu_i). \n", " \n", " - Set the centers to None. \n", " \n", " Note: there is no need to remove points from the sampling distribution as\n", " points that are already in the mean set have distance 0 to the mean set.s\n", " \"\"\"\n", " self.means = np.empty((self.num_centers, self.dimension))\n", " i = np.random.choice(self.num_points, 1)\n", " self.means[0] = self.X[i]\n", " indexes = [i] \n", " for j in range(1, self.num_centers):\n", " probs = np.zeros(self.num_points)\n", " \n", " for i, x in enumerate(self.X):\n", " d = distance(x, self.means[:j])\n", " probs[i] = np.min(d)\n", " \n", " probs = probs / np.sum(probs)\n", " i = np.random.choice(self.num_points, 1, p = probs)\n", " assert np.all(probs[np.array(indexes)] == 0)\n", " # All previous indexes should have zero probability of being selected. \n", " \n", " self.means[j] = self.X[i] \n", " indexes.append(i)\n", " self.centers = None\n", " \n", " \n", "def k_means_cost(means, X):\n", " r\"\"\"Return the k-means cost.\n", " \n", " The k-means cost is Cost = \\sum_{j=1}^N \\min_{i={1, ..., K}} dist(x_j, means_i)\n", " \n", " Parameters\n", " ----------\n", " mean: np.ndarray \n", " Array of means with shape [num_centers, dimension]\n", " \n", " X: np.ndarray\n", " Array with data points with shape [num_points, dimension]\n", " \n", " Returns\n", " -------\n", " cost: float.\n", " Cost of current means.\n", " \"\"\"\n", " cost = 0\n", " for x in X:\n", " d = distance(x, means)\n", " cost += np.min(d)\n", " return cost\n", "\n", "\n", "def get_dataset(dataset, n_samples):\n", " # Generate Data\n", " if dataset == '3-blobs':\n", " X, Y = datasets.make_blobs(n_samples=n_samples, random_state=8)\n", " elif dataset == '4-blobs':\n", " X, Y = datasets.make_blobs(centers=4, n_samples=n_samples, random_state=8)\n", " elif dataset == 'varied density':\n", " X, Y = datasets.make_blobs(n_samples=2 * n_samples, random_state=8)\n", " idx = np.random.choice(np.where(Y == 0)[0], int(0.8 * sum(Y==0)), replace=False)\n", " X, Y = np.delete(X, idx, axis=0), np.delete(Y, idx, axis=0)\n", " idx = np.random.choice(np.where(Y == 1)[0], int(0.8 * sum(Y==1)), replace=False)\n", " X, Y = np.delete(X, idx, axis=0), np.delete(Y, idx, axis=0) \n", " elif dataset == 'circles':\n", " X, Y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)\n", " elif dataset == 'moons':\n", " X, Y = datasets.make_moons(n_samples=n_samples, noise=.05)\n", " elif dataset == 'no_structure':\n", " X, Y = np.random.rand(n_samples, 2), None \n", " elif dataset == 'anisotropic':\n", " X, Y = datasets.make_blobs(n_samples=n_samples, random_state=170)\n", " transformation = [[0.6, -0.6], [-0.4, 0.8]]\n", " X = np.dot(X, transformation)\n", " elif dataset == 'varied variance':\n", " X, Y = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=170)\n", " elif dataset == 'no structure':\n", " X, Y = np.random.rand(n_samples, 2), None \n", " return X, Y\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rcParams['figure.figsize'] = (10, 5) # Change this if figures look ugly. \n", "rcParams['font.size'] = 16 # Change this if figures look ugly. \n", "\n", "def run_lloyds(dataset, num_centers, init):\n", " X, Y = get_dataset(dataset, n_samples = 200)\n", " \n", " # Initialize algorithm\n", " if init == 'random':\n", " alg = LloydsAlgorithm(num_centers=num_centers, X=X)\n", " elif init == 'random from data set':\n", " alg = KMeans(num_centers=num_centers, X=X)\n", " elif 'kmeans++':\n", " alg = KMeansPP(num_centers=num_centers, X=X)\n", " \n", " button = ipywidgets.Button(description=\"Assign Center\")\n", " button2 = ipywidgets.Button(description=\"Update Means\")\n", " button3 = ipywidgets.Button(description=\"Restart Algorithm\")\n", " button4 = ipywidgets.Button(description=\"Run Algorithm\")\n", "\n", " \n", " def plot_lloyds():\n", " colors = np.array(['#377eb8', '#ff7f00', \n", " '#f781bf', '#a65628', '#984ea3',\n", " '#999999', '#e41a1c', '#dede00', \n", " '#000000', '#4daf4a'][0:num_centers+1])\n", " \n", " if alg.centers is not None:\n", " plt.scatter(alg.X[:, 0], alg.X[:, 1], marker='.', color=colors[alg.centers])\n", " else:\n", " plt.scatter(alg.X[:, 0], alg.X[:, 1], marker='.', color='g')\n", "\n", " plt.scatter(alg.means[:, 0], alg.means[:, 1], marker='*', color=colors[:num_centers], s=250)\n", " \n", " plt.title(f\"Cost: {k_means_cost(alg.means, alg.X):.1f}\")\n", " IPython.display.clear_output(wait=True)\n", " IPython.display.display(plt.gcf())\n", " plt.close()\n", "\n", " display(button);\n", " display(button2);\n", " display(button3);\n", " display(button4);\n", "\n", " \n", " def restart(b):\n", " alg.restart()\n", " plot_lloyds()\n", " \n", " def assign_centers(b):\n", " alg.assign_centers()\n", " plot_lloyds()\n", "\n", " def update_means(b):\n", " alg.update_means()\n", " plot_lloyds()\n", " \n", " def run(b):\n", " converged = False \n", " old_means = copy.deepcopy(alg.means)\n", " max_iter = 10\n", " i = 0 \n", " while not converged:\n", " i += 1\n", " alg.assign_centers()\n", " plot_lloyds()\n", " time.sleep(0.5)\n", " \n", " alg.update_means()\n", " plot_lloyds()\n", " time.sleep(0.5)\n", "\n", " if i > max_iter:\n", " break \n", " converged = np.all(alg.means == old_means)\n", " old_means = copy.deepcopy(alg.means)\n", " restart(None)\n", " button.on_click(assign_centers)\n", " button2.on_click(update_means)\n", " button3.on_click(restart)\n", " button4.on_click(run)\n", "\n", " plot_lloyds()\n", "\n", "n_centers_widget = ipywidgets.IntSlider(value=2, min=1, max=10, step=1, description='Number of Centers:', \n", " style={'description_width': 'initial'}, continuous_update=False)\n", "dataset_widget = ipywidgets.Dropdown(\n", " value='3-blobs', \n", " options=['3-blobs', '4-blobs', 'varied density', 'circles', 'moons', 'anisotropic', 'varied variance', 'no structure'], \n", " description='Dataset:', style={'description_width': 'initial'}, continuous_update=False)\n", "initialization_widget = ipywidgets.Dropdown(\n", " value='random', \n", " options=['random', 'random from data set', 'kmeans++'], \n", " description='Initialization:', style={'description_width': 'initial'}, continuous_update=False)\n", "\n", "\n", "interact(run_lloyds, dataset=dataset_widget, init=initialization_widget, num_centers=n_centers_widget);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# How to Select k?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rcParams['figure.figsize'] = (10, 5) # Change this if figures look ugly. \n", "rcParams['font.size'] = 16 # Change this if figures look ugly. \n", "\n", "def determine_k(dataset, init, lambda_):\n", " IPython.display.clear_output(wait=True)\n", " plt.close()\n", " # Generate Data\n", " X, Y = get_dataset(dataset, n_samples = 200)\n", " \n", " # Evaluate for different values of K\n", " K = 10\n", " costs = np.zeros(K)\n", " for k in range(K):\n", " # Initialize algorithm\n", " if init == 'random':\n", " alg = LloydsAlgorithm(num_centers=k + 1, X=X)\n", " elif init == 'random from data set':\n", " alg = KMeans(num_centers=k + 1, X=X)\n", " elif 'kmeans++':\n", " alg = KMeansPP(num_centers=k + 1, X=X) \n", " \n", " alg.run(100)\n", " costs[k] = k_means_cost(alg.means, alg.X) / X.shape[0]\n", " \n", " # Add regularization.\n", " reg_cost = costs + lambda_ * (np.arange(K) + 1) \n", " plt.plot(np.arange(K) + 1, reg_cost)\n", " plt.xlabel('Number of centers.')\n", " plt.ylabel('K-Means Cost');\n", " plt.show()\n", " \n", " # Get best K\n", " k = np.argmin(reg_cost)\n", " if init == 'random':\n", " alg = LloydsAlgorithm(num_centers=k + 1, X=X)\n", " elif init == 'random from data set':\n", " alg = KMeans(num_centers=k + 1, X=X)\n", " elif 'kmeans++':\n", " alg = KMeansPP(num_centers=k + 1, X=X) \n", "\n", " alg.run(100)\n", " \n", " colors = np.array(['#377eb8', '#ff7f00', \n", " '#f781bf', '#a65628', '#984ea3',\n", " '#999999', '#e41a1c', '#dede00', \n", " '#000000', '#4daf4a'][0:k+1])\n", " \n", " plt.scatter(alg.X[:, 0], alg.X[:, 1], marker='.', color=colors[alg.centers])\n", " plt.scatter(alg.means[:, 0], alg.means[:, 1], marker='*', color=colors[:(k + 1)], s=250)\n", " plt.title('Optimal-k Cluster Assignment.')\n", " plt.show()\n", " \n", " button = ipywidgets.Button(description=\"Resample\")\n", " button.on_click(lambda b: determine_k(dataset, init, lambda_))\n", " display(button)\n", " \n", "\n", "dataset_widget = ipywidgets.Dropdown(\n", " value='3-blobs', \n", " options=['3-blobs', '4-blobs', 'varied density', 'circles', 'moons', 'anisotropic', 'varied variance', 'no structure'], \n", " description='Dataset:', style={'description_width': 'initial'}, continuous_update=False)\n", "initialization_widget = ipywidgets.Dropdown(\n", " value='random', \n", " options=['random', 'random from data set', 'kmeans++'], \n", " description='Initialization:', style={'description_width': 'initial'}, continuous_update=False)\n", "lambda_widget = ipywidgets.FloatSlider(\n", " value=0, min=0, max=10, step=0.1, \n", " description='Regularization Parameter:', style={'description_width': 'initial'}, continuous_update=False)\n", "interact(determine_k, dataset=dataset_widget, init=initialization_widget, lambda_=lambda_widget);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }