{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "# Code source: Sebastian Curi and Andreas Krause.\n",
    "\n",
    "# Python Notebook Commands\n",
    "%matplotlib inline\n",
    "# %reload_ext loads the extension if needed and reloads it otherwise, so this cell can be\n",
    "# re-run without the \"already loaded\" message that a following %load_ext would print.\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# Numerical Libraries\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib import rcParams\n",
    "rcParams['figure.figsize'] = (10, 5)  # Change this if figures look ugly.\n",
    "rcParams['font.size'] = 16\n",
    "# IPython Libraries\n",
    "import IPython\n",
    "import ipywidgets\n",
    "from IPython.display import display\n",
    "from ipywidgets import interact, interactive, interact_manual\n",
    "\n",
    "\n",
    "# sklearn library\n",
    "import sklearn\n",
    "from sklearn.datasets import make_regression\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from sklearn.pipeline import make_pipeline\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Custom Libraries\n",
    "from utilities import plot_helpers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model Selection \n",
    "\n",
    "In this task we have noisy samples of the function $f(x) = x \\sin(x)$, and our objective is to learn it from data (here we're cheating because we already know the function). \n",
    "\n",
    "In this demo we will see how model selection works and how to use K-fold cross-validation. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "rcParams['figure.figsize'] = (10, 5)  # Change this if figures look ugly.\n",
    "rcParams['font.size'] = 16\n",
    "\n",
    "\n",
    "def f(x):\n",
    "    \"\"\"Ground-truth function f(x) = x * sin(x), evaluated element-wise.\"\"\"\n",
    "    return x * np.sin(x)\n",
    "\n",
    "\n",
    "# Dense grid on which the noise-free function and every fitted model are drawn.\n",
    "x_plot = np.linspace(-1, 11, 100)\n",
    "f_plot = f(x_plot)\n",
    "X_plot = x_plot[:, np.newaxis]  # (100, 1) matrix, passed to model.predict in the cells below\n",
    "\n",
    "# Let's plot the function first (without noise)\n",
    "plot_opts = {'x_label': '$x$', 'y_label': '$y$', 'y_lim': [np.min(f_plot)-3, np.max(f_plot)+3],\n",
    "             'legend': False, 'legend_loc': 'lower left'}\n",
    "plot_helpers.plot_data(x_plot, f_plot, fig=plt.subplot(111), options=plot_opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "rcParams['figure.figsize'] = (10, 5)  # Change this if figures look ugly.\n",
    "rcParams['font.size'] = 16\n",
    "noise_widget = ipywidgets.FloatSlider(value=1,\n",
    "                                      min=0,\n",
    "                                      max=3,\n",
    "                                      step=0.5,\n",
    "                                      readout_format='.1f',\n",
    "                                      description='Noise level:',\n",
    "                                      style={'description_width': 'initial'},\n",
    "                                      continuous_update=False)\n",
    "resample_button = ipywidgets.ToggleButton(description=\"Resample!\")\n",
    "\n",
    "degree_widget = ipywidgets.IntSlider(value=1,\n",
    "                                     min=1,\n",
    "                                     max=19,\n",
    "                                     step=1,\n",
    "                                     description='Polynomial Degree:',\n",
    "                                     style={'description_width': 'initial'},\n",
    "                                     continuous_update=False)\n",
    "# Dropdown has no `continuous_update` trait, so that keyword is not passed here.\n",
    "reg_widget = ipywidgets.Dropdown(\n",
    "    options=[0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2],\n",
    "    value=0,\n",
    "    description='Regularizer:',\n",
    "    disabled=False,\n",
    "    style={'description_width': 'initial'}\n",
    ")\n",
    "\n",
    "\n",
    "def resample(b, noise):\n",
    "    \"\"\"Draw a fresh noisy training set and show the controls for fitting it.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    b : bool\n",
    "        State of the \"Resample!\" toggle. The value itself is unused; toggling it\n",
    "        only makes `interact` call this function again, which redraws the data.\n",
    "    noise : float\n",
    "        Scale of the Gaussian noise added to f(x).\n",
    "    \"\"\"\n",
    "    x = 10 * np.random.rand(20)\n",
    "    y = f(x) + np.random.normal(size=(20,)) * noise\n",
    "\n",
    "    # create matrix versions of these arrays\n",
    "    X = x[:, np.newaxis]\n",
    "\n",
    "    def change_degree(degree, reg):\n",
    "        \"\"\"Fit a ridge-regularized polynomial of the given degree and plot it.\"\"\"\n",
    "        model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=reg))\n",
    "        model.fit(X, y)\n",
    "\n",
    "        fig, ax = plt.subplots()\n",
    "        lw = 2\n",
    "        ax.plot(x_plot, f_plot, color='cornflowerblue', linewidth=lw, label=\"Ground Truth\")\n",
    "        ax.plot(x_plot, model.predict(X_plot), color='r', linewidth=lw, label=\"Degree %d\" % degree)\n",
    "\n",
    "        plot_opts = {'x_label': '$x$', 'y_label': '$y$', 'y_lim': [np.min(f_plot)-3, np.max(f_plot)+3],\n",
    "                     'legend': True, 'legend_loc': 'lower left',\n",
    "                     'marker': 'b*', 'label': 'Training Points'}\n",
    "        plot_helpers.plot_data(X, y, fig=ax, options=plot_opts)\n",
    "        plt.show()\n",
    "\n",
    "        # Work on a copy: writing the intercept into `coef_` itself would modify the fitted model.\n",
    "        ridge = model.named_steps['ridge']\n",
    "        coefs = ridge.coef_.copy()\n",
    "        # Slot 0 belongs to the constant feature of PolynomialFeatures; show the intercept there.\n",
    "        coefs[0] = ridge.intercept_\n",
    "        print(\"Estimated coefficients{}\".format(coefs))\n",
    "\n",
    "    interact(change_degree, degree=degree_widget, reg=reg_widget);\n",
    "\n",
    "\n",
    "interact(resample, b=resample_button, noise=noise_widget);\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# K-Fold Cross-Validation\n",
    "\n",
    "The idea of this method is to split the dataset into K different bins, use K-1 to learn and 1 to validate. Then you can interchange which split you validate on and make statistics on the different errors on each split (such as avg, std, etc). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "rcParams['figure.figsize'] = (10, 5)  # Change this if figures look ugly.\n",
    "rcParams['font.size'] = 16\n",
    "folds = 5  # number of cross-validation folds (K)\n",
    "N = 50     # total number of samples, split across the folds\n",
    "resample_button = ipywidgets.ToggleButton(description=\"Resample!\")\n",
    "\n",
    "degree_widget = ipywidgets.IntSlider(value=1,\n",
    "                                     min=1,\n",
    "                                     max=19,\n",
    "                                     step=1,\n",
    "                                     description='Polynomial Degree:',\n",
    "                                     style={'description_width': 'initial'},\n",
    "                                     continuous_update=False)\n",
    "# ToggleButtons and Dropdown have no `continuous_update` trait, so it is not passed to them.\n",
    "fold_widget = ipywidgets.ToggleButtons(value=1,\n",
    "                                       options=list(range(1, folds + 1)),\n",
    "                                       description='Validation fold:',\n",
    "                                       style={'description_width': 'initial'})\n",
    "noise_widget = ipywidgets.FloatSlider(value=1,\n",
    "                                      min=0,\n",
    "                                      max=3,\n",
    "                                      step=0.5,\n",
    "                                      readout_format='.1f',\n",
    "                                      description='Noise level:',\n",
    "                                      style={'description_width': 'initial'},\n",
    "                                      continuous_update=False)\n",
    "\n",
    "reg_widget = ipywidgets.Dropdown(\n",
    "    options=[0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2],\n",
    "    value=0,\n",
    "    description='Regularizer:',\n",
    "    disabled=False,\n",
    "    style={'description_width': 'initial'}\n",
    ")\n",
    "\n",
    "\n",
    "def resample(b, noise):\n",
    "    \"\"\"Draw N noisy samples, split them into `folds` bins and show the K-fold controls.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    b : bool\n",
    "        State of the \"Resample!\" toggle. The value itself is unused; toggling it\n",
    "        only makes `interact` call this function again, which redraws the data.\n",
    "    noise : float\n",
    "        Scale of the Gaussian noise added to f(x).\n",
    "    \"\"\"\n",
    "    x_all = 10 * np.random.rand(N)\n",
    "    y_all = f(x_all) + np.random.normal(size=(N,)) * noise\n",
    "\n",
    "    # The samples are i.i.d., so consecutive chunks already form random folds.\n",
    "    # array_split keeps every sample even when N is not a multiple of `folds`.\n",
    "    x_folds = np.array_split(x_all, folds)\n",
    "    y_folds = np.array_split(y_all, folds)\n",
    "\n",
    "    def mean_squared_error(model, inputs, targets):\n",
    "        \"\"\"Average squared residual of `model` on (inputs, targets).\n",
    "\n",
    "        Averaging over the points makes the training error (K-1 folds) and the\n",
    "        validation error (1 fold) comparable although the set sizes differ.\n",
    "        \"\"\"\n",
    "        return np.mean((model.predict(inputs) - targets) ** 2)\n",
    "\n",
    "    def change_degree(degree, reg, fold):\n",
    "        \"\"\"Train on every fold except `fold` (1-based), validate on `fold`, plot and report errors.\"\"\"\n",
    "        val_idx = fold - 1  # the widget counts folds from 1\n",
    "        train_idx = [i for i in range(folds) if i != val_idx]\n",
    "\n",
    "        X = np.concatenate([x_folds[i] for i in train_idx])[:, np.newaxis]\n",
    "        Y = np.concatenate([y_folds[i] for i in train_idx])\n",
    "        Xval = x_folds[val_idx][:, np.newaxis]\n",
    "        Yval = y_folds[val_idx]\n",
    "\n",
    "        model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=reg))\n",
    "        model.fit(X, Y)\n",
    "\n",
    "        fig, ax = plt.subplots()\n",
    "        lw = 2\n",
    "        ax.plot(x_plot, f_plot, color='cornflowerblue', linewidth=lw, label=\"Ground Truth\")\n",
    "        ax.plot(x_plot, model.predict(X_plot), color='r', linewidth=lw, label=\"Degree %d\" % degree)\n",
    "\n",
    "        opts = {'marker': 'b*', 'label': 'Training Points'}\n",
    "        plot_helpers.plot_data(X, Y, fig=ax, options=opts)\n",
    "\n",
    "        # The axis labels, limits and legend are applied with the last call on this axes.\n",
    "        plot_opts = {'x_label': '$x$', 'y_label': '$y$', 'y_lim': [np.min(f_plot)-3, np.max(f_plot)+3],\n",
    "                     'legend': True, 'legend_loc': 'lower left',\n",
    "                     'marker': 'mX', 'label': 'Validation Points'}\n",
    "        plot_helpers.plot_data(Xval, Yval, fig=ax, options=plot_opts)\n",
    "\n",
    "        plt.show()\n",
    "        print(\"Train. Error: {:.2f}\".format(mean_squared_error(model, X, Y)))\n",
    "        print(\"Valid. Error: {:.2f}\".format(mean_squared_error(model, Xval, Yval)))\n",
    "\n",
    "    interact(change_degree, degree=degree_widget, reg=reg_widget, fold=fold_widget);\n",
    "\n",
    "\n",
    "interact(resample, b=resample_button, noise=noise_widget);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}