From b18a5cb42ad838892f689d70aad22b8ae755e64a Mon Sep 17 00:00:00 2001 From: scuti Date: Sun, 27 Apr 2025 10:06:33 -0700 Subject: [PATCH] Implemented "risky" (pink) model. Also cleaned up code. Training on sorted data is not recommended and "risky"; however, the risky model appears to generalize across random states. --- stackoverflow-survey.ipynb | 58 +++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb index 6dc0a46..f3f2bd8 100644 --- a/stackoverflow-survey.ipynb +++ b/stackoverflow-survey.ipynb @@ -120,8 +120,9 @@ "outputs": [], "source": [ "visualize_devs( get_c_devs(so_df) , 'C')\n", + "visualize_devs( get_c_devs(so_df, lang='Python') , 'Python')\n", "\n", - "for lang in ['Cobol', 'Prolog', 'Ada', 'Python']:\n", + "for lang in ['Cobol', 'Prolog', 'Ada']:\n", " foo = get_lang_devs(so_df, lang)\n", " visualize_devs(foo, lang)" ] @@ -215,27 +216,41 @@ " + '\\ncountry=%s' % self.country\n", " plt.title(title)\n", "\n", - " def run_regression(self, model=LinearRegression(), split=train_test_split, \n", - " x_transform=None, change_base=None, x_shift=0, y_shift=0,\n", - " line_color='red', random=333):\n", + " def run_regression(self, x_transform=None, change_base=1.07, \n", + " x_shift=0, y_shift=0,\n", + " random=333, risky=0,\n", + " color='red', name='Regression Line' ):\n", " df = self.devs # .sort_values(by = self.key2)\n", " X = df[self.key_x].to_frame()\n", - " if x_transform is not None and change_base is not None:\n", - " X = x_transform (X, a=change_base ) \n", - " elif x_transform is not None:\n", - " X = x_transform (X) \n", - " X = X + x_shift\n", - " y = df[self.key_y].to_frame() + y_shift\n", - " \n", - " X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=random)\n", + " y = df[self.key_y].to_frame()\n", "\n", + " # not recommended\n", + " # carries risk of model training on sorted order\n", + " # however it appears to 
be generalizing well\n", + " # across random state and shuffle (=True, default)\n", + " style = '-'\n", + " if risky > 0:\n", + " X = X.sort_values(by=self.key_x)\n", + " style = '--'\n", + " if risky > 1:\n", + " y = y.sort_values(by=self.key_y)\n", + " if x_transform is not None:\n", + " X = x_transform (X, a=change_base ) \n", + "\n", + " X = X + x_shift\n", + " y = y + y_shift\n", + " \n", + " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n", + " random_state=random)\n", + "\n", + " model = LinearRegression()\n", " model.fit(X_train, y_train)\n", " y_pred = model.predict(X_test)\n", " \n", " m = model.coef_[0][0]\n", " b = model.intercept_[0]\n", " print('+----------------------+')\n", - " print('%s regression line for %s' % (line_color, self.language))\n", + " print('%s regression line for %s' % (color, self.language))\n", " print('coefficient = %0.2f' % m)\n", " print('intercept = %0.2f' % b)\n", " rmse = root_mean_squared_error(y_test, y_pred)\n", @@ -247,11 +262,11 @@ " print('+----------------------+')\n", "\n", " plt.figure(self.canvas)\n", - " plt.plot(X_test, y_pred, color=line_color, label='Regression Line')\n", + " plt.plot(X_test, y_pred, color=color, label=name, linestyle=style)\n", " plt.axhline(y=b, color='purple', linestyle='--', \n", " label='b=%0.2f' % b, zorder=-1 )\n", " plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n", - " del y_pred, model\n", + " del y_pred, model, X, y\n", "\n", "\n", " def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n", @@ -293,9 +308,13 @@ "\n", "python = Foo(so_df, 'Python', jobs=pyjobs, n_rich_outliers=12, n_poor_outliers=2)\n", "python.visualize(hue='DevType', palette=['#dbdb32', '#34bf65', '#ac70e0'])\n", - "python.run_regression()\n", + "python.run_regression(name = 'Default regression line')\n", "python.run_regression( x_transform=log_base_a, change_base=1.20, \n", - " x_shift=0, y_shift=-1.5e4, line_color='cyan', random=888)\n", + " x_shift=0, 
y_shift=-1.5e4, random=888,\n", + " color='cyan', name='Tuned regression line')\n", + "\n", + "python.run_regression(x_transform=log_base_a, change_base=1.20, risky=2, random=555, \n", + " color='pink', name='Risky regression line')\n", "python.export_image()" ] }, @@ -350,7 +369,8 @@ "c.visualize(hue='DevType', palette=['#57e6da','#d9e352','#cc622d'] ) \n", "c.run_regression()\n", "c.run_regression(x_transform=log_base_a, change_base=1.3, \n", - " x_shift=2, y_shift=-5000, line_color='magenta', random=555)\n", + " x_shift=2, y_shift=-5000, color='magenta', random=555)\n", + "c.run_regression(x_transform=log_base_a, change_base=1.3, risky=2, color='pink')\n", "c.export_image()" ] }, @@ -712,7 +732,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.13.3" } }, "nbformat": 4,