Implemented "risky" (pink) model.
Also cleaned up the code. Training on sorted data is not recommended and is "risky"; however, the risky model appears to generalize across different random states.
This commit is contained in:
@@ -120,8 +120,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"visualize_devs( get_c_devs(so_df) , 'C')\n",
|
||||
"visualize_devs( get_c_devs(so_df, lang='Python') , 'Python')\n",
|
||||
"\n",
|
||||
"for lang in ['Cobol', 'Prolog', 'Ada', 'Python']:\n",
|
||||
"for lang in ['Cobol', 'Prolog', 'Ada']:\n",
|
||||
" foo = get_lang_devs(so_df, lang)\n",
|
||||
" visualize_devs(foo, lang)"
|
||||
]
|
||||
@@ -215,27 +216,41 @@
|
||||
" + '\\ncountry=%s' % self.country\n",
|
||||
" plt.title(title)\n",
|
||||
"\n",
|
||||
" def run_regression(self, model=LinearRegression(), split=train_test_split, \n",
|
||||
" x_transform=None, change_base=None, x_shift=0, y_shift=0,\n",
|
||||
" line_color='red', random=333):\n",
|
||||
" def run_regression(self, x_transform=None, change_base=1.07, \n",
|
||||
" x_shift=0, y_shift=0,\n",
|
||||
" random=333, risky=0,\n",
|
||||
" color='red', name='Regression Line' ):\n",
|
||||
" df = self.devs # .sort_values(by = self.key2)\n",
|
||||
" X = df[self.key_x].to_frame()\n",
|
||||
" if x_transform is not None and change_base is not None:\n",
|
||||
" X = x_transform (X, a=change_base ) \n",
|
||||
" elif x_transform is not None:\n",
|
||||
" X = x_transform (X) \n",
|
||||
" X = X + x_shift\n",
|
||||
" y = df[self.key_y].to_frame() + y_shift\n",
|
||||
" \n",
|
||||
" X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=random)\n",
|
||||
" y = df[self.key_y].to_frame()\n",
|
||||
"\n",
|
||||
" # not recommended\n",
|
||||
" # carries risk of model training on sorted order\n",
|
||||
" # however it appears to be generalizing well\n",
|
||||
" # across random state and shuffle (=True, default)\n",
|
||||
" style = '-'\n",
|
||||
" if risky > 0:\n",
|
||||
" X = X.sort_values(by=self.key_x)\n",
|
||||
" style = '--'\n",
|
||||
" if risky > 1:\n",
|
||||
" y = y.sort_values(by=self.key_y)\n",
|
||||
" if x_transform is not None:\n",
|
||||
" X = x_transform (X, a=change_base ) \n",
|
||||
"\n",
|
||||
" X = X + x_shift\n",
|
||||
" y = y + y_shift\n",
|
||||
" \n",
|
||||
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n",
|
||||
" random_state=random)\n",
|
||||
"\n",
|
||||
" model = LinearRegression()\n",
|
||||
" model.fit(X_train, y_train)\n",
|
||||
" y_pred = model.predict(X_test)\n",
|
||||
" \n",
|
||||
" m = model.coef_[0][0]\n",
|
||||
" b = model.intercept_[0]\n",
|
||||
" print('+----------------------+')\n",
|
||||
" print('%s regression line for %s' % (line_color, self.language))\n",
|
||||
" print('%s regression line for %s' % (color, self.language))\n",
|
||||
" print('coefficient = %0.2f' % m)\n",
|
||||
" print('intercept = %0.2f' % b)\n",
|
||||
" rmse = root_mean_squared_error(y_test, y_pred)\n",
|
||||
@@ -247,11 +262,11 @@
|
||||
" print('+----------------------+')\n",
|
||||
"\n",
|
||||
" plt.figure(self.canvas)\n",
|
||||
" plt.plot(X_test, y_pred, color=line_color, label='Regression Line')\n",
|
||||
" plt.plot(X_test, y_pred, color=color, label=name, linestyle=style)\n",
|
||||
" plt.axhline(y=b, color='purple', linestyle='--', \n",
|
||||
" label='b=%0.2f' % b, zorder=-1 )\n",
|
||||
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
|
||||
" del y_pred, model\n",
|
||||
" del y_pred, model, X, y\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n",
|
||||
@@ -293,9 +308,13 @@
|
||||
"\n",
|
||||
"python = Foo(so_df, 'Python', jobs=pyjobs, n_rich_outliers=12, n_poor_outliers=2)\n",
|
||||
"python.visualize(hue='DevType', palette=['#dbdb32', '#34bf65', '#ac70e0'])\n",
|
||||
"python.run_regression()\n",
|
||||
"python.run_regression(name = 'Default regression line')\n",
|
||||
"python.run_regression( x_transform=log_base_a, change_base=1.20, \n",
|
||||
" x_shift=0, y_shift=-1.5e4, line_color='cyan', random=888)\n",
|
||||
" x_shift=0, y_shift=-1.5e4, random=888,\n",
|
||||
" color='cyan', name='Tuned regression line')\n",
|
||||
"\n",
|
||||
"python.run_regression(x_transform=log_base_a, change_base=1.20, risky=2, random=555, \n",
|
||||
" color='pink', name='Risky regression line')\n",
|
||||
"python.export_image()"
|
||||
]
|
||||
},
|
||||
@@ -350,7 +369,8 @@
|
||||
"c.visualize(hue='DevType', palette=['#57e6da','#d9e352','#cc622d'] ) \n",
|
||||
"c.run_regression()\n",
|
||||
"c.run_regression(x_transform=log_base_a, change_base=1.3, \n",
|
||||
" x_shift=2, y_shift=-5000, line_color='magenta', random=555)\n",
|
||||
" x_shift=2, y_shift=-5000, color='magenta', random=555)\n",
|
||||
"c.run_regression(x_transform=log_base_a, change_base=1.3, risky=2, color='pink')\n",
|
||||
"c.export_image()"
|
||||
]
|
||||
},
|
||||
@@ -712,7 +732,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.7"
|
||||
"version": "3.13.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
Reference in New Issue
Block a user