Added a train-test split to the logarithmic (log-transformed linear) regression.

This commit is contained in:
2025-04-28 06:31:35 -07:00
parent 1f7fe33915
commit f49283a7cc

View File

@@ -263,24 +263,36 @@
"\n",
" def run_log_regression(self, color='pink', nodraw=True):\n",
" df = self.devs\n",
" X = df[[self.key_x]].sort_values(by=self.key_x)\n",
" y = df[[self.key_y]].sort_values(by=self.key_y)\n",
" X = df[[self.key_x]] #.sort_values(by=self.key_x)\n",
" y = df[[self.key_y]] #.sort_values(by=self.key_y)\n",
"\n",
" X_log = np.log(X)\n",
" x_fit = np.linspace(1, 40, len(y)).reshape(-1, 1)\n",
" X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, \n",
" test_size=0.2, \n",
" random_state=777)\n",
" \n",
" X_train_log = np.log(X_train)\n",
" X_test_log = np.log(X_test)\n",
" \n",
" # X_train_log = X_train_log.sort_values(by=self.key_x)\n",
" # y_train = y_train.sort_values(by=self.key_y)\n",
" X_test_log = X_test_log.sort_values(by=self.key_x)\n",
" X_test = X_test.sort_values(by=self.key_x)\n",
" y_test = y_test.sort_values(by=self.key_y)\n",
" \n",
" model = LinearRegression()\n",
" model.fit(X_log, y)\n",
" y_pred = model.predict(X_log)\n",
" model.fit(X_train_log, y_train)\n",
" y_pred = model.predict(X_test_log)\n",
" y_pred.sort()\n",
"\n",
" m = model.coef_[0][0]\n",
" b = model.intercept_[0]\n",
" label = '%s log regression line for %s' % (color, self.language)\n",
" show_model_stats(m, b, y, y_pred, label)\n",
" show_model_stats(m, b, y_test, y_pred, label)\n",
"\n",
" if nodraw:\n",
" return\n",
" plt.plot(X, y_pred, color=color, label=\"Log regression\")\n",
" plt.plot(X_test, y_pred, color=color, label=\"Log regression\")\n",
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
"\n",
" def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n",