def run_log_regression(self, color='pink', nodraw=True):
    """Fit ``y = m * ln(x) + b`` on a training split and evaluate on a test split.

    The ``key_x`` and ``key_y`` columns of ``self.devs`` are split 80/20 with
    a fixed random seed. A ``LinearRegression`` is fitted on the natural log
    of the training ``key_x`` values, predictions are made for the held-out
    rows, and the slope, intercept, observed test targets, predicted targets
    and a label are handed to ``show_model_stats``.

    Args:
        color: Matplotlib color of the plotted line; also interpolated into
            the label passed to ``show_model_stats``.
        nodraw: When true, return right after reporting the stats without
            plotting anything.

    Returns:
        None.
    """
    df = self.devs
    X = df[[self.key_x]]
    y = df[[self.key_y]]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=777)

    # Order the held-out rows by x so the fitted curve is drawn left to right.
    # A single ordering is applied to BOTH frames: sorting X and y
    # independently (or sorting the predictions on their own) would break the
    # row pairing and make the reported stats meaningless.
    # 'mergesort' is stable, so rows with equal x keep their split order.
    order = np.argsort(X_test[self.key_x].values, kind='mergesort')
    X_test = X_test.iloc[order]
    y_test = y_test.iloc[order]

    # NOTE(review): presumably key_x is strictly positive; np.log would give
    # -inf/NaN otherwise -- confirm against the data preparation step.
    X_train_log = np.log(X_train)
    X_test_log = np.log(X_test)

    model = LinearRegression()
    model.fit(X_train_log, y_train)
    # Predictions stay in X_test row order, i.e. aligned with y_test.
    y_pred = model.predict(X_test_log)

    # y is a single-column frame, so coef_ is 2-D and intercept_ is 1-D.
    m = model.coef_[0][0]
    b = model.intercept_[0]
    label = '%s log regression line for %s' % (color, self.language)
    show_model_stats(m, b, y_test, y_pred, label)

    if nodraw:
        return
    plt.plot(X_test, y_pred, color=color, label="Log regression")
    plt.legend(loc='lower center', bbox_to_anchor=(1.5,0))