From a6bc83fa8b5051812804988de95afc384359e887 Mon Sep 17 00:00:00 2001 From: scuti Date: Mon, 21 Apr 2025 23:22:00 -0700 Subject: [PATCH] Started performing linear or log. regressions. Split trails for different languages into their own cells. --- stackoverflow-survey.ipynb | 242 ++++++++++++++++++++++++++----------- 1 file changed, 172 insertions(+), 70 deletions(-) diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb index ee85569..34ecd6b 100644 --- a/stackoverflow-survey.ipynb +++ b/stackoverflow-survey.ipynb @@ -375,77 +375,142 @@ { "cell_type": "code", "execution_count": null, - "id": "67b56700-5847-4af8-87ec-74249aa95749", + "id": "b8212c27-6c76-4c8f-ba66-bbf1b5835c99", + "metadata": {}, + "outputs": [], + "source": [ + "##### import numpy as np\n", + "\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import root_mean_squared_error\n", + "from sklearn.model_selection import StratifiedShuffleSplit\n", + "import traceback\n", + "import numpy as np\n", + "\n", + "# still haven't come up with a name\n", + "class Foo:\n", + " def __init__(self, dataset, language, jobs=None, n_rich_outliers=0, n_poor_outliers=0, country=\"United States of America\"):\n", + " self.devs = None\n", + " self.canvas = None\n", + " self.language = language\n", + " self.country = country\n", + " # focus on people who have given ...\n", + " key = \"ConvertedCompYearly\"\n", + " key2 = \"YearsCodePro\"\n", + " df = dataset.dropna(subset=[key, key2])\n", + " self.key = key\n", + " self.key2 = key2\n", + "\n", + " criteria = {\"MainBranch\":\"I am a developer by profession\"}\n", + "\n", + " #print(df[\"Country\"].unique)\n", + " if country:\n", + " criteria[\"Country\"] = country\n", + " for k in criteria:\n", + " df = df[df[k] == criteria[k] ] \n", + "\n", + " # chatgpt tells me about filtering with multiple strings\n", + " if jobs:\n", + " df = df[df.isin(jobs).any(axis=1)]\n", + "\n", + " devs = None\n", + " if len(language) > 1:\n", + " devs = get_lang_devs(df, language)\n", + " else:\n", + " devs = get_c_devs(df, lang=language)\n", + " replacement_dict = {\n", + " 'Less than 1 year': '0.5',\n", + " 'More than 50 years': '51',\n", + " }\n", + "\n", + " # https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n", + " pd.options.mode.chained_assignment = None # default='warn'\n", + " new_column = devs[key2].replace(replacement_dict)\n", + " devs[key2] = pd.to_numeric(new_column, errors='coerce')\n", + " pd.options.mode.chained_assignment = 'warn' # default='warn'\n", + " # print( devs[key2].unique() )\n", + " \n", + " indices = devs[key].nlargest(n_rich_outliers).index\n", + " devs = devs.drop(indices)\n", + " indices = devs[key].nsmallest(n_poor_outliers).index\n", + " self.devs = devs.drop(indices)\n", + " del devs, new_column, criteria\n", + " \n", + " def visualize(self, n_lowest=0, hue=\"Country\"): \n", + " self.canvas = plt.figure()\n", + " key = self.key\n", + " key2 = self.key2\n", + "\n", + " if n_lowest > 0:\n", + " # chatgpt draws my line\n", + " # Calculate the lowest nth point (for example, the 5th lowest value)\n", + " # iloc[-1] gets the last element from the n smallest\n", + " lowest_nth = self.devs[key].nsmallest(n_lowest).iloc[-1] \n", + " # Draw a horizontal line at the lowest nth point\n", + " # label=f'Lowest {n_poorest}th Point: {lowest_nth_value:.2f}'\n", + " plt.axhline(y=lowest_nth, color='purple', linestyle='--', label=\"y=%0.2f\" % lowest_nth )\n", + "\n", + " sb.scatterplot(data=self.devs, x=key2, y=key, hue=hue)\n", + " plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n", + " title = \"Annual Salary of %s Developers Over Years of Experience\" % self.language\\\n", + " + \"\\nsample size=%i\" % len (self.devs)\\\n", + " + \"\\ncountry=%s\" % self.country\n", + " plt.title(title)\n", + "\n", + " def run_regression(self, split=train_test_split, \n", + " x_transform=None, change_base=None, x_shift=0,\n", + " line_color='red'):\n", + " df = self.devs # .sort_values(by = self.key2)\n", + "# df['binned'] = pd.qcut(df[self.key], q=4, labels=False)\n", + " X = df[self.key2].to_frame() + x_shift\n", + " if x_transform is not None and change_base is not None:\n", + " X = x_transform (X, a=change_base ) \n", + " elif x_transform is not None:\n", + " X = x_transform (X) \n", + "\n", + " y = df[self.key].to_frame()\n", + "# y = df['binned']\n", + " \n", + " X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=999)\n", + "\n", + " model = LinearRegression()\n", + " model.fit(X_train, y_train)\n", + " y_pred = model.predict(X_test)\n", + "\n", + " print(\"+----------------------+\")\n", + " print(\"coefficient =\", model.coef_)\n", + " print('intercept=', model.intercept_)\n", + " rmse = root_mean_squared_error(y_test, y_pred)\n", + " print(\"rmse = \", rmse)\n", + " print(\"sample predictions:\")\n", + " print(y_pred[3:6])\n", + " print(\"+----------------------+\")\n", + " \n", + " plt.figure(self.canvas)\n", + " plt.xlim(left=0, right=40) # Adjust these values as needed\n", + " plt.plot(X_test, y_pred, color=line_color, label='Regression Line')\n", + " plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n", + " del y_pred, model\n", + "\n", + "\n", + " def export_image(self, filename = \"images/programmers-%s-%s.png\"):\n", + " plt.figure(self.canvas)\n", + " plt.savefig(filename % (self.language, self.country), bbox_inches='tight')\n", + "\n", + "# the higher a is, the steeper the line gets\n", + "def log_base_a(x, a=1.07):\n", + " return np.log10(x)/np.log(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba81c59c-0610-4f71-96fb-9eddd7736329", "metadata": {}, "outputs": [], "source": [ "\n", - "# come up with name later\n", - "# do_actual_project\n", - "# earnings over years of exp\n", - "def foo(dataset, language, jobs=None, n_rich_outliers=0, n_lowest=0, hue=\"Country\", country=\"United States of America\"):\n", - " # focus on people who have given ...\n", - " key = \"ConvertedCompYearly\"\n", - " key2 = \"YearsCodePro\"\n", - " df = dataset.dropna(subset=[key, key2])\n", - " \n", - " criteria = {\"MainBranch\":\"I am a developer by profession\"}\n", - " \n", - " #print(df[\"Country\"].unique)\n", - " if country:\n", - " criteria[\"Country\"] = country\n", - " for k in criteria:\n", - " df = df[df[k] == criteria[k] ] \n", - " \n", - " # chatgpt tells me about filtering with multiple strings\n", - " if jobs:\n", - " df = df[df.isin(jobs).any(axis=1)]\n", - " \n", - " devs = None\n", - " if len(language) > 1:\n", - " devs = get_lang_devs(df, language)\n", - " else:\n", - " devs = get_c_devs(df, lang=language)\n", - " replacement_dict = {\n", - " 'Less than 1 year': '0.5',\n", - " 'More than 50 years': '51',\n", - " }\n", - " \n", - " # https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n", - " pd.options.mode.chained_assignment = None # default='warn'\n", - " new_column = devs[key2].replace(replacement_dict)\n", - " devs[key2] = pd.to_numeric(new_column, errors='coerce')\n", - " # print( devs[key2].unique() )\n", - " \n", - " indices = devs[key].nlargest(n_rich_outliers).index\n", - " devs = devs.drop(indices)\n", - "\n", - " plt.figure()\n", - " \n", - " if n_lowest > 0:\n", - " # chatgpt draws my line\n", - " # Calculate the lowest nth point (for example, the 5th lowest value)\n", - " # iloc[-1] gets the last element from the n smallest\n", - " lowest_nth = df[key].nsmallest(n_lowest).iloc[-1] \n", - " # Draw a horizontal line at the lowest nth point\n", - " # label=f'Lowest {n_poorest}th Point: {lowest_nth_value:.2f}'\n", - " plt.axhline(y=lowest_nth, color='r', linestyle='--', label=\"y=%0.2f\" % lowest_nth )\n", - "\n", - "# plt.xticks(rotation=90)\n", - " sb.scatterplot(data=devs, x=key2, y=key, hue=hue)\n", - " plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n", - " title = \"Annual Salary of %s Developers Over Years of Experience\" %language\\\n", - " + \"\\nsample size=%i\" % len (devs)\\\n", - " + \"\\ncountry=%s\" %country\n", - " plt.title(title)\n", - "\n", - "# expected C jobs\n", - "cjobs = [\"Developer, embedded applications or devices\", \n", - " \"Developer, game or graphics\",\n", - " \"Hardware Engineer\" ,\n", - " # \"Project manager\", \n", - " # \"Product manager\"\n", - "]\n", "\n", "# expected python jobs\n", "pyjobs = [\"Data scientist or machine learning specialist\",\n", @@ -454,15 +519,52 @@ "# \"DevOps specialist\",\n", "# \"Developer, QA or test\"\n", "]\n", + "\n", + "python = Foo(so_df, \"Python\", jobs=pyjobs, n_rich_outliers=9, n_poor_outliers=2)\n", + "python.visualize(hue=\"DevType\")\n", + "# earnings vary widely after the first year\n", + "python.run_regression( x_transform=log_base_a, x_shift=1)\n", + "python.run_regression( x_transform=log_base_a, change_base=1.2, x_shift=1, line_color='magenta')\n", + "python.run_regression( x_transform=log_base_a, change_base=1.12, x_shift=1, line_color='lightgreen')\n", + "python.export_image()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e27f76c-8f87-4c39-ac2f-5a9b2434466f", + "metadata": {}, + "outputs": [], + "source": [ + "# expected C jobs\n", + "cjobs = [\"Developer, embedded applications or devices\", \n", + " \"Developer, game or graphics\",\n", + " \"Hardware Engineer\" ,\n", + " # \"Project manager\", \n", + " # \"Product manager\"\n", + "]\n", + "c = Foo(so_df, \"C\", jobs=cjobs, n_rich_outliers=11)\n", + "c.visualize(n_lowest=3, hue=\"DevType\")\n", + "c.run_regression(x_transform=log_base_a, change_base=1.25)\n", + "c.export_image()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8357f841-23a0-4bfa-bf09-860bd3e014b8", + "metadata": {}, + "outputs": [], + "source": [ "\n", "jsjobs = [\"Developer, full-stack\",\n", " \"Developer, front-end\",\n", " \"Developer, mobile\"\n", "]\n", "\n", - "foo(so_df, \"Python\", jobs=pyjobs, hue=\"DevType\", n_rich_outliers=7, n_lowest=12)\n", - "foo(so_df, \"C\", jobs=cjobs, hue=\"DevType\", n_rich_outliers=3, n_lowest=5)\n", - "foo(so_df, \"JavaScript\", jobs=jsjobs, hue=\"DevType\", n_rich_outliers=6, country=\"Ukraine\")" + "js = Foo(so_df, \"JavaScript\", jobs=jsjobs, n_rich_outliers=6, country=\"Ukraine\")\n", + "js.visualize(hue=\"DevType\")\n", + "js.export_image()" ] }, {