Started performing linear and logarithmic regressions.

Split trials for different languages into their own cells.
This commit is contained in:
2025-04-21 23:22:00 -07:00
parent f073019538
commit a6bc83fa8b

View File

@@ -375,77 +375,142 @@
{
"cell_type": "code",
"execution_count": null,
"id": "67b56700-5847-4af8-87ec-74249aa95749",
"id": "b8212c27-6c76-4c8f-ba66-bbf1b5835c99",
"metadata": {},
"outputs": [],
"source": [
"##### import numpy as np\n",
"\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import root_mean_squared_error\n",
"from sklearn.model_selection import StratifiedShuffleSplit\n",
"import traceback\n",
"import numpy as np\n",
"\n",
"# still haven't come up with a name\n",
"class Foo:\n",
" def __init__(self, dataset, language, jobs=None, n_rich_outliers=0, n_poor_outliers=0, country=\"United States of America\"):\n",
" self.devs = None\n",
" self.canvas = None\n",
" self.language = language\n",
" self.country = country\n",
" # focus on respondents who reported both yearly compensation and years of professional coding\n",
" key = \"ConvertedCompYearly\"\n",
" key2 = \"YearsCodePro\"\n",
" df = dataset.dropna(subset=[key, key2])\n",
" self.key = key\n",
" self.key2 = key2\n",
"\n",
" criteria = {\"MainBranch\":\"I am a developer by profession\"}\n",
"\n",
" #print(df[\"Country\"].unique)\n",
" if country:\n",
" criteria[\"Country\"] = country\n",
" for k in criteria:\n",
" df = df[df[k] == criteria[k] ] \n",
"\n",
" # chatgpt tells me about filtering with multiple strings\n",
" if jobs:\n",
" df = df[df.isin(jobs).any(axis=1)]\n",
"\n",
" devs = None\n",
" if len(language) > 1:\n",
" devs = get_lang_devs(df, language)\n",
" else:\n",
" devs = get_c_devs(df, lang=language)\n",
" replacement_dict = {\n",
" 'Less than 1 year': '0.5',\n",
" 'More than 50 years': '51',\n",
" }\n",
"\n",
" # https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n",
" pd.options.mode.chained_assignment = None # default='warn'\n",
" new_column = devs[key2].replace(replacement_dict)\n",
" devs[key2] = pd.to_numeric(new_column, errors='coerce')\n",
" pd.options.mode.chained_assignment = 'warn' # default='warn'\n",
" # print( devs[key2].unique() )\n",
" \n",
" indices = devs[key].nlargest(n_rich_outliers).index\n",
" devs = devs.drop(indices)\n",
" indices = devs[key].nsmallest(n_poor_outliers).index\n",
" self.devs = devs.drop(indices)\n",
" del devs, new_column, criteria\n",
" \n",
" def visualize(self, n_lowest=0, hue=\"Country\"): \n",
" self.canvas = plt.figure()\n",
" key = self.key\n",
" key2 = self.key2\n",
"\n",
" if n_lowest > 0:\n",
" # chatgpt draws my line\n",
" # Calculate the lowest nth point (for example, the 5th lowest value)\n",
" # iloc[-1] gets the last element from the n smallest\n",
" lowest_nth = self.devs[key].nsmallest(n_lowest).iloc[-1] \n",
" # Draw a horizontal line at the lowest nth point\n",
" # label=f'Lowest {n_poorest}th Point: {lowest_nth_value:.2f}'\n",
" plt.axhline(y=lowest_nth, color='purple', linestyle='--', label=\"y=%0.2f\" % lowest_nth )\n",
"\n",
" sb.scatterplot(data=self.devs, x=key2, y=key, hue=hue)\n",
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
" title = \"Annual Salary of %s Developers Over Years of Experience\" % self.language\\\n",
" + \"\\nsample size=%i\" % len (self.devs)\\\n",
" + \"\\ncountry=%s\" % self.country\n",
" plt.title(title)\n",
"\n",
" def run_regression(self, split=train_test_split, \n",
" x_transform=None, change_base=None, x_shift=0,\n",
" line_color='red'):\n",
" df = self.devs # .sort_values(by = self.key2)\n",
"# df['binned'] = pd.qcut(df[self.key], q=4, labels=False)\n",
" X = df[self.key2].to_frame() + x_shift\n",
" if x_transform is not None and change_base is not None:\n",
" X = x_transform (X, a=change_base ) \n",
" elif x_transform is not None:\n",
" X = x_transform (X) \n",
"\n",
" y = df[self.key].to_frame()\n",
"# y = df['binned']\n",
" \n",
" X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=999)\n",
"\n",
" model = LinearRegression()\n",
" model.fit(X_train, y_train)\n",
" y_pred = model.predict(X_test)\n",
"\n",
" print(\"+----------------------+\")\n",
" print(\"coefficient =\", model.coef_)\n",
" print('intercept=', model.intercept_)\n",
" rmse = root_mean_squared_error(y_test, y_pred)\n",
" print(\"rmse = \", rmse)\n",
" print(\"sample predictions:\")\n",
" print(y_pred[3:6])\n",
" print(\"+----------------------+\")\n",
" \n",
" plt.figure(self.canvas)\n",
" plt.xlim(left=0, right=40) # Adjust these values as needed\n",
" plt.plot(X_test, y_pred, color=line_color, label='Regression Line')\n",
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
" del y_pred, model\n",
"\n",
"\n",
" def export_image(self, filename = \"images/programmers-%s-%s.png\"):\n",
" plt.figure(self.canvas)\n",
" plt.savefig(filename % (self.language, self.country), bbox_inches='tight')\n",
"\n",
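"# the higher a is, the steeper the line gets\n",
"# NOTE: the numerator is log10 and the denominator is the natural log, so this returns log_a(x) / ln(10), not a true base-a logarithm\n",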
"def log_base_a(x, a=1.07):\n",
" return np.log10(x)/np.log(a)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba81c59c-0610-4f71-96fb-9eddd7736329",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# come up with name later\n",
"# do_actual_project\n",
"# earnings over years of exp\n",
"def foo(dataset, language, jobs=None, n_rich_outliers=0, n_lowest=0, hue=\"Country\", country=\"United States of America\"):\n",
" # focus on people who have given ...\n",
" key = \"ConvertedCompYearly\"\n",
" key2 = \"YearsCodePro\"\n",
" df = dataset.dropna(subset=[key, key2])\n",
" \n",
" criteria = {\"MainBranch\":\"I am a developer by profession\"}\n",
" \n",
" #print(df[\"Country\"].unique)\n",
" if country:\n",
" criteria[\"Country\"] = country\n",
" for k in criteria:\n",
" df = df[df[k] == criteria[k] ] \n",
" \n",
" # chatgpt tells me about filtering with multiple strings\n",
" if jobs:\n",
" df = df[df.isin(jobs).any(axis=1)]\n",
" \n",
" devs = None\n",
" if len(language) > 1:\n",
" devs = get_lang_devs(df, language)\n",
" else:\n",
" devs = get_c_devs(df, lang=language)\n",
" replacement_dict = {\n",
" 'Less than 1 year': '0.5',\n",
" 'More than 50 years': '51',\n",
" }\n",
" \n",
" # https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n",
" pd.options.mode.chained_assignment = None # default='warn'\n",
" new_column = devs[key2].replace(replacement_dict)\n",
" devs[key2] = pd.to_numeric(new_column, errors='coerce')\n",
" # print( devs[key2].unique() )\n",
" \n",
" indices = devs[key].nlargest(n_rich_outliers).index\n",
" devs = devs.drop(indices)\n",
"\n",
" plt.figure()\n",
" \n",
" if n_lowest > 0:\n",
" # chatgpt draws my line\n",
" # Calculate the lowest nth point (for example, the 5th lowest value)\n",
" # iloc[-1] gets the last element from the n smallest\n",
" lowest_nth = df[key].nsmallest(n_lowest).iloc[-1] \n",
" # Draw a horizontal line at the lowest nth point\n",
" # label=f'Lowest {n_poorest}th Point: {lowest_nth_value:.2f}'\n",
" plt.axhline(y=lowest_nth, color='r', linestyle='--', label=\"y=%0.2f\" % lowest_nth )\n",
"\n",
"# plt.xticks(rotation=90)\n",
" sb.scatterplot(data=devs, x=key2, y=key, hue=hue)\n",
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
" title = \"Annual Salary of %s Developers Over Years of Experience\" %language\\\n",
" + \"\\nsample size=%i\" % len (devs)\\\n",
" + \"\\ncountry=%s\" %country\n",
" plt.title(title)\n",
"\n",
"# expected C jobs\n",
"cjobs = [\"Developer, embedded applications or devices\", \n",
" \"Developer, game or graphics\",\n",
" \"Hardware Engineer\" ,\n",
" # \"Project manager\", \n",
" # \"Product manager\"\n",
"]\n",
"\n",
"# expected python jobs\n",
"pyjobs = [\"Data scientist or machine learning specialist\",\n",
@@ -454,15 +519,52 @@
"# \"DevOps specialist\",\n",
"# \"Developer, QA or test\"\n",
"]\n",
"\n",
"python = Foo(so_df, \"Python\", jobs=pyjobs, n_rich_outliers=9, n_poor_outliers=2)\n",
"python.visualize(hue=\"DevType\")\n",
"# earnings vary widely after the first year\n",
"python.run_regression( x_transform=log_base_a, x_shift=1)\n",
"python.run_regression( x_transform=log_base_a, change_base=1.2, x_shift=1, line_color='magenta')\n",
"python.run_regression( x_transform=log_base_a, change_base=1.12, x_shift=1, line_color='lightgreen')\n",
"python.export_image()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e27f76c-8f87-4c39-ac2f-5a9b2434466f",
"metadata": {},
"outputs": [],
"source": [
"# expected C jobs\n",
"cjobs = [\"Developer, embedded applications or devices\", \n",
" \"Developer, game or graphics\",\n",
" \"Hardware Engineer\" ,\n",
" # \"Project manager\", \n",
" # \"Product manager\"\n",
"]\n",
"c = Foo(so_df, \"C\", jobs=cjobs, n_rich_outliers=11)\n",
"c.visualize(n_lowest=3, hue=\"DevType\")\n",
"c.run_regression(x_transform=log_base_a, change_base=1.25)\n",
"c.export_image()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8357f841-23a0-4bfa-bf09-860bd3e014b8",
"metadata": {},
"outputs": [],
"source": [
"\n",
"jsjobs = [\"Developer, full-stack\",\n",
" \"Developer, front-end\",\n",
" \"Developer, mobile\"\n",
"]\n",
"\n",
"foo(so_df, \"Python\", jobs=pyjobs, hue=\"DevType\", n_rich_outliers=7, n_lowest=12)\n",
"foo(so_df, \"C\", jobs=cjobs, hue=\"DevType\", n_rich_outliers=3, n_lowest=5)\n",
"foo(so_df, \"JavaScript\", jobs=jsjobs, hue=\"DevType\", n_rich_outliers=6, country=\"Ukraine\")"
"js = Foo(so_df, \"JavaScript\", jobs=jsjobs, n_rich_outliers=6, country=\"Ukraine\")\n",
"js.visualize(hue=\"DevType\")\n",
"js.export_image()"
]
},
{