Preparing notebook for submission.

Added business understanding Q&A.
Labeled outputs by color (regression attempt).
Some code clean up.
This commit is contained in:
2025-04-23 06:40:58 -07:00
parent 320fcb343b
commit bdcba003fe

View File

@@ -91,44 +91,46 @@
"outputs": [],
"source": [
"\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import root_mean_squared_error\n",
"from sklearn.model_selection import StratifiedShuffleSplit\n",
"from sklearn.metrics import root_mean_squared_error, r2_score\n",
"import traceback\n",
"import numpy as np\n",
"\n",
"# still haven't come up with a name\n",
"class Foo:\n",
" def __init__(self, dataset, language, jobs=None, n_rich_outliers=0, n_poor_outliers=0, country=\"United States of America\"):\n",
" def __init__(self, dataset, language, jobs=None, \n",
" n_rich_outliers=0, n_poor_outliers=0, \n",
" country=\"United States of America\"):\n",
" self.devs = None\n",
" self.canvas = None\n",
" self.language = language\n",
" self.country = country\n",
" # focus on people who have given ...\n",
" key = \"ConvertedCompYearly\"\n",
" key2 = \"YearsCodePro\"\n",
" df = dataset.dropna(subset=[key, key2])\n",
" self.key = key\n",
" self.key2 = key2\n",
"\n",
" criteria = {\"MainBranch\":\"I am a developer by profession\"}\n",
"\n",
" #print(df[\"Country\"].unique)\n",
" key_x = \"YearsCodePro\"\n",
" key_y = \"ConvertedCompYearly\"\n",
" df = dataset.dropna(subset=[key_x, key_y])\n",
" self.key_x = key_x\n",
" self.key_y = key_y\n",
" \n",
" qualifiers = {\n",
" \"MainBranch\":\"I am a developer by profession\",\n",
" }\n",
" if country:\n",
" criteria[\"Country\"] = country\n",
" for k in criteria:\n",
" df = df[df[k] == criteria[k] ] \n",
" qualifiers[\"Country\"] = country\n",
" for k in qualifiers:\n",
" df = df[df[k] == qualifiers[k] ] \n",
"\n",
" # chatgpt tells me about filtering with multiple strings\n",
" if jobs:\n",
" df = df[df.isin(jobs).any(axis=1)]\n",
"\n",
" devs = None\n",
" if len(language) > 1:\n",
" devs = get_lang_devs(df, language)\n",
" else:\n",
" if len(language) == 1 or language in [\"Python\", \"Java\"]:\n",
" devs = get_c_devs(df, lang=language)\n",
" else:\n",
" devs = get_lang_devs(df, language)\n",
" \n",
" replacement_dict = {\n",
" 'Less than 1 year': '0.5',\n",
" 'More than 50 years': '51',\n",
@@ -136,77 +138,80 @@
"\n",
" # https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n",
" pd.options.mode.chained_assignment = None # default='warn'\n",
" new_column = devs[key2].replace(replacement_dict)\n",
" devs[key2] = pd.to_numeric(new_column, errors='coerce')\n",
" new_column = devs[key_x].replace(replacement_dict)\n",
" devs[key_x] = pd.to_numeric(new_column, errors='coerce')\n",
" pd.options.mode.chained_assignment = 'warn' # default='warn'\n",
" # print( devs[key2].unique() )\n",
" # print( devs[key_x].unique() )\n",
" \n",
" indices = devs[key].nlargest(n_rich_outliers).index\n",
" indices = devs[key_y].nlargest(n_rich_outliers).index\n",
" devs = devs.drop(indices)\n",
" indices = devs[key].nsmallest(n_poor_outliers).index\n",
" indices = devs[key_y].nsmallest(n_poor_outliers).index\n",
" self.devs = devs.drop(indices)\n",
" del devs, new_column, criteria\n",
" del devs, new_column\n",
" \n",
" def visualize(self, n_lowest=0, hue=\"Country\"): \n",
" def visualize(self, n_lowest=0, \n",
" hue=\"Country\", palette=sb.color_palette() ): \n",
" self.canvas = plt.figure()\n",
" key = self.key\n",
" key2 = self.key2\n",
" key_x = self.key_x\n",
" key_y = self.key_y\n",
"\n",
" if n_lowest > 0:\n",
" # chatgpt draws my line\n",
" # Calculate the lowest nth point (for example, the 5th lowest value)\n",
" # iloc[-1] gets the last element from the n smallest\n",
" lowest_nth = self.devs[key].nsmallest(n_lowest).iloc[-1] \n",
" lowest_nth = self.devs[key_y].nsmallest(n_lowest).iloc[-1] \n",
" # Draw a horizontal line at the lowest nth point\n",
" # label=f'Lowest {n_poorest}th Point: {lowest_nth_value:.2f}'\n",
" plt.axhline(y=lowest_nth, color='purple', linestyle='--', label=\"y=%0.2f\" % lowest_nth )\n",
" plt.axhline(y=lowest_nth, color='purple', linestyle='--', \n",
" label=\"y=%0.2f\" % lowest_nth, zorder=-1 )\n",
"\n",
" sb.scatterplot(data=self.devs, x=key2, y=key, hue=hue)\n",
" sb.scatterplot(data=self.devs, x=key_x, y=key_y, hue=hue, palette=palette)\n",
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
" title = \"Annual Salary of %s Developers Over Years of Experience\" % self.language\\\n",
" + \"\\nsample size=%i\" % len (self.devs)\\\n",
" + \"\\ncountry=%s\" % self.country\n",
" plt.title(title)\n",
"\n",
" def run_regression(self, split=train_test_split, \n",
" x_transform=None, change_base=None, x_shift=0,\n",
" line_color='red'):\n",
" def run_regression(self, model=LinearRegression(), split=train_test_split, \n",
" x_transform=None, change_base=None, x_shift=0, y_shift=0,\n",
" line_color='red', random=333):\n",
" df = self.devs # .sort_values(by = self.key2)\n",
"# df['binned'] = pd.qcut(df[self.key], q=4, labels=False)\n",
" X = df[self.key2].to_frame() + x_shift\n",
" X = df[self.key_x].to_frame()\n",
" if x_transform is not None and change_base is not None:\n",
" X = x_transform (X, a=change_base ) \n",
" elif x_transform is not None:\n",
" X = x_transform (X) \n",
"\n",
" y = df[self.key].to_frame()\n",
"# y = df['binned']\n",
" X = X + x_shift\n",
" y = df[self.key_y].to_frame() + y_shift\n",
" \n",
" X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=999)\n",
" X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=random)\n",
"\n",
" model = LinearRegression()\n",
" model.fit(X_train, y_train)\n",
" y_pred = model.predict(X_test)\n",
"\n",
" print(\"+----------------------+\")\n",
" print(\"%s regression line for %s\" % (line_color, self.language))\n",
" print(\"coefficient =\", model.coef_)\n",
" print('intercept=', model.intercept_)\n",
" rmse = root_mean_squared_error(y_test, y_pred)\n",
" print(\"rmse = \", rmse)\n",
" r2 = r2_score(y_test, y_pred)\n",
" print(\"r2 score = \", r2)\n",
" print(\"sample predictions:\")\n",
" print(y_pred[3:6])\n",
" print(\"+----------------------+\")\n",
" \n",
" plt.figure(self.canvas)\n",
" plt.xlim(left=0, right=40) # Adjust these values as needed\n",
"\n",
" plt.plot(X_test, y_pred, color=line_color, label='Regression Line')\n",
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
" del y_pred, model\n",
"\n",
"\n",
" def export_image(self, filename = \"images/programmers-%s-%s.png\"):\n",
" def export_image(self, base_filename = \"images/programmers-%s-%s.png\"):\n",
" plt.figure(self.canvas)\n",
" plt.savefig(filename % (self.language, self.country), bbox_inches='tight')\n",
" filename = base_filename % (self.language, self.country)\n",
" plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n",
"\n",
"# the higher a is, the steeper the line gets\n",
"def log_base_a(x, a=1.07):\n",
@@ -220,7 +225,6 @@
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"# expected python jobs\n",
"pyjobs = [\"Data scientist or machine learning specialist\",\n",
@@ -230,15 +234,36 @@
"# \"Developer, QA or test\"\n",
"]\n",
"\n",
"python = Foo(so_df, \"Python\", jobs=pyjobs, n_rich_outliers=9, n_poor_outliers=2)\n",
"python.visualize(hue=\"DevType\")\n",
"# earnings vary widely after the first year\n",
"python.run_regression( x_transform=log_base_a, x_shift=1)\n",
"python.run_regression( x_transform=log_base_a, change_base=1.2, x_shift=1, line_color='magenta')\n",
"python.run_regression( x_transform=log_base_a, change_base=1.12, x_shift=1, line_color='lightgreen')\n",
"python = Foo(so_df, \"Python\", jobs=pyjobs, n_rich_outliers=12, n_poor_outliers=2)\n",
"python.visualize(hue=\"DevType\", palette=[\"#dbdb32\", \"#34bf65\", \"#ac70e0\"])\n",
"python.run_regression()\n",
"python.run_regression( x_transform=log_base_a, change_base=1.20, \n",
" x_shift=0, y_shift=-1.5e4, line_color='cyan', random=888)\n",
"python.export_image()"
]
},
{
"cell_type": "markdown",
"id": "f4e3516e-ffe3-4768-ae92-e5cb0be503f8",
"metadata": {},
"source": [
"## Business Understanding\n",
"\n",
"* For Python programmers specialized in data (e.g., data scientists, engineers, or analysts), a linear model moderately fits the relationship between income and years of experience. For data engineers and scientists, there is a possible divergence within the career path at 10 years of experience. \n",
"\n",
"* The typical starting salary within the field of data science is around 100,000 to 120,000 dollars.\n",
"\n",
"* Depending on the model, the income of a data professional increases by about 2,000 dollars per year (red line) or 10,000 dollars per year (cyan line).\n",
"\n",
"* For both models, the r2 score ranges from poor to moderate (0.20 to 0.37), depending on the random seed used for the train/test split. The variability not explained by the model could be the result of differences between fields such as advertising, finance, or bio/medical technology.\n",
"\n",
"* At any given point in a career, the model is off by around 39,000 or 42,000 dollars.\n",
"\n",
"Generally, for low incomes poorly explained by the model, the cause could be getting a new job after a year of unemployment, internships, or part-time positions. For high incomes poorly explained by the model, the cause could be professionals at large companies who have recently added a programming language to their skill set. Other causes could be company size or working hours.\n",
"\n",
"(The business understanding was written for C first; the questions it answers are listed in the C section below.)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -247,18 +272,57 @@
"outputs": [],
"source": [
"# expected C jobs\n",
"cjobs = [\"Developer, embedded applications or devices\", \n",
" \"Developer, game or graphics\",\n",
" \"Hardware Engineer\" ,\n",
"cjobs = [\n",
" \"Developer, embedded applications or devices\", \n",
" \"Developer, game or graphics\",\n",
" \"Hardware Engineer\" ,\n",
" # \"Project manager\", \n",
" # \"Product manager\"\n",
"]\n",
"c = Foo(so_df, \"C\", jobs=cjobs, n_rich_outliers=11)\n",
"c.visualize(n_lowest=3, hue=\"DevType\")\n",
"c.run_regression(x_transform=log_base_a, change_base=1.25)\n",
"c = Foo(so_df, \"C\", jobs=cjobs, n_rich_outliers=30, n_poor_outliers=2)\n",
"c.visualize(n_lowest=7, hue=\"DevType\", palette=[\"#57e6da\",\"#d9e352\",\"#cc622d\"] ) \n",
"c.run_regression()\n",
"c.run_regression(x_transform=log_base_a, change_base=1.3, \n",
" x_shift=2, y_shift=-5000, line_color=\"magenta\", random=555)\n",
"c.export_image()"
]
},
{
"cell_type": "markdown",
"id": "89d86a1e-dc65-48e4-adcf-bb10188fd0b7",
"metadata": {},
"source": [
"## Business Understanding\n",
"\n",
"1. For C programmers, specifically embedded systems, graphics, and hardware engineers, a linear model fits the relationship between income and years of experience.\n",
"\n",
"2. A coefficient of 11973.469 indicates that, for each additional year of experience, a C programmer typically earns about $12,000 more per year.\n",
"\n",
"3. The graph fans out like a spray of water: after 10 years of experience, the salaries of C programming professionals vary strongly.\n",
"\n",
"4. A junior C programmer, at 2 years of experience, typically earns $54,776.266 per year.\n",
"\n",
"5. An r2 score of 0.571 indicates that a bit over half of the variability in the data is explained by the independent variable. This, however, holds only for incomes below 200,000 dollars. Some participants with 5 years of professional experience reported incomes at or around $200,000; these were considered unusual outliers. The game developers among them may have independently released a game.\n",
"\n",
"An rmse of 21198.612 indicates that the model is off by around 21,000 dollars at any given point in a career.\n",
"\n",
"### Questions that can be answered\n",
"\n",
"* Is there a linear relationship between income and years of experience?\n",
"* Is there a point in a career where raises stop occurring?\n",
"* What is the typical salary of an entry-level or junior C programmer?\n",
"* How much more do C programmers earn for each year of experience?\n",
"* How much of the variability is explained by the model and what factors are not considered?\n"
]
},
{
"cell_type": "markdown",
"id": "928f421c-1f2b-4be1-9ce2-8f3593a9a823",
"metadata": {},
"source": [
"The cells below generate extra or unused graphs."
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -584,14 +648,6 @@
"# 55-64 years old\n",
"# 65 years or older"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b954a811-e401-48dc-9ba4-263a5f2cf5c5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {