Preparing notebook for submission.
Added business understanding Q&A. Labeled outputs by color (regression attempt). Some code clean up.
This commit is contained in:
@@ -91,44 +91,46 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||||
"from sklearn.linear_model import LinearRegression\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.metrics import root_mean_squared_error\n",
|
||||
"from sklearn.model_selection import StratifiedShuffleSplit\n",
|
||||
"from sklearn.metrics import root_mean_squared_error, r2_score\n",
|
||||
"import traceback\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# still haven't come up with a name\n",
|
||||
"class Foo:\n",
|
||||
" def __init__(self, dataset, language, jobs=None, n_rich_outliers=0, n_poor_outliers=0, country=\"United States of America\"):\n",
|
||||
" def __init__(self, dataset, language, jobs=None, \n",
|
||||
" n_rich_outliers=0, n_poor_outliers=0, \n",
|
||||
" country=\"United States of America\"):\n",
|
||||
" self.devs = None\n",
|
||||
" self.canvas = None\n",
|
||||
" self.language = language\n",
|
||||
" self.country = country\n",
|
||||
" # focus on people who have given ...\n",
|
||||
" key = \"ConvertedCompYearly\"\n",
|
||||
" key2 = \"YearsCodePro\"\n",
|
||||
" df = dataset.dropna(subset=[key, key2])\n",
|
||||
" self.key = key\n",
|
||||
" self.key2 = key2\n",
|
||||
"\n",
|
||||
" criteria = {\"MainBranch\":\"I am a developer by profession\"}\n",
|
||||
"\n",
|
||||
" #print(df[\"Country\"].unique)\n",
|
||||
" key_x = \"YearsCodePro\"\n",
|
||||
" key_y = \"ConvertedCompYearly\"\n",
|
||||
" df = dataset.dropna(subset=[key_x, key_y])\n",
|
||||
" self.key_x = key_x\n",
|
||||
" self.key_y = key_y\n",
|
||||
" \n",
|
||||
" qualifiers = {\n",
|
||||
" \"MainBranch\":\"I am a developer by profession\",\n",
|
||||
" }\n",
|
||||
" if country:\n",
|
||||
" criteria[\"Country\"] = country\n",
|
||||
" for k in criteria:\n",
|
||||
" df = df[df[k] == criteria[k] ] \n",
|
||||
" qualifiers[\"Country\"] = country\n",
|
||||
" for k in qualifiers:\n",
|
||||
" df = df[df[k] == qualifiers[k] ] \n",
|
||||
"\n",
|
||||
" # chatgpt tells me about filtering with multiple strings\n",
|
||||
" if jobs:\n",
|
||||
" df = df[df.isin(jobs).any(axis=1)]\n",
|
||||
"\n",
|
||||
" devs = None\n",
|
||||
" if len(language) > 1:\n",
|
||||
" devs = get_lang_devs(df, language)\n",
|
||||
" else:\n",
|
||||
" if len(language) == 1 or language in [\"Python\", \"Java\"]:\n",
|
||||
" devs = get_c_devs(df, lang=language)\n",
|
||||
" else:\n",
|
||||
" devs = get_lang_devs(df, language)\n",
|
||||
" \n",
|
||||
" replacement_dict = {\n",
|
||||
" 'Less than 1 year': '0.5',\n",
|
||||
" 'More than 50 years': '51',\n",
|
||||
@@ -136,77 +138,80 @@
|
||||
"\n",
|
||||
" # https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n",
|
||||
" pd.options.mode.chained_assignment = None # default='warn'\n",
|
||||
" new_column = devs[key2].replace(replacement_dict)\n",
|
||||
" devs[key2] = pd.to_numeric(new_column, errors='coerce')\n",
|
||||
" new_column = devs[key_x].replace(replacement_dict)\n",
|
||||
" devs[key_x] = pd.to_numeric(new_column, errors='coerce')\n",
|
||||
" pd.options.mode.chained_assignment = 'warn' # default='warn'\n",
|
||||
" # print( devs[key2].unique() )\n",
|
||||
" # print( devs[key_x].unique() )\n",
|
||||
" \n",
|
||||
" indices = devs[key].nlargest(n_rich_outliers).index\n",
|
||||
" indices = devs[key_y].nlargest(n_rich_outliers).index\n",
|
||||
" devs = devs.drop(indices)\n",
|
||||
" indices = devs[key].nsmallest(n_poor_outliers).index\n",
|
||||
" indices = devs[key_y].nsmallest(n_poor_outliers).index\n",
|
||||
" self.devs = devs.drop(indices)\n",
|
||||
" del devs, new_column, criteria\n",
|
||||
" del devs, new_column\n",
|
||||
" \n",
|
||||
" def visualize(self, n_lowest=0, hue=\"Country\"): \n",
|
||||
" def visualize(self, n_lowest=0, \n",
|
||||
" hue=\"Country\", palette=sb.color_palette() ): \n",
|
||||
" self.canvas = plt.figure()\n",
|
||||
" key = self.key\n",
|
||||
" key2 = self.key2\n",
|
||||
" key_x = self.key_x\n",
|
||||
" key_y = self.key_y\n",
|
||||
"\n",
|
||||
" if n_lowest > 0:\n",
|
||||
" # chatgpt draws my line\n",
|
||||
" # Calculate the lowest nth point (for example, the 5th lowest value)\n",
|
||||
" # iloc[-1] gets the last element from the n smallest\n",
|
||||
" lowest_nth = self.devs[key].nsmallest(n_lowest).iloc[-1] \n",
|
||||
" lowest_nth = self.devs[key_y].nsmallest(n_lowest).iloc[-1] \n",
|
||||
" # Draw a horizontal line at the lowest nth point\n",
|
||||
" # label=f'Lowest {n_poorest}th Point: {lowest_nth_value:.2f}'\n",
|
||||
" plt.axhline(y=lowest_nth, color='purple', linestyle='--', label=\"y=%0.2f\" % lowest_nth )\n",
|
||||
" plt.axhline(y=lowest_nth, color='purple', linestyle='--', \n",
|
||||
" label=\"y=%0.2f\" % lowest_nth, zorder=-1 )\n",
|
||||
"\n",
|
||||
" sb.scatterplot(data=self.devs, x=key2, y=key, hue=hue)\n",
|
||||
" sb.scatterplot(data=self.devs, x=key_x, y=key_y, hue=hue, palette=palette)\n",
|
||||
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
|
||||
" title = \"Annual Salary of %s Developers Over Years of Experience\" % self.language\\\n",
|
||||
" + \"\\nsample size=%i\" % len (self.devs)\\\n",
|
||||
" + \"\\ncountry=%s\" % self.country\n",
|
||||
" plt.title(title)\n",
|
||||
"\n",
|
||||
" def run_regression(self, split=train_test_split, \n",
|
||||
" x_transform=None, change_base=None, x_shift=0,\n",
|
||||
" line_color='red'):\n",
|
||||
" def run_regression(self, model=LinearRegression(), split=train_test_split, \n",
|
||||
" x_transform=None, change_base=None, x_shift=0, y_shift=0,\n",
|
||||
" line_color='red', random=333):\n",
|
||||
" df = self.devs # .sort_values(by = self.key2)\n",
|
||||
"# df['binned'] = pd.qcut(df[self.key], q=4, labels=False)\n",
|
||||
" X = df[self.key2].to_frame() + x_shift\n",
|
||||
" X = df[self.key_x].to_frame()\n",
|
||||
" if x_transform is not None and change_base is not None:\n",
|
||||
" X = x_transform (X, a=change_base ) \n",
|
||||
" elif x_transform is not None:\n",
|
||||
" X = x_transform (X) \n",
|
||||
"\n",
|
||||
" y = df[self.key].to_frame()\n",
|
||||
"# y = df['binned']\n",
|
||||
" X = X + x_shift\n",
|
||||
" y = df[self.key_y].to_frame() + y_shift\n",
|
||||
" \n",
|
||||
" X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=999)\n",
|
||||
" X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=random)\n",
|
||||
"\n",
|
||||
" model = LinearRegression()\n",
|
||||
" model.fit(X_train, y_train)\n",
|
||||
" y_pred = model.predict(X_test)\n",
|
||||
"\n",
|
||||
" print(\"+----------------------+\")\n",
|
||||
" print(\"%s regression line for %s\" % (line_color, self.language))\n",
|
||||
" print(\"coefficient =\", model.coef_)\n",
|
||||
" print('intercept=', model.intercept_)\n",
|
||||
" rmse = root_mean_squared_error(y_test, y_pred)\n",
|
||||
" print(\"rmse = \", rmse)\n",
|
||||
" r2 = r2_score(y_test, y_pred)\n",
|
||||
" print(\"r2 score = \", r2)\n",
|
||||
" print(\"sample predictions:\")\n",
|
||||
" print(y_pred[3:6])\n",
|
||||
" print(\"+----------------------+\")\n",
|
||||
" \n",
|
||||
" plt.figure(self.canvas)\n",
|
||||
" plt.xlim(left=0, right=40) # Adjust these values as needed\n",
|
||||
"\n",
|
||||
" plt.plot(X_test, y_pred, color=line_color, label='Regression Line')\n",
|
||||
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
|
||||
" del y_pred, model\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" def export_image(self, filename = \"images/programmers-%s-%s.png\"):\n",
|
||||
" def export_image(self, base_filename = \"images/programmers-%s-%s.png\"):\n",
|
||||
" plt.figure(self.canvas)\n",
|
||||
" plt.savefig(filename % (self.language, self.country), bbox_inches='tight')\n",
|
||||
" filename = base_filename % (self.language, self.country)\n",
|
||||
" plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n",
|
||||
"\n",
|
||||
"# the higher a is, the steeper the line gets\n",
|
||||
"def log_base_a(x, a=1.07):\n",
|
||||
@@ -220,7 +225,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"# expected python jobs\n",
|
||||
"pyjobs = [\"Data scientist or machine learning specialist\",\n",
|
||||
@@ -230,15 +234,36 @@
|
||||
"# \"Developer, QA or test\"\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"python = Foo(so_df, \"Python\", jobs=pyjobs, n_rich_outliers=9, n_poor_outliers=2)\n",
|
||||
"python.visualize(hue=\"DevType\")\n",
|
||||
"# earnings vary widely after the first year\n",
|
||||
"python.run_regression( x_transform=log_base_a, x_shift=1)\n",
|
||||
"python.run_regression( x_transform=log_base_a, change_base=1.2, x_shift=1, line_color='magenta')\n",
|
||||
"python.run_regression( x_transform=log_base_a, change_base=1.12, x_shift=1, line_color='lightgreen')\n",
|
||||
"python = Foo(so_df, \"Python\", jobs=pyjobs, n_rich_outliers=12, n_poor_outliers=2)\n",
|
||||
"python.visualize(hue=\"DevType\", palette=[\"#dbdb32\", \"#34bf65\", \"#ac70e0\"])\n",
|
||||
"python.run_regression()\n",
|
||||
"python.run_regression( x_transform=log_base_a, change_base=1.20, \n",
|
||||
" x_shift=0, y_shift=-1.5e4, line_color='cyan', random=888)\n",
|
||||
"python.export_image()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f4e3516e-ffe3-4768-ae92-e5cb0be503f8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Business Understanding\n",
|
||||
"\n",
|
||||
"* For Python programmers specialized in data e.g data scientists, engineers, or analysts, a linear model moderately fits the relationship between income and years of experience. For data engineers and scientists, there is a possible divergence within the career path at 10 years of experience. \n",
|
||||
"\n",
|
||||
"* The typical starting salary within the field of data science is around 100,000 to 120,000 dollars.\n",
|
||||
"\n",
|
||||
"* The income of a data professional can either increase by 2,000 per year (red) or 10,000 per year (cyan).\n",
|
||||
"\n",
|
||||
"* For both models, the r2 score ranges from poor to moderate = 0.20 - 0.37 depending on the random number. The variability not explained by the model could be the result of the fields such as advertising, finance, or bio/medical technology.\n",
|
||||
"\n",
|
||||
"* For any given point in the career, the model is off by 39,000 or 42,000 dollars.\n",
|
||||
"\n",
|
||||
"Generally, for low uncomes poorly explained by the model, the cause could be getting a new job after a year of unemployment, internships, or part-time positions. For high incomes poorly explained by the model, the cause could be professionals at large companies who had recently added a programming language to their skill set. Other causes could be company size or working hours.\n",
|
||||
"\n",
|
||||
"(Business understanding was done for C first. Questions are there.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -247,18 +272,57 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# expected C jobs\n",
|
||||
"cjobs = [\"Developer, embedded applications or devices\", \n",
|
||||
" \"Developer, game or graphics\",\n",
|
||||
" \"Hardware Engineer\" ,\n",
|
||||
"cjobs = [\n",
|
||||
" \"Developer, embedded applications or devices\", \n",
|
||||
" \"Developer, game or graphics\",\n",
|
||||
" \"Hardware Engineer\" ,\n",
|
||||
" # \"Project manager\", \n",
|
||||
" # \"Product manager\"\n",
|
||||
"]\n",
|
||||
"c = Foo(so_df, \"C\", jobs=cjobs, n_rich_outliers=11)\n",
|
||||
"c.visualize(n_lowest=3, hue=\"DevType\")\n",
|
||||
"c.run_regression(x_transform=log_base_a, change_base=1.25)\n",
|
||||
"c = Foo(so_df, \"C\", jobs=cjobs, n_rich_outliers=30, n_poor_outliers=2)\n",
|
||||
"c.visualize(n_lowest=7, hue=\"DevType\", palette=[\"#57e6da\",\"#d9e352\",\"#cc622d\"] ) \n",
|
||||
"c.run_regression()\n",
|
||||
"c.run_regression(x_transform=log_base_a, change_base=1.3, \n",
|
||||
" x_shift=2, y_shift=-5000, line_color=\"magenta\", random=555)\n",
|
||||
"c.export_image()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "89d86a1e-dc65-48e4-adcf-bb10188fd0b7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Business Understanding\n",
|
||||
"\n",
|
||||
"1. For C programmers, specifically embedded systems, graphics, and hardware engineers, a linear model fits the relationship of income and years of experience.\n",
|
||||
"\n",
|
||||
"2. A coefficient = 11973.469 indicates that for each year of experience, a C programmer typically earns an additional $12,000 per year.\n",
|
||||
"\n",
|
||||
"3. Because the graph looks like a spray of water, after 10 years of experience, the salaries for C programming professionals strongly vary.\n",
|
||||
"\n",
|
||||
"4. A junior C programmer, at 2 years of experience, typically earns $54,776.266 per year.\n",
|
||||
"\n",
|
||||
"5. An r2 score = 0.571 indicates a bit over half of the variability in data is explained by the independent variable. This however is only for incomes below 200,000 dollars. Some participants with 5 years of professional experience were reporting incomes at or around $200,000. These were considered unusual outliers. Among the game developers, they may have independently released a game.\n",
|
||||
"\n",
|
||||
"rmse = 21198.612 indicates the model is off by around 21,000 dollars for a given point in a career.\n",
|
||||
"\n",
|
||||
"### Questions that can be answered\n",
|
||||
"\n",
|
||||
"* Is there a linear relationship between income and years of experience.\n",
|
||||
"* Is there a point in a career where raises stop ocurring?\n",
|
||||
"* What is the typical salary of a entry-level or junior C programmer?\n",
|
||||
"* How much more do C programmers earn for each year of experience?\n",
|
||||
"* How much of the variability is explained by the model and what factors are not considered?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "928f421c-1f2b-4be1-9ce2-8f3593a9a823",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Below cells generate extra or unused graphs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -584,14 +648,6 @@
|
||||
"# 55-64 years old\n",
|
||||
"# 65 years or older"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b954a811-e401-48dc-9ba4-263a5f2cf5c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
Reference in New Issue
Block a user