Probe into developers excluded from analysis.
Added charts comparing participants who did not specify an annual income with those who did. Can also print the number of rows dropped because of NaN values.
This commit is contained in:
@@ -74,14 +74,14 @@
|
||||
"source": [
|
||||
"# draw count plot of developers based on age\n",
|
||||
"\n",
|
||||
"def visualize_devs(df, lang, key='Age'):\n",
|
||||
"def visualize_devs(df, title, key='Age'):\n",
|
||||
" '''\n",
|
||||
" Draws count plot of developers based on attributes.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" df: a DataFrame, the subset of the data set.\n",
|
||||
" lang: string, the programming language, used for labeling.\n",
|
||||
" key: string, the attribute to count (age).\n",
|
||||
" df: a DataFrame, the subset of the data set.\n",
|
||||
" title: string, title of the chart.\n",
|
||||
" key: string, the attribute to count (age).\n",
|
||||
" outputs:\n",
|
||||
" no return values, will draw and save a graphic.\n",
|
||||
" '''\n",
|
||||
@@ -94,9 +94,8 @@
|
||||
" '45-54 years old', '55-64 years old', \\\n",
|
||||
" '65 years or older', 'Prefer not to say']\n",
|
||||
" sb.countplot(x=key, data=df, order=order)\n",
|
||||
" title='Ages of %s Programmers' % lang\n",
|
||||
" plt.title(title)\n",
|
||||
" filename= 'images/%s-of-%s-programmers.png' % (key, lang)\n",
|
||||
" filename= 'images/%s.png' % title.replace(\" \", \"-\")\n",
|
||||
" plt.savefig(filename, bbox_inches='tight')\n",
|
||||
"\n",
|
||||
"\n",
|
||||
@@ -153,12 +152,13 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"visualize_devs( get_c_devs(so_df) , 'C')\n",
|
||||
"visualize_devs( get_c_devs(so_df, lang='Python') , 'Python')\n",
|
||||
"visualize_devs( get_c_devs(so_df) , 'Ages of C Programmers')\n",
|
||||
"visualize_devs( get_c_devs(so_df, lang='Python') , 'Ages of Python Programmers')\n",
|
||||
"\n",
|
||||
"for lang in ['Cobol', 'Prolog', 'Ada']:\n",
|
||||
" title = 'Ages of %s Programmers' % lang\n",
|
||||
" foo = get_lang_devs(so_df, lang)\n",
|
||||
" visualize_devs(foo, lang)"
|
||||
" visualize_devs(foo, title)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -168,7 +168,19 @@
|
||||
"source": [
|
||||
"## Preparing the Data\n",
|
||||
"\n",
|
||||
"`__init__()` specifies which rows to omit and which to use, so the data for modeling doesn't look like a shotgun blast of rainbow colors."
|
||||
"`__init__()` specifies which rows to omit and which to use, so the data for modeling doesn't look like a shotgun blast of rainbow colors.\n",
|
||||
"\n",
|
||||
"### NaNs are dropped\n",
|
||||
"\n",
|
||||
"No values are assumed in the place of NaN.\n",
|
||||
"\n",
|
||||
"Rows are dropped for developers who:\n",
|
||||
"* did not specify their years of professional experience\n",
|
||||
"* did not disclose an annual compensation.\n",
|
||||
"\n",
|
||||
"Roughly half of the developers who fit the criteria for analysis (42% for Python, 54% for C) did not specify their annual compensation. This analysis makes no assumptions about the reason.\n",
|
||||
"\n",
|
||||
"The age distribution is similar between those who specified annual compensation and those who did not. This suggests that the missing data does not significantly skew the analysis.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -187,7 +199,7 @@
|
||||
"\n",
|
||||
"# still haven't come up with a name\n",
|
||||
"class Foo:\n",
|
||||
" def __init__(self, dataset, language, jobs=None, \n",
|
||||
" def __init__(self, df, language, jobs=None, \n",
|
||||
" n_rich_outliers=0, n_poor_outliers=0, \n",
|
||||
" country='United States of America'):\n",
|
||||
" '''\n",
|
||||
@@ -210,10 +222,9 @@
|
||||
" # focus on people who have given ...\n",
|
||||
" key_x = 'YearsCodePro'\n",
|
||||
" key_y = 'ConvertedCompYearly'\n",
|
||||
" df = dataset.dropna(subset=[key_x, key_y])\n",
|
||||
" self.key_x = key_x\n",
|
||||
" self.key_y = key_y\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" qualifiers = {\n",
|
||||
" 'MainBranch': 'I am a developer by profession',\n",
|
||||
" }\n",
|
||||
@@ -231,7 +242,11 @@
|
||||
" devs = get_c_devs(df, lang=language)\n",
|
||||
" else:\n",
|
||||
" devs = get_lang_devs(df, language)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" self.df_no_x = devs[devs[key_x].isnull()]\n",
|
||||
" self.df_no_y = devs[devs[key_y].isnull()]\n",
|
||||
" devs = devs.dropna(subset=[key_x, key_y])\n",
|
||||
"\n",
|
||||
" replacement_dict = {\n",
|
||||
" 'Less than 1 year': '0.5',\n",
|
||||
" 'More than 50 years': '51',\n",
|
||||
@@ -402,6 +417,21 @@
|
||||
" filename = base_filename % (self.language, self.country)\n",
|
||||
" plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n",
|
||||
"\n",
|
||||
" def probe_excluded_rows(self):\n",
|
||||
" '''\n",
|
||||
" Display information about developers excluded from analysis.\n",
|
||||
" '''\n",
|
||||
" nan_x_count = self.df_no_x.shape[0]\n",
|
||||
" nan_y_count = self.df_no_y.shape[0]\n",
|
||||
" print(nan_x_count, 'did not specify', self.key_x)\n",
|
||||
" print(nan_y_count, 'did not specify', self.key_y)\n",
|
||||
" print('total developers:', self.devs.shape[0] \n",
|
||||
" + nan_x_count + nan_y_count)\n",
|
||||
" title1 = 'Age of %s Programmers excluded from analysis'\n",
|
||||
" visualize_devs(self.df_no_y, title1 % self.language)\n",
|
||||
" title2 = 'Age of %s programmers included in analysis'\n",
|
||||
" visualize_devs(self.devs, title2 % self.language)\n",
|
||||
" \n",
|
||||
"def show_model_stats(coef, intercept, y_test, y_pred, label):\n",
|
||||
" '''\n",
|
||||
" Displays model performance.\n",
|
||||
@@ -474,7 +504,8 @@
|
||||
"#python.run_regression(x_transform=log_base_a, change_base=1.20, risky=2, random=555, \n",
|
||||
"# color='pink', name='Risky regression line')\n",
|
||||
"python.run_log_regression(nodraw=False)\n",
|
||||
"python.export_image()"
|
||||
"python.export_image()\n",
|
||||
"python.probe_excluded_rows()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -530,7 +561,8 @@
|
||||
"c.run_regression(x_transform=log_base_a, change_base=1.3, \n",
|
||||
" x_shift=2, y_shift=-5000, color='magenta', random=555)\n",
|
||||
"c.run_log_regression(nodraw=False)\n",
|
||||
"c.export_image()"
|
||||
"c.export_image()\n",
|
||||
"c.probe_excluded_rows()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
Reference in New Issue
Block a user