diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb index 60df211..7b994f5 100644 --- a/stackoverflow-survey.ipynb +++ b/stackoverflow-survey.ipynb @@ -74,14 +74,14 @@ "source": [ "# draw count plot of developers based on age\n", "\n", - "def visualize_devs(df, lang, key='Age'):\n", + "def visualize_devs(df, title, key='Age'):\n", " '''\n", " Draws count plot of developers based on attributes.\n", "\n", " inputs:\n", - " df: a DataFrame, the subset of the data set.\n", - " lang: string, the programming language, used for labeling.\n", - " key: string, the attribute to count (age).\n", + " df: a DataFrame, the subset of the data set.\n", + " title: string, title of the chart.\n", + " key: string, the attribute to count (age).\n", " outputs:\n", " no return values, will draw and save a graphic.\n", " '''\n", @@ -94,9 +94,8 @@ " '45-54 years old', '55-64 years old', \\\n", " '65 years or older', 'Prefer not to say']\n", " sb.countplot(x=key, data=df, order=order)\n", - " title='Ages of %s Programmers' % lang\n", " plt.title(title)\n", - " filename= 'images/%s-of-%s-programmers.png' % (key, lang)\n", + " filename= 'images/%s.png' % title.replace(\" \", \"-\")\n", " plt.savefig(filename, bbox_inches='tight')\n", "\n", "\n", @@ -153,12 +152,13 @@ "metadata": {}, "outputs": [], "source": [ - "visualize_devs( get_c_devs(so_df) , 'C')\n", - "visualize_devs( get_c_devs(so_df, lang='Python') , 'Python')\n", + "visualize_devs( get_c_devs(so_df) , 'Ages of C Programmers')\n", + "visualize_devs( get_c_devs(so_df, lang='Python') , 'Ages of Python Programmers')\n", "\n", "for lang in ['Cobol', 'Prolog', 'Ada']:\n", + " title = 'Ages of %s Programmers' % lang\n", " foo = get_lang_devs(so_df, lang)\n", - " visualize_devs(foo, lang)" + " visualize_devs(foo, title)" ] }, { @@ -168,7 +168,19 @@ "source": [ "## Preparing the Data\n", "\n", - "`__init__()` specifies which rows to omit and which to use, so the data for modeling doesn't look like a shotgun blast of rainbow 
colors." + "`__init__()` specifies which rows to omit and which to use, so the data for modeling doesn't look like a shotgun blast of rainbow colors.\n", + "\n", + "### NaNs are dropped\n", + "\n", + "No values are assumed in place of NaN.\n", + "\n", + "Missing values are dropped for developers who:\n", + "* did not specify their years of professional experience\n", + "* did not disclose their annual compensation.\n", + "\n", + "Roughly half of the developers who fit the criteria for analysis (42% for Python, 54% for C) have not specified their annual compensation. This analysis makes no assumptions about the reason. \n", + "\n", + "The age distribution is similar between those who specified annual compensation and those who declined to do so. This suggests that the missing data does not significantly alter the analysis.\n" ] }, { @@ -187,7 +199,7 @@ "\n", "# still haven't come up with a name\n", "class Foo:\n", - " def __init__(self, dataset, language, jobs=None, \n", + " def __init__(self, df, language, jobs=None, \n", " n_rich_outliers=0, n_poor_outliers=0, \n", " country='United States of America'):\n", " '''\n", @@ -210,10 +222,9 @@ " # focus on people who have given ...\n", " key_x = 'YearsCodePro'\n", " key_y = 'ConvertedCompYearly'\n", - " df = dataset.dropna(subset=[key_x, key_y])\n", " self.key_x = key_x\n", " self.key_y = key_y\n", - " \n", + "\n", " qualifiers = {\n", " 'MainBranch': 'I am a developer by profession',\n", " }\n", @@ -231,7 +242,11 @@ " devs = get_c_devs(df, lang=language)\n", " else:\n", " devs = get_lang_devs(df, language)\n", - " \n", + "\n", + " self.df_no_x = devs[devs[key_x].isnull()]\n", + " self.df_no_y = devs[devs[key_y].isnull()]\n", + " devs = devs.dropna(subset=[key_x, key_y])\n", + "\n", " replacement_dict = {\n", " 'Less than 1 year': '0.5',\n", " 'More than 50 years': '51',\n", @@ -402,6 +417,21 @@ " filename = base_filename % (self.language, self.country)\n", " plt.savefig(filename.replace(' ', '-'), 
bbox_inches='tight')\n", "\n", + " def probe_excluded_rows(self):\n", + " '''\n", + " Display information about developers excluded from analysis.\n", + " '''\n", + " nan_x_count = self.df_no_x.shape[0]\n", + " nan_y_count = self.df_no_y.shape[0]\n", + " print(nan_x_count, 'did not specify', self.key_x)\n", + " print(nan_y_count, 'did not specify', self.key_y)\n", + " print('total developers:', self.devs.shape[0] \n", + " + nan_x_count + nan_y_count)\n", + " title1 = 'Age of %s Programmers excluded from analysis'\n", + " visualize_devs(self.df_no_y, title1 % self.language)\n", + " title2 = 'Age of %s programmers included in analysis'\n", + " visualize_devs(self.devs, title2 % self.language)\n", + " \n", "def show_model_stats(coef, intercept, y_test, y_pred, label):\n", " '''\n", " Displays model performance.\n", @@ -474,7 +504,8 @@ "#python.run_regression(x_transform=log_base_a, change_base=1.20, risky=2, random=555, \n", "# color='pink', name='Risky regression line')\n", "python.run_log_regression(nodraw=False)\n", - "python.export_image()" + "python.export_image()\n", + "python.probe_excluded_rows()" ] }, { @@ -530,7 +561,8 @@ "c.run_regression(x_transform=log_base_a, change_base=1.3, \n", " x_shift=2, y_shift=-5000, color='magenta', random=555)\n", "c.run_log_regression(nodraw=False)\n", - "c.export_image()" + "c.export_image()\n", + "c.probe_excluded_rows()" ] }, {