Rewrote explanation for dropped rows.

Because the function about excluded rows was changed to show a different key.
Expanded probe_excluded_rows().
2025-04-30 04:06:51 -07:00 · 2025-04-30 03:29:07 -07:00 · 2025-04-29 08:13:45 -07:00 · 2025-04-29 08:13:26 -07:00
1 changed files with 190 additions and 16 deletions
--- a/stackoverflow-survey.ipynb
+++ b/stackoverflow-survey.ipynb
@@ -74,7 +74,17 @@
   "source": [
    "# draw count plot of developers based on age\n",
    "\n",
-    "def visualize_devs(df, lang, key='Age'):\n",
+    "def visualize_devs(df, title, key='Age'):\n",
+    "    '''\n",
+    "    Draws count plot of developers based on attributes.\n",
+    "\n",
+    "    inputs:\n",
+    "        df:    a DataFrame, the subset of the data set.\n",
+    "        title: string, title of the chart.\n",
+    "        key:   string, the attribute to count (age).\n",
+    "    outputs:\n",
+    "        no return values, will draw and save a graphic.\n",
+    "    '''\n",
    "    plt.figure()\n",
    "    plt.xticks(rotation=45)\n",
    "    # from:\n",
@@ -84,13 +94,23 @@
    "              '45-54 years old', '55-64 years old',  \\\n",
    "              '65 years or older', 'Prefer not to say']\n",
    "    sb.countplot(x=key, data=df, order=order)\n",
-    "    title='Ages of %s Programmers' % lang\n",
    "    plt.title(title)\n",
-    "    filename= 'images/%s-of-%s-programmers.png' % (key, lang)\n",
+    "    filename= 'images/%s.png' % title.replace(\" \", \"-\")\n",
    "    plt.savefig(filename, bbox_inches='tight')\n",
    "\n",
    "\n",
    "def get_lang_devs(df, lang):\n",
+    "    '''\n",
+    "    Returns a DataFrame, subset of the data set, of developers that have\n",
+    "    worked with a specified programming language.\n",
+    "\n",
+    "    inputs:\n",
+    "        df:   a DataFrame, can be the entire published data set.\n",
+    "        lang: a string, the programming language.\n",
+    "    outputs:\n",
+    "        a DataFrame of developers that have worked with `lang` programming \n",
+    "        language.\n",
+    "    '''\n",
    "    col = 'LanguageHaveWorkedWith'\n",
    "    # will not work for single character languages (C, R)\n",
    "    # will mangle Java and JavaScript, Python and MicroPython\n",
@@ -98,6 +118,19 @@
    "\n",
    "\n",
    "def get_c_devs(df, lang='C'):\n",
+    "    '''\n",
+    "    Returns a DataFrame, subset of the data set, of developers that have\n",
+    "    worked with a specified programming language.\n",
+    "    Similar to get_lang_devs() but adapted for languages named by a single\n",
+    "    letter, or names like 'Java' which is contained in 'JavaScript'.\n",
+    "\n",
+    "    inputs:\n",
+    "        df:   a DataFrame, can be the entire published data set.\n",
+    "        lang: a string, the programming language.\n",
+    "    outputs:\n",
+    "        a DataFrame of developers that have worked with `lang` programming \n",
+    "        language.\n",
+    "    '''\n",
    "    key = 'LanguageHaveWorkedWith'\n",
    "    cdevs = []\n",
    "    for index, dev in df.iterrows():\n",
@@ -119,12 +152,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "visualize_devs( get_c_devs(so_df) , 'C')\n",
-    "visualize_devs( get_c_devs(so_df, lang='Python') , 'Python')\n",
+    "visualize_devs( get_c_devs(so_df) , 'Ages of C Programmers')\n",
+    "visualize_devs( get_c_devs(so_df, lang='Python') , 'Ages of Python Programmers')\n",
    "\n",
    "for lang in ['Cobol', 'Prolog', 'Ada']:\n",
+    "    title = 'Ages of %s Programmers' % lang\n",
    "    foo = get_lang_devs(so_df, lang)\n",
-    "    visualize_devs(foo, lang)"
+    "    visualize_devs(foo, title)"
   ]
  },
  {
@@ -134,7 +168,22 @@
   "source": [
    "## Preparing the Data\n",
    "\n",
-    "`__init__()` specifies which rows to omit and which to use, so the data for modeling doesn't look like a shotgun blast of rainbow colors."
+    "`__init__()` specifies which rows to omit and which to use, so the data for modeling doesn't look like a shotgun blast of rainbow colors.\n",
+    "\n",
+    "### NaNs are dropped\n",
+    "\n",
+    "No values are assumed in the place of NaN for keys 'YearsCodePro' and 'ConvertedCompYearly'.\n",
+    "\n",
+    "Rows with NaN are dropped for developers who:\n",
+    "* did not specify their years of professional experience\n",
+    "* did not disclose an annual compensation.\n",
+    "\n",
+    "More developers declined to specify their income than their years of experience. Between total and included rows, the distribution of years of experience is similar. This suggests that the analysis is not significantly altered by the missing data.\n",
+    "\n",
+    "See charts\n",
+    "\n",
+    "* Python Developers Total vs Included\n",
+    "* C Developers Total vs Included"
   ]
  },
  {
@@ -153,9 +202,22 @@
    "\n",
    "# still haven't come up with a name\n",
    "class Foo:\n",
-    "    def __init__(self, dataset, language, jobs=None, \n",
+    "    def __init__(self, df, language, jobs=None, \n",
    "                 n_rich_outliers=0, n_poor_outliers=0, \n",
    "                 country='United States of America'):\n",
+    "        '''\n",
+    "        inputs:\n",
+    "            df:       A DataFrame, can be the full data set.\n",
+    "            language: string, the programming language \n",
+    "                a developer has worked with.\n",
+    "            jobs:     list of strings, job positions \n",
+    "            - typically domains where the language is dominant.\n",
+    "            n_rich_outliers: integer, removes samples from the \n",
+    "                upper limit of the y-axis.\n",
+    "            n_poor_outliers: integer, removes samples from the \n",
+    "                lower limit of the y-axis.\n",
+    "            country: string, specifies the country of origin.\n",
+    "        '''\n",
    "        self.devs   = None\n",
    "        self.canvas = None\n",
    "        self.language = language\n",
@@ -163,10 +225,9 @@
    "        # focus on people who have given ...\n",
    "        key_x  = 'YearsCodePro'\n",
    "        key_y  = 'ConvertedCompYearly'\n",
-    "        df   = dataset.dropna(subset=[key_x, key_y])\n",
    "        self.key_x = key_x\n",
    "        self.key_y = key_y\n",
-    "    \n",
+    "\n",
    "        qualifiers = {\n",
    "            'MainBranch': 'I am a developer by profession',\n",
    "       }\n",
@@ -184,7 +245,11 @@
    "            devs = get_c_devs(df, lang=language)\n",
    "        else:\n",
    "            devs = get_lang_devs(df, language)\n",
-    "        \n",
+    "\n",
+    "        self.df_no_x = devs[devs[key_x].isnull()]\n",
+    "        self.df_no_y = devs[devs[key_y].isnull()]\n",
+    "        devs  = devs.dropna(subset=[key_x, key_y])\n",
+    "\n",
    "        replacement_dict = {\n",
    "            'Less than 1 year': '0.5',\n",
    "            'More than 50 years': '51',\n",
@@ -192,8 +257,12 @@
    "\n",
    "        # https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n",
    "        pd.options.mode.chained_assignment = None  # default='warn'\n",
+    "    \n",
    "        new_column = devs[key_x].replace(replacement_dict)\n",
-    "        devs[key_x] = pd.to_numeric(new_column, errors='coerce')\n",
+    "        devs[key_x] = pd.to_numeric(new_column, errors='raise')\n",
+    "\n",
+    "        new_column = self.df_no_y[key_x].replace(replacement_dict)\n",
+    "        self.df_no_y[key_x] = pd.to_numeric(new_column, errors='raise')\n",
    "        pd.options.mode.chained_assignment = 'warn'  # default='warn'\n",
    "        # print( devs[key_x].unique() )\n",
    "        \n",
@@ -204,7 +273,16 @@
    "        del devs, new_column\n",
    "    \n",
    "    def visualize(self,  hue='Country', \n",
-    "                  palette=sb.color_palette() ):    \n",
+    "                  palette=sb.color_palette() ):\n",
+    "        '''\n",
+    "        Draw scatter plot of samples included in self.devs.\n",
+    "\n",
+    "        inputs:\n",
+    "            hue:     string, colorize dots by a given key.\n",
+    "            palette: list of strings (color codes)\n",
+    "                     or string (matplotlib predefined palettes),\n",
+    "                     specifies the colors to use when coloring dots.\n",
+    "        '''\n",
    "        self.canvas = plt.figure()\n",
    "        key_x = self.key_x\n",
    "        key_y = self.key_y\n",
@@ -220,6 +298,29 @@
    "                       x_shift=0, y_shift=0,\n",
    "                       random=333, risky=0,\n",
    "                       color='red', name='Regression Line' ):\n",
+    "        '''\n",
+    "        Runs linear regression and draws a straight line.\n",
+    "\n",
+    "        inputs:\n",
+    "            x_transform: function, function to tune the independent variable.\n",
+    "            change_base: float or integer, specifies base \n",
+    "                for logarithmic function, not used if x_transform is None.\n",
+    "            x_shift: integer, for tuning, shifts the position \n",
+    "                of the line on the x-axis.\n",
+    "            y_shift: integer, for tuning, shifts the position \n",
+    "                of the line on the y-axis.\n",
+    "            random:  integer, random seed for train_test_split; \n",
+    "                change to test generalization.\n",
+    "            risky:   integer ranging from 0 to 2,\n",
+    "                    0 = does nothing (default),\n",
+    "                    1 = sorts the independent variable,\n",
+    "                    2 = sorts the dependent variable,\n",
+    "               performs unrecommended operation to sort data,\n",
+    "               risking the model training on the order of values.\n",
+    "               May draw nice lines that generalize across random states.\n",
+    "           color: string, color of the regression line.\n",
+    "           name:  string, label of regression line on the legend.\n",
+    "        '''\n",
    "        df = self.devs # .sort_values(by = self.key2)\n",
    "        X = df[[self.key_x]]\n",
    "        y = df[[self.key_y]]\n",
@@ -251,7 +352,7 @@
    "    \n",
    "        m = model.coef_[0][0]\n",
    "        b = model.intercept_[0]\n",
-    "        label = '%s log regression line for %s' % (color, self.language)\n",
+    "        label = '%s regression line for %s' % (color, self.language)\n",
    "        show_model_stats(m, b, y_test, y_pred, label)\n",
    "\n",
    "        plt.figure(self.canvas)\n",
@@ -262,6 +363,20 @@
    "        del y_pred, model, X, y\n",
    "\n",
    "    def run_log_regression(self, color='pink', nodraw=True):\n",
+    "        '''\n",
+    "        Runs logarithmic regression and draws a line that contours \n",
+    "        at the point of diminishing returns.\n",
+    "\n",
+    "        Logarithmic regression provides a better fit for the data;\n",
+    "        however, it is not part of the course.\n",
+    "\n",
+    "        Can illustrate an interesting relationship between the\n",
+    "        \"default\" linear model and a tuned linear model.\n",
+    "\n",
+    "        inputs:\n",
+    "            color:   color of the regression line.\n",
+    "            nodraw:  whether or not to draw the line.\n",
+    "        '''\n",
    "        df = self.devs\n",
    "        X = df[[self.key_x]] #.sort_values(by=self.key_x)\n",
    "        y = df[[self.key_y]] #.sort_values(by=self.key_y)\n",
@@ -296,11 +411,61 @@
    "        plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
    "\n",
    "    def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n",
+    "        '''\n",
+    "        Saves canvas to file.\n",
+    "\n",
+    "        inputs:\n",
+    "            base_filename: string with two format codes (two strings),\n",
+    "                this string will be interpolated by...\n",
+    "                1. the programming language\n",
+    "                2. the country of origin.\n",
+    "        '''\n",
    "        plt.figure(self.canvas)\n",
    "        filename = base_filename % (self.language, self.country)\n",
    "        plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n",
    "\n",
+    "    def probe_excluded_rows(self):\n",
+    "        '''\n",
+    "        Display information about developers excluded from analysis.\n",
+    "        '''\n",
+    "        nan_x_count = self.df_no_x.shape[0]\n",
+    "        nan_y_count = self.df_no_y.shape[0]\n",
+    "        print(nan_x_count, 'did not specify', self.key_x)\n",
+    "        print(nan_y_count, 'did not specify', self.key_y)\n",
+    "        print('total developers:', self.devs.shape[0] \n",
+    "              + nan_x_count + nan_y_count)\n",
+    "        title = '%s Developers Total vs Included' % self.language\n",
+    "        total_devs = pd.concat([self.devs, self.df_no_y])\n",
+    "    \n",
+    "        plt.figure()\n",
+    "        plt.title(title)\n",
+    "        plt.xticks(rotation=45)\n",
+    "        key   = self.key_x\n",
+    "\n",
+    "        bins = [0, 10, 20, 30, 40, 50]\n",
+    "        labels = ['0-10', '11-20', '21-30', '31-40', '41-50']\n",
+    "        total_binned = pd.cut(total_devs[key], bins=bins, labels=labels).to_frame()\n",
+    "        devs_binned  = pd.cut(self.devs[key], bins=bins, labels=labels).to_frame()\n",
+    "\n",
+    "        sb.countplot(x=key, data=total_binned, label='total')\n",
+    "        sb.countplot(x=key, data=devs_binned,\n",
+    "                     color='red', label='included in analysis')\n",
+    "        plt.legend()\n",
+    "        plt.savefig('images/%s-total-vs-included.png' % self.language)\n",
+    "        \n",
+    "    \n",
    "def show_model_stats(coef, intercept, y_test, y_pred, label):\n",
+    "    '''\n",
+    "    Displays model performance.\n",
+    "\n",
+    "    inputs:\n",
+    "        coef:      the coefficient of the model.\n",
+    "        intercept: the y-intercept of the model.\n",
+    "        y_test:    true values to compare against model predictions.\n",
+    "        y_pred:    prediction values from the model.\n",
+    "    \n",
+    "        label:     string, to help identify which line (e.g color).\n",
+    "    '''\n",
    "    print('+----------------------+')\n",
    "    print(label)\n",
    "    print('coefficient = %0.2f' % coef)\n",
@@ -315,6 +480,13 @@
    "\n",
    "# the higher a is, the steeper the line gets\n",
    "def log_base_a(x, a=1.07):\n",
+    "    '''\n",
+    "    Returns np.log10(x) / np.log(a), i.e. log base 'a' of 'x' divided by ln(10).\n",
+    "\n",
+    "    inputs:\n",
+    "        x: numeric, the variable to be transformed.\n",
+    "        a: numeric, the new base.\n",
+    "    '''\n",
    "    return np.log10(x)/np.log(a)"
   ]
  },
@@ -354,7 +526,8 @@
    "#python.run_regression(x_transform=log_base_a, change_base=1.20, risky=2, random=555, \n",
    "#                      color='pink', name='Risky regression line')\n",
    "python.run_log_regression(nodraw=False)\n",
-    "python.export_image()"
+    "python.export_image()\n",
+    "python.probe_excluded_rows()"
   ]
  },
  {
@@ -410,7 +583,8 @@
    "c.run_regression(x_transform=log_base_a, change_base=1.3, \n",
    "                 x_shift=2, y_shift=-5000, color='magenta', random=555)\n",
    "c.run_log_regression(nodraw=False)\n",
-    "c.export_image()"
+    "c.export_image()\n",
+    "c.probe_excluded_rows()"
   ]
  },
  {
Author	SHA1	Message	Date
scuti	2fd699497f	Rewrote explanation for dropped rows. Because the function about excluded rows was changed to show a different key.	2025-04-30 04:06:51 -07:00
scuti	08ab9f126c	Expanded probe_excluded_rows(). Shows blue bars to represent total professional developers, and red bar to represent those included in the analysis. x-axis is years of professional experience (changed from age).	2025-04-30 03:29:07 -07:00
scuti	d5443bd1fb	Probe into developers excluded from analysis. Added charts on participants who did not specify an annual income compared with those who did. Can print quantity of rows with NaN dropped.	2025-04-29 08:13:45 -07:00
scuti	2af2414219	Write DOCSTRINGS for functions. Also corrected typo in label.	2025-04-29 08:13:26 -07:00