Write DOCSTRINGS for functions.

Also corrected typo in label.
2025-04-28 20:51:39 -07:00
parent f49283a7cc
commit 2af2414219
1 changed files with 122 additions and 2 deletions
--- a/stackoverflow-survey.ipynb
+++ b/stackoverflow-survey.ipynb
@@ -75,6 +75,16 @@
    "# draw count plot of developers based on age\n",
    "\n",
    "def visualize_devs(df, lang, key='Age'):\n",
+    "    '''\n",
+    "    Draws count plot of developers based on attributes.\n",
+    "\n",
+    "    inputs:\n",
+    "        df:   a DataFrame, the subset of the data set.\n",
+    "        lang: string, the programming language, used for labeling.\n",
+    "        key:  string, the attribute to count (defaults to 'Age').\n",
+    "    outputs:\n",
+    "        no return values, will draw and save a graphic.\n",
+    "    '''\n",
    "    plt.figure()\n",
    "    plt.xticks(rotation=45)\n",
    "    # from:\n",
@@ -91,6 +101,17 @@
    "\n",
    "\n",
    "def get_lang_devs(df, lang):\n",
+    "    '''\n",
+    "    Returns a DataFrame, subset of the data set, of developers that have\n",
+    "    worked with a specified programming language.\n",
+    "\n",
+    "    inputs:\n",
+    "        df:   a DataFrame, can be the entire published data set.\n",
+    "        lang: a string, the programming language.\n",
+    "    outputs:\n",
+    "        a DataFrame of developers that have worked with `lang` programming \n",
+    "        language.\n",
+    "    '''\n",
    "    col = 'LanguageHaveWorkedWith'\n",
    "    # will not work for single character languages (C, R)\n",
    "    # will mangle Java and JavaScript, Python and MicroPython\n",
@@ -98,6 +119,19 @@
    "\n",
    "\n",
    "def get_c_devs(df, lang='C'):\n",
+    "    '''\n",
+    "    Returns a DataFrame, subset of the data set, of developers that have\n",
+    "    worked with a specified programming language.\n",
+    "    Similar to get_lang_devs() but adapted for languages named by a single\n",
+    "    letter, or names like 'Java' which is contained in 'JavaScript'.\n",
+    "\n",
+    "    inputs:\n",
+    "        df:   a DataFrame, can be the entire published data set.\n",
+    "        lang: a string, the programming language.\n",
+    "    outputs:\n",
+    "        a DataFrame of developers that have worked with `lang` programming \n",
+    "        language.\n",
+    "    '''\n",
    "    key = 'LanguageHaveWorkedWith'\n",
    "    cdevs = []\n",
    "    for index, dev in df.iterrows():\n",
@@ -156,6 +190,19 @@
    "    def __init__(self, dataset, language, jobs=None, \n",
    "                 n_rich_outliers=0, n_poor_outliers=0, \n",
    "                 country='United States of America'):\n",
+    "        '''\n",
+    "        inputs:\n",
+    "            dataset:  A DataFrame, can be the full data set.\n",
+    "            language: string, the programming language \n",
+    "                a developer has worked with.\n",
+    "            jobs:     list of strings, job positions,\n",
+    "                typically domains where the language is dominant.\n",
+    "            n_rich_outliers: integer, removes samples from the \n",
+    "                upper limit of the y-axis.\n",
+    "            n_poor_outliers: integer, removes samples from the \n",
+    "                lower limit of the y-axis.\n",
+    "            country: string, specifies the country of origin.\n",
+    "        '''\n",
    "        self.devs   = None\n",
    "        self.canvas = None\n",
    "        self.language = language\n",
@@ -204,7 +251,16 @@
    "        del devs, new_column\n",
    "    \n",
    "    def visualize(self,  hue='Country', \n",
-    "                  palette=sb.color_palette() ):    \n",
+    "                  palette=sb.color_palette() ):\n",
+    "        '''\n",
+    "        Draw scatter plot of samples included in self.devs.\n",
+    "\n",
+    "        inputs:\n",
+    "            hue:     string, colorize dots by a given key.\n",
+    "            palette: list of strings (color codes)\n",
+    "                     or string (matplotlib predefined palettes),\n",
+    "                     specifies the colors to use when coloring dots.\n",
+    "        '''\n",
    "        self.canvas = plt.figure()\n",
    "        key_x = self.key_x\n",
    "        key_y = self.key_y\n",
@@ -220,6 +276,29 @@
    "                       x_shift=0, y_shift=0,\n",
    "                       random=333, risky=0,\n",
    "                       color='red', name='Regression Line' ):\n",
+    "        '''\n",
+    "        Runs linear regression and draws a straight line.\n",
+    "\n",
+    "        inputs:\n",
+    "            x_transform: function, function to tune the independent variable.\n",
+    "            change_base: float or integer, specifies base \n",
+    "                for logarithmic function, not used if x_transform is None.\n",
+    "            x_shift: integer, for tuning, shifts the position \n",
+    "                of the line on the x-axis.\n",
+    "            y_shift: integer, for tuning, shifts the position \n",
+    "                of the line on the y-axis.\n",
+    "            random:  integer, random seed for train_test_split; \n",
+    "                change to test generalization.\n",
+    "            risky:   integer ranging from 0 to 2,\n",
+    "                    0 = does nothing (default),\n",
+    "                    1 = sorts the independent variable,\n",
+    "                    2 = sorts the dependent variable,\n",
+    "                performs a non-recommended operation to sort the data,\n",
+    "                risking the model training on the order of values.\n",
+    "                May draw nice lines that generalize across random states.\n",
+    "            color: string, color of the regression line.\n",
+    "            name:  string, label of regression line on the legend.\n",
+    "        '''\n",
    "        df = self.devs # .sort_values(by = self.key2)\n",
    "        X = df[[self.key_x]]\n",
    "        y = df[[self.key_y]]\n",
@@ -251,7 +330,7 @@
    "    \n",
    "        m = model.coef_[0][0]\n",
    "        b = model.intercept_[0]\n",
-    "        label = '%s log regression line for %s' % (color, self.language)\n",
+    "        label = '%s regression line for %s' % (color, self.language)\n",
    "        show_model_stats(m, b, y_test, y_pred, label)\n",
    "\n",
    "        plt.figure(self.canvas)\n",
@@ -262,6 +341,20 @@
    "        del y_pred, model, X, y\n",
    "\n",
    "    def run_log_regression(self, color='pink', nodraw=True):\n",
+    "        '''\n",
+    "        Runs logarithmic regression and draws a line that bends\n",
+    "        at the point of diminishing returns.\n",
+    "\n",
+    "        Logarithmic regression provides a better fit for the data;\n",
+    "        however, it is not part of the course.\n",
+    "\n",
+    "        Can illustrate an interesting relationship between the\n",
+    "        \"default\" linear model and a tuned linear model.\n",
+    "\n",
+    "        inputs:\n",
+    "            color:   color of the regression line.\n",
+    "            nodraw:  whether or not to draw the line.\n",
+    "        '''\n",
    "        df = self.devs\n",
    "        X = df[[self.key_x]] #.sort_values(by=self.key_x)\n",
    "        y = df[[self.key_y]] #.sort_values(by=self.key_y)\n",
@@ -296,11 +389,31 @@
    "        plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
    "\n",
    "    def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n",
+    "        '''\n",
+    "        Saves canvas to file.\n",
+    "\n",
+    "        inputs:\n",
+    "            base_filename: string with two '%s' format codes;\n",
+    "                they are filled in, in order, with...\n",
+    "                1. the programming language\n",
+    "                2. the country of origin.\n",
+    "        '''\n",
    "        plt.figure(self.canvas)\n",
    "        filename = base_filename % (self.language, self.country)\n",
    "        plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n",
    "\n",
    "def show_model_stats(coef, intercept, y_test, y_pred, label):\n",
+    "    '''\n",
+    "    Displays model performance.\n",
+    "\n",
+    "    inputs:\n",
+    "        coef:      the coefficient of the model.\n",
+    "        intercept: the y-intercept of the model.\n",
+    "        y_test:    true values to compare against model predictions.\n",
+    "        y_pred:    prediction values from the model.\n",
+    "    \n",
+    "        label:     string, to help identify which line (e.g. color).\n",
+    "    '''\n",
    "    print('+----------------------+')\n",
    "    print(label)\n",
    "    print('coefficient = %0.2f' % coef)\n",
@@ -315,6 +428,13 @@
    "\n",
    "# the higher a is, the steeper the line gets\n",
    "def log_base_a(x, a=1.07):\n",
+    "    '''\n",
+    "    Transforms 'x' as log10(x) / ln(a); note this is log_a(x) / ln(10).\n",
+    "\n",
+    "    inputs:\n",
+    "        x: numeric, the variable to be transformed.\n",
+    "        a: numeric, the new base.\n",
+    "    '''\n",
    "    return np.log10(x)/np.log(a)"
   ]
  },