diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb index a5acebe..60df211 100644 --- a/stackoverflow-survey.ipynb +++ b/stackoverflow-survey.ipynb @@ -75,6 +75,16 @@ "# draw count plot of developers based on age\n", "\n", "def visualize_devs(df, lang, key='Age'):\n", + " '''\n", + " Draws count plot of developers based on attributes.\n", + "\n", + " inputs:\n", + " df: a DataFrame, the subset of the data set.\n", + " lang: string, the programming language, used for labeling.\n", + " key: string, the attribute to count (age).\n", + " outputs:\n", + " no return values, will draw and save a graphic.\n", + " '''\n", " plt.figure()\n", " plt.xticks(rotation=45)\n", " # from:\n", @@ -91,6 +101,17 @@ "\n", "\n", "def get_lang_devs(df, lang):\n", + " '''\n", + " Returns a DataFrame, subset of the data set, of developers that have\n", + " worked with a specified programming language.\n", + "\n", + " inputs:\n", + " df: a DataFrame, can be the entire published data set.\n", + " lang: a string, the programming language.\n", + " outputs:\n", + " a DataFrame of developers that have worked with `lang` programming \n", + " language.\n", + " '''\n", " col = 'LanguageHaveWorkedWith'\n", " # will not work for single character languages (C, R)\n", " # will mangle Java and JavaScript, Python and MicroPython\n", @@ -98,6 +119,19 @@ "\n", "\n", "def get_c_devs(df, lang='C'):\n", + " '''\n", + " Returns a DataFrame, subset of the data set, of developers that have\n", + " worked with a specified programming language.\n", + " Similar to get_lang_devs() but adapted for languages named by a single\n", + " letter, or names like 'Java' which is contained in 'JavaScript'.\n", + "\n", + " inputs:\n", + " df: a DataFrame, can be the entire published data set.\n", + " lang: a string, the programming language.\n", + " outputs:\n", + " a DataFrame of developers that have worked with `lang` programming \n", + " language.\n", + " '''\n", " key = 'LanguageHaveWorkedWith'\n", " cdevs = 
[]\n", " for index, dev in df.iterrows():\n", @@ -156,6 +190,19 @@ " def __init__(self, dataset, language, jobs=None, \n", " n_rich_outliers=0, n_poor_outliers=0, \n", " country='United States of America'):\n", + " '''\n", + " inputs:\n", + " dataset: A DataFrame, can be the full data set.\n", + " language: string, the programming language \n", + " a developer has worked with.\n", + " jobs: list of strings, job positions \n", + " - typically domains where the language is dominant.\n", + " n_rich_outliers: integer, removes samples from the \n", + " upper limit of the y-axis.\n", + " n_poor_outliers: integer, removes samples from the \n", + " lower limit of the y-axis.\n", + " country: string, specifies the country of origin.\n", + " '''\n", " self.devs = None\n", " self.canvas = None\n", " self.language = language\n", @@ -204,7 +251,16 @@ " del devs, new_column\n", " \n", " def visualize(self, hue='Country', \n", - " palette=sb.color_palette() ): \n", + " palette=sb.color_palette() ):\n", + " '''\n", + " Draw scatter plot of samples included in self.devs.\n", + "\n", + " inputs:\n", + " hue: string, colorize dots by a given key.\n", + " palette: list of strings (color codes)\n", + " or string (matplotlib predefined palettes),\n", + " specifies the colors to use when coloring dots.\n", + " '''\n", " self.canvas = plt.figure()\n", " key_x = self.key_x\n", " key_y = self.key_y\n", @@ -220,6 +276,29 @@ " x_shift=0, y_shift=0,\n", " random=333, risky=0,\n", " color='red', name='Regression Line' ):\n", + " '''\n", + " Runs linear regression and draws a straight line.\n", + "\n", + " inputs:\n", + " x_transform: function, function to tune the independent variable.\n", + " change_base: float or integer, specifies base \n", + " for logarithmic function, not used if x_transform is None.\n", + " x_shift: integer, for tuning, shifts the position \n", + " of the line on the x-axis.\n", + " y_shift: integer, for tuning, shifts the position \n", + " of the line on the y-axis.\n", 
+ " random: integer, random seed for train_test_split; \n", + " change to test generalization.\n", + " risky: integer ranging from 0 to 2,\n", + " 0 = does nothing (default),\n", + " 1 = sorts the independent variable,\n", + " 2 = sorts the dependent variable,\n", + " performs unrecommended operation to sort data,\n", + " risking the model training on the order of values.\n", + " May draw nice lines that generalize across random states.\n", + " color: string, color of the regression line.\n", + " name: string, label of regression line on the legend.\n", + " '''\n", " df = self.devs # .sort_values(by = self.key2)\n", " X = df[[self.key_x]]\n", " y = df[[self.key_y]]\n", @@ -251,7 +330,7 @@ " \n", " m = model.coef_[0][0]\n", " b = model.intercept_[0]\n", - " label = '%s log regression line for %s' % (color, self.language)\n", + " label = '%s regression line for %s' % (color, self.language)\n", " show_model_stats(m, b, y_test, y_pred, label)\n", "\n", " plt.figure(self.canvas)\n", @@ -262,6 +341,20 @@ " del y_pred, model, X, y\n", "\n", " def run_log_regression(self, color='pink', nodraw=True):\n", + " '''\n", + " Runs logarithmic regression and draws a line that contours \n", + " at the point of diminishing returns.\n", + "\n", + " Logarithmic regression provides a better fit for the data;\n", + " however, it is not part of the course.\n", + "\n", + " Can illustrate an interesting relationship between the\n", + " \"default\" linear model and a tuned linear model.\n", + "\n", + " inputs:\n", + " color: color of the regression line.\n", + " nodraw: whether or not to draw the line.\n", + " '''\n", " df = self.devs\n", " X = df[[self.key_x]] #.sort_values(by=self.key_x)\n", " y = df[[self.key_y]] #.sort_values(by=self.key_y)\n", @@ -296,11 +389,31 @@ " plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n", "\n", " def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n", + " '''\n", + " Saves canvas to file.\n", + "\n", + " inputs:\n", + " 
base_filename: string with two format codes (two strings),\n", + " this string will be interpolated by...\n", + " 1. the programming language\n", + " 2. the country of origin.\n", + " '''\n", " plt.figure(self.canvas)\n", " filename = base_filename % (self.language, self.country)\n", " plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n", "\n", "def show_model_stats(coef, intercept, y_test, y_pred, label):\n", + " '''\n", + " Displays model performance.\n", + "\n", + " inputs:\n", + " coef: the coefficient of the model.\n", + " intercept: the y-intercept of the model.\n", + " y_test: true values to compare against model predictions.\n", + " y_pred: prediction values from the model.\n", + " \n", + " label: string, to help identify which line (e.g. color).\n", + " '''\n", " print('+----------------------+')\n", " print(label)\n", " print('coefficient = %0.2f' % coef)\n", @@ -315,6 +428,13 @@ "\n", "# the higher a is, the steeper the line gets\n", "def log_base_a(x, a=1.07):\n", + " '''\n", + " Performs logarithmic transformation of value 'x', computed as log10(x) / ln(a).\n", + "\n", + " inputs:\n", + " x: numeric, the variable to be transformed.\n", + " a: numeric, the base whose natural log divides log10(x).\n", + " '''\n", " return np.log10(x)/np.log(a)" ] },