Write DOCSTRINGS for functions.
Also corrected typo in label.
This commit is contained in:
@@ -75,6 +75,16 @@
|
||||
"# draw count plot of developers based on age\n",
|
||||
"\n",
|
||||
"def visualize_devs(df, lang, key='Age'):\n",
|
||||
" '''\n",
|
||||
" Draws count plot of developers based on attributes.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" df: a DataFrame, the subset of the data set.\n",
|
||||
" lang: string, the programming language, used for labeling.\n",
|
||||
" key: string, the attribute to count (defaults to 'Age').\n",
|
||||
" outputs:\n",
|
||||
" no return values, will draw and save a graphic.\n",
|
||||
" '''\n",
|
||||
" plt.figure()\n",
|
||||
" plt.xticks(rotation=45)\n",
|
||||
" # from:\n",
|
||||
@@ -91,6 +101,17 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_lang_devs(df, lang):\n",
|
||||
" '''\n",
|
||||
" Returns a DataFrame, subset of the data set, of developers that have\n",
|
||||
" worked with a specified programming language.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" df: a DataFrame, can be the entire published data set.\n",
|
||||
" lang: a string, the programming language.\n",
|
||||
" outputs:\n",
|
||||
" a DataFrame of developers that have worked with `lang` programming \n",
|
||||
" language.\n",
|
||||
" '''\n",
|
||||
" col = 'LanguageHaveWorkedWith'\n",
|
||||
" # will not work for single character languages (C, R)\n",
|
||||
" # will mangle Java and JavaScript, Python and MicroPython\n",
|
||||
@@ -98,6 +119,19 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_c_devs(df, lang='C'):\n",
|
||||
" '''\n",
|
||||
" Returns a DataFrame, subset of the data set, of developers that have\n",
|
||||
" worked with a specified programming language.\n",
|
||||
" Similar to get_lang_devs() but adapted for languages named by a single\n",
|
||||
" letter, or names like 'Java' which is contained in 'JavaScript'.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" df: a DataFrame, can be the entire published data set.\n",
|
||||
" lang: a string, the programming language.\n",
|
||||
" outputs:\n",
|
||||
" a DataFrame of developers that have worked with `lang` programming \n",
|
||||
" language.\n",
|
||||
" '''\n",
|
||||
" key = 'LanguageHaveWorkedWith'\n",
|
||||
" cdevs = []\n",
|
||||
" for index, dev in df.iterrows():\n",
|
||||
@@ -156,6 +190,19 @@
|
||||
" def __init__(self, dataset, language, jobs=None, \n",
|
||||
" n_rich_outliers=0, n_poor_outliers=0, \n",
|
||||
" country='United States of America'):\n",
|
||||
" '''\n",
|
||||
" inputs:\n",
|
||||
" dataset: A DataFrame, can be the full data set.\n",
|
||||
" language: string, the programming language \n",
|
||||
" a developer has worked with.\n",
|
||||
" jobs: list of strings, job positions \n",
|
||||
" - typically domains where the language is dominant.\n",
|
||||
" n_rich_outliers: integer, removes samples from the \n",
|
||||
" upper limit of the y-axis.\n",
|
||||
" n_poor_outliers: integer, removes samples from the \n",
|
||||
" lower limit of the y-axis.\n",
|
||||
" country: string, specifies the country of origin.\n",
|
||||
" '''\n",
|
||||
" self.devs = None\n",
|
||||
" self.canvas = None\n",
|
||||
" self.language = language\n",
|
||||
@@ -204,7 +251,16 @@
|
||||
" del devs, new_column\n",
|
||||
" \n",
|
||||
" def visualize(self, hue='Country', \n",
|
||||
" palette=sb.color_palette() ): \n",
|
||||
" palette=sb.color_palette() ):\n",
|
||||
" '''\n",
|
||||
" Draw scatter plot of samples included in self.devs.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" hue: string, colorize dots by a given key.\n",
|
||||
" palette: list of strings (color codes)\n",
|
||||
" or string (matplotlib predefined palettes),\n",
|
||||
" specifies the colors to use when coloring dots.\n",
|
||||
" '''\n",
|
||||
" self.canvas = plt.figure()\n",
|
||||
" key_x = self.key_x\n",
|
||||
" key_y = self.key_y\n",
|
||||
@@ -220,6 +276,29 @@
|
||||
" x_shift=0, y_shift=0,\n",
|
||||
" random=333, risky=0,\n",
|
||||
" color='red', name='Regression Line' ):\n",
|
||||
" '''\n",
|
||||
" Runs linear regression and draws a straight line.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" x_transform: function, function to tune the independent variable.\n",
|
||||
" change_base: float or integer, specifies base \n",
|
||||
" for logarithmic function, not used if x_transform is None.\n",
|
||||
" x_shift: integer, for tuning, shifts the position \n",
|
||||
" of the line on the x-axis.\n",
|
||||
" y_shift: integer, for tuning, shifts the position \n",
|
||||
" of the line on the y-axis.\n",
|
||||
" random: integer, random seed for train_test_split; \n",
|
||||
" change to test generalization.\n",
|
||||
" risky: integer ranging from 0 to 2,\n",
|
||||
" 0 = does nothing (default),\n",
|
||||
" 1 = sorts the independent variable,\n",
|
||||
" 2 = sorts the dependent variable,\n",
|
||||
" performs unrecommended operation to sort data,\n",
|
||||
" risking the model training on the order of values.\n",
|
||||
" May draw nice lines that generalize across random states.\n",
|
||||
" color: string, color of the regression line.\n",
|
||||
" name: string, label of regression line on the legend.\n",
|
||||
" '''\n",
|
||||
" df = self.devs # .sort_values(by = self.key2)\n",
|
||||
" X = df[[self.key_x]]\n",
|
||||
" y = df[[self.key_y]]\n",
|
||||
@@ -251,7 +330,7 @@
|
||||
" \n",
|
||||
" m = model.coef_[0][0]\n",
|
||||
" b = model.intercept_[0]\n",
|
||||
" label = '%s log regression line for %s' % (color, self.language)\n",
|
||||
" label = '%s regression line for %s' % (color, self.language)\n",
|
||||
" show_model_stats(m, b, y_test, y_pred, label)\n",
|
||||
"\n",
|
||||
" plt.figure(self.canvas)\n",
|
||||
@@ -262,6 +341,20 @@
|
||||
" del y_pred, model, X, y\n",
|
||||
"\n",
|
||||
" def run_log_regression(self, color='pink', nodraw=True):\n",
|
||||
" '''\n",
|
||||
" Runs logarithmic regression and draws a line that contours \n",
|
||||
" at the point of diminishing returns.\n",
|
||||
"\n",
|
||||
" Logarithmic regression provides a better fit for the data;\n",
|
||||
" however, it is not part of the course.\n",
|
||||
"\n",
|
||||
" Can illustrate an interesting relationship between the\n",
|
||||
" \"default\" linear model and a tuned linear model.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" color: color of the regression line.\n",
|
||||
" nodraw: whether or not to draw the line.\n",
|
||||
" '''\n",
|
||||
" df = self.devs\n",
|
||||
" X = df[[self.key_x]] #.sort_values(by=self.key_x)\n",
|
||||
" y = df[[self.key_y]] #.sort_values(by=self.key_y)\n",
|
||||
@@ -296,11 +389,31 @@
|
||||
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
|
||||
"\n",
|
||||
" def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n",
|
||||
" '''\n",
|
||||
" Saves canvas to file.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" base_filename: string with two format codes (two strings),\n",
|
||||
" this string will be interpolated by...\n",
|
||||
" 1. the programming language\n",
|
||||
" 2. the country of origin.\n",
|
||||
" '''\n",
|
||||
" plt.figure(self.canvas)\n",
|
||||
" filename = base_filename % (self.language, self.country)\n",
|
||||
" plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n",
|
||||
"\n",
|
||||
"def show_model_stats(coef, intercept, y_test, y_pred, label):\n",
|
||||
" '''\n",
|
||||
" Displays model performance.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" coef: the coefficient of the model.\n",
|
||||
" intercept: the y-intercept of the model.\n",
|
||||
" y_test: true values to compare against model predictions.\n",
|
||||
" y_pred: prediction values from the model.\n",
|
||||
" \n",
|
||||
" label: string, to help identify which line (e.g. color).\n",
|
||||
" '''\n",
|
||||
" print('+----------------------+')\n",
|
||||
" print(label)\n",
|
||||
" print('coefficient = %0.2f' % coef)\n",
|
||||
@@ -315,6 +428,13 @@
|
||||
"\n",
|
||||
"# the higher a is, the steeper the line gets\n",
|
||||
"def log_base_a(x, a=1.07):\n",
|
||||
" '''\n",
|
||||
" Performs logarithmic transformation of value 'x' with base 'a'.\n",
|
||||
"\n",
|
||||
" inputs:\n",
|
||||
" x: numeric, the variable to be transformed.\n",
|
||||
" a: numeric, the new base.\n",
|
||||
" '''\n",
|
||||
" return np.log10(x)/np.log(a)"
|
||||
]
|
||||
},
|
||||
|
Reference in New Issue
Block a user