Write DOCSTRINGS for functions.

Also corrected typo in label.
This commit is contained in:
2025-04-28 20:51:39 -07:00
parent f49283a7cc
commit 2af2414219

View File

@@ -75,6 +75,16 @@
"# draw count plot of developers based on age\n",
"\n",
"def visualize_devs(df, lang, key='Age'):\n",
" '''\n",
" Draws count plot of developers based on attributes.\n",
"\n",
" inputs:\n",
" df: a DataFrame, the subset of the data set.\n",
" lang: string, the programming language, used for labeling.\n",
" key: string, the attribute to count (default 'Age').\n",
" outputs:\n",
" no return values, will draw and save a graphic.\n",
" '''\n",
" plt.figure()\n",
" plt.xticks(rotation=45)\n",
" # from:\n",
@@ -91,6 +101,17 @@
"\n",
"\n",
"def get_lang_devs(df, lang):\n",
" '''\n",
" Returns a DataFrame, subset of the data set, of developers that have\n",
" worked with a specified programming language.\n",
"\n",
" inputs:\n",
" df: a DataFrame, can be the entire published data set.\n",
" lang: a string, the programming language.\n",
" outputs:\n",
" a DataFrame of developers that have worked with `lang` programming \n",
" language.\n",
" '''\n",
" col = 'LanguageHaveWorkedWith'\n",
" # will not work for single character languages (C, R)\n",
" # will mangle Java and JavaScript, Python and MicroPython\n",
@@ -98,6 +119,19 @@
"\n",
"\n",
"def get_c_devs(df, lang='C'):\n",
" '''\n",
" Returns a DataFrame, subset of the data set, of developers that have\n",
" worked with a specified programming language.\n",
" Similar to get_lang_devs() but adapted for languages named by a single\n",
" letter, or names like 'Java' which is contained in 'JavaScript'.\n",
"\n",
" inputs:\n",
" df: a DataFrame, can be the entire published data set.\n",
" lang: a string, the programming language.\n",
" outputs:\n",
" a DataFrame of developers that have worked with `lang` programming \n",
" language.\n",
" '''\n",
" key = 'LanguageHaveWorkedWith'\n",
" cdevs = []\n",
" for index, dev in df.iterrows():\n",
@@ -156,6 +190,19 @@
" def __init__(self, dataset, language, jobs=None, \n",
" n_rich_outliers=0, n_poor_outliers=0, \n",
" country='United States of America'):\n",
" '''\n",
" inputs:\n",
" dataset: A DataFrame, can be the full data set.\n",
" language: string, the programming language \n",
" a developer has worked with.\n",
" jobs: list of strings, job positions \n",
" - typically domains where the language is dominant.\n",
" n_rich_outliers: integer, removes samples from the \n",
" upper limit of the y-axis.\n",
" n_poor_outliers: integer, removes samples from the \n",
" lower limit of the y-axis.\n",
" country: string, specifies the country of origin.\n",
" '''\n",
" self.devs = None\n",
" self.canvas = None\n",
" self.language = language\n",
@@ -204,7 +251,16 @@
" del devs, new_column\n",
" \n",
" def visualize(self, hue='Country', \n",
" palette=sb.color_palette() ): \n",
" palette=sb.color_palette() ):\n",
" '''\n",
" Draw scatter plot of samples included in self.devs.\n",
"\n",
" inputs:\n",
" hue: string, colorize dots by a given key.\n",
" palette: list of strings (color codes)\n",
" or string (matplotlib predefined palettes),\n",
" specifies the colors to use when coloring dots.\n",
" '''\n",
" self.canvas = plt.figure()\n",
" key_x = self.key_x\n",
" key_y = self.key_y\n",
@@ -220,6 +276,29 @@
" x_shift=0, y_shift=0,\n",
" random=333, risky=0,\n",
" color='red', name='Regression Line' ):\n",
" '''\n",
" Runs linear regression and draws a straight line.\n",
"\n",
" inputs:\n",
" x_transform: function, function to tune the independent variable.\n",
" change_base: float or integer, specifies base \n",
" for logarithmic function, not used if x_transform is None.\n",
" x_shift: integer, for tuning, shifts the position \n",
" of the line on the x-axis.\n",
" y_shift: integer, for tuning, shifts the position \n",
" of the line on the y-axis.\n",
" random: integer, random seed for train_test_split; \n",
" change to test generalization.\n",
" risky: integer ranging from 0 to 2,\n",
" 0 = does nothing (default),\n",
" 1 = sorts the independent variable,\n",
" 2 = sorts the dependent variable,\n",
" performs unrecommended operation to sort data,\n",
" risking the model training on the order of values.\n",
" May draw nice lines that generalize across random states.\n",
" color: string, color of the regression line.\n",
" name: string, label of regression line on the legend.\n",
" '''\n",
" df = self.devs # .sort_values(by = self.key2)\n",
" X = df[[self.key_x]]\n",
" y = df[[self.key_y]]\n",
@@ -251,7 +330,7 @@
" \n",
" m = model.coef_[0][0]\n",
" b = model.intercept_[0]\n",
" label = '%s log regression line for %s' % (color, self.language)\n",
" label = '%s regression line for %s' % (color, self.language)\n",
" show_model_stats(m, b, y_test, y_pred, label)\n",
"\n",
" plt.figure(self.canvas)\n",
@@ -262,6 +341,20 @@
" del y_pred, model, X, y\n",
"\n",
" def run_log_regression(self, color='pink', nodraw=True):\n",
" '''\n",
" Runs logarithmic regression and draws a line that contours \n",
" at the point of diminishing returns.\n",
"\n",
" Logarithmic regression provides a better fit for the data;\n",
" however, it is not part of the course.\n",
"\n",
" Can illustrate an interesting relationship between the\n",
" \"default\" linear model and a tuned linear model.\n",
"\n",
" inputs:\n",
" color: color of the regression line.\n",
" nodraw: whether or not to draw the line.\n",
" '''\n",
" df = self.devs\n",
" X = df[[self.key_x]] #.sort_values(by=self.key_x)\n",
" y = df[[self.key_y]] #.sort_values(by=self.key_y)\n",
@@ -296,11 +389,31 @@
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
"\n",
" def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n",
" '''\n",
" Saves canvas to file.\n",
"\n",
" inputs:\n",
" base_filename: string with two format codes (two strings),\n",
" this string will be interpolated by...\n",
" 1. the programming language\n",
" 2. the country of origin.\n",
" '''\n",
" plt.figure(self.canvas)\n",
" filename = base_filename % (self.language, self.country)\n",
" plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n",
"\n",
"def show_model_stats(coef, intercept, y_test, y_pred, label):\n",
" '''\n",
" Displays model performance.\n",
"\n",
" inputs:\n",
" coef: the coefficient of the model.\n",
" intercept: the y-intercept of the model.\n",
" y_test: true values to compare against model predictions.\n",
" y_pred: prediction values from the model.\n",
" \n",
" label: string, to help identify which line (e.g. color).\n",
" '''\n",
" print('+----------------------+')\n",
" print(label)\n",
" print('coefficient = %0.2f' % coef)\n",
@@ -315,6 +428,13 @@
"\n",
"# the higher a is, the steeper the line gets\n",
"def log_base_a(x, a=1.07):\n",
" '''\n",
" Performs logarithmic transformation of value 'x' with base 'a'.\n",
" Note: computes log10(x) / ln(a), i.e. log_a(x) scaled by log10(e).\n",
"\n",
" inputs:\n",
" x: numeric, the variable to be transformed.\n",
" a: numeric, the new base.\n",
" '''\n",
" return np.log10(x)/np.log(a)"
]
},