|
|
|
@@ -31,7 +31,7 @@
|
|
|
|
|
"import matplotlib.pyplot as plt\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# avoid burning my eyes @ night\n",
|
|
|
|
|
"plt.style.use(\"dark_background\")"
|
|
|
|
|
"plt.style.use('dark_background')"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@@ -51,14 +51,14 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"FILE = \"data/survey_results_public.csv\"\n",
|
|
|
|
|
"FILE = 'data/survey_results_public.csv'\n",
|
|
|
|
|
"so_df = pd.read_csv(FILE)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"print(so_df.keys())\n",
|
|
|
|
|
"so_df.describe()\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# check for people who aren't paying attention\n",
|
|
|
|
|
"count_not_apple = (so_df[\"Check\"] != \"Apples\").sum()\n",
|
|
|
|
|
"count_not_apple = (so_df['Check'] != 'Apples').sum()\n",
|
|
|
|
|
"print(count_not_apple)\n",
|
|
|
|
|
"print(so_df.shape)\n",
|
|
|
|
|
"assert(count_not_apple == 0)\n",
|
|
|
|
@@ -74,7 +74,7 @@
|
|
|
|
|
"source": [
|
|
|
|
|
"# draw count plot of developers based on age\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def visualize_devs(df, lang, key=\"Age\",):\n",
|
|
|
|
|
"def visualize_devs(df, lang, key='Age'):\n",
|
|
|
|
|
" plt.figure()\n",
|
|
|
|
|
" plt.xticks(rotation=45)\n",
|
|
|
|
|
" # from:\n",
|
|
|
|
@@ -84,19 +84,21 @@
|
|
|
|
|
" '45-54 years old', '55-64 years old', \\\n",
|
|
|
|
|
" '65 years or older', 'Prefer not to say']\n",
|
|
|
|
|
" sb.countplot(x=key, data=df, order=order)\n",
|
|
|
|
|
" title=\"Ages of %s Programmers\" % lang\n",
|
|
|
|
|
" title='Ages of %s Programmers' % lang\n",
|
|
|
|
|
" plt.title(title)\n",
|
|
|
|
|
" filename= \"images/%s-of-%s-programmers.png\" % (key, lang)\n",
|
|
|
|
|
" plt.savefig(filename, bbox_inches=\"tight\")\n",
|
|
|
|
|
" filename= 'images/%s-of-%s-programmers.png' % (key, lang)\n",
|
|
|
|
|
" plt.savefig(filename, bbox_inches='tight')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def get_lang_devs(df, lang):\n",
|
|
|
|
|
" col = \"LanguageHaveWorkedWith\"\n",
|
|
|
|
|
" col = 'LanguageHaveWorkedWith'\n",
|
|
|
|
|
" # will not work for single character languages (C, R)\n",
|
|
|
|
|
" # will mangle Java and JavaScript, Python and MicroPython\n",
|
|
|
|
|
" return df[ df[col].str.contains(lang, na=False) ] \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def get_c_devs(df, lang=\"C\"):\n",
|
|
|
|
|
" key = \"LanguageHaveWorkedWith\"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def get_c_devs(df, lang='C'):\n",
|
|
|
|
|
" key = 'LanguageHaveWorkedWith'\n",
|
|
|
|
|
" cdevs = []\n",
|
|
|
|
|
" for index, dev in df.iterrows():\n",
|
|
|
|
|
" try:\n",
|
|
|
|
@@ -117,9 +119,9 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"visualize_devs( get_c_devs(so_df) , \"C\")\n",
|
|
|
|
|
"visualize_devs( get_c_devs(so_df) , 'C')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"for lang in [\"Cobol\", \"Prolog\", \"Ada\", \"Python\"]:\n",
|
|
|
|
|
"for lang in ['Cobol', 'Prolog', 'Ada', 'Python']:\n",
|
|
|
|
|
" foo = get_lang_devs(so_df, lang)\n",
|
|
|
|
|
" visualize_devs(foo, lang)"
|
|
|
|
|
]
|
|
|
|
@@ -152,23 +154,23 @@
|
|
|
|
|
"class Foo:\n",
|
|
|
|
|
" def __init__(self, dataset, language, jobs=None, \n",
|
|
|
|
|
" n_rich_outliers=0, n_poor_outliers=0, \n",
|
|
|
|
|
" country=\"United States of America\"):\n",
|
|
|
|
|
" country='United States of America'):\n",
|
|
|
|
|
" self.devs = None\n",
|
|
|
|
|
" self.canvas = None\n",
|
|
|
|
|
" self.language = language\n",
|
|
|
|
|
" self.country = country\n",
|
|
|
|
|
" # focus on people who have given ...\n",
|
|
|
|
|
" key_x = \"YearsCodePro\"\n",
|
|
|
|
|
" key_y = \"ConvertedCompYearly\"\n",
|
|
|
|
|
" key_x = 'YearsCodePro'\n",
|
|
|
|
|
" key_y = 'ConvertedCompYearly'\n",
|
|
|
|
|
" df = dataset.dropna(subset=[key_x, key_y])\n",
|
|
|
|
|
" self.key_x = key_x\n",
|
|
|
|
|
" self.key_y = key_y\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" qualifiers = {\n",
|
|
|
|
|
" \"MainBranch\":\"I am a developer by profession\",\n",
|
|
|
|
|
" 'MainBranch': 'I am a developer by profession',\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
" if country:\n",
|
|
|
|
|
" qualifiers[\"Country\"] = country\n",
|
|
|
|
|
" qualifiers['Country'] = country\n",
|
|
|
|
|
" for k in qualifiers:\n",
|
|
|
|
|
" df = df[df[k] == qualifiers[k] ] \n",
|
|
|
|
|
"\n",
|
|
|
|
@@ -177,7 +179,7 @@
|
|
|
|
|
" df = df[df.isin(jobs).any(axis=1)]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" devs = None\n",
|
|
|
|
|
" if len(language) == 1 or language in [\"Python\", \"Java\"]:\n",
|
|
|
|
|
" if len(language) == 1 or language in ['Python', 'Java']:\n",
|
|
|
|
|
" devs = get_c_devs(df, lang=language)\n",
|
|
|
|
|
" else:\n",
|
|
|
|
|
" devs = get_lang_devs(df, language)\n",
|
|
|
|
@@ -200,7 +202,7 @@
|
|
|
|
|
" self.devs = devs.drop(indices)\n",
|
|
|
|
|
" del devs, new_column\n",
|
|
|
|
|
" \n",
|
|
|
|
|
" def visualize(self, hue=\"Country\", \n",
|
|
|
|
|
" def visualize(self, hue='Country', \n",
|
|
|
|
|
" palette=sb.color_palette() ): \n",
|
|
|
|
|
" self.canvas = plt.figure()\n",
|
|
|
|
|
" key_x = self.key_x\n",
|
|
|
|
@@ -208,9 +210,9 @@
|
|
|
|
|
"\n",
|
|
|
|
|
" sb.scatterplot(data=self.devs, x=key_x, y=key_y, hue=hue, palette=palette)\n",
|
|
|
|
|
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
|
|
|
|
|
" title = \"Annual Compensation of %s Programmers Over Years of Experience\" % self.language\\\n",
|
|
|
|
|
" + \"\\nsample size=%i\" % len (self.devs)\\\n",
|
|
|
|
|
" + \"\\ncountry=%s\" % self.country\n",
|
|
|
|
|
" title = 'Annual Compensation of %s Programmers Over Years of Experience' % self.language\\\n",
|
|
|
|
|
" + '\\nsample size=%i' % len (self.devs)\\\n",
|
|
|
|
|
" + '\\ncountry=%s' % self.country\n",
|
|
|
|
|
" plt.title(title)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def run_regression(self, model=LinearRegression(), split=train_test_split, \n",
|
|
|
|
@@ -232,31 +234,32 @@
|
|
|
|
|
" \n",
|
|
|
|
|
" m = model.coef_[0][0]\n",
|
|
|
|
|
" b = model.intercept_[0]\n",
|
|
|
|
|
" print(\"+----------------------+\")\n",
|
|
|
|
|
" print(\"%s regression line for %s\" % (line_color, self.language))\n",
|
|
|
|
|
" print(\"coefficient = %0.2f\" % m)\n",
|
|
|
|
|
" print('+----------------------+')\n",
|
|
|
|
|
" print('%s regression line for %s' % (line_color, self.language))\n",
|
|
|
|
|
" print('coefficient = %0.2f' % m)\n",
|
|
|
|
|
" print('intercept = %0.2f' % b)\n",
|
|
|
|
|
" rmse = root_mean_squared_error(y_test, y_pred)\n",
|
|
|
|
|
" print(\"rmse = %0.2f\" % rmse)\n",
|
|
|
|
|
" print('rmse = %0.2f' % rmse)\n",
|
|
|
|
|
" r2 = r2_score(y_test, y_pred)\n",
|
|
|
|
|
" print(\"r2 score = %0.2f\" % r2)\n",
|
|
|
|
|
" print(\"sample predictions:\")\n",
|
|
|
|
|
" print('r2 score = %0.2f' % r2)\n",
|
|
|
|
|
" print('sample predictions:')\n",
|
|
|
|
|
" print(y_pred[3:6])\n",
|
|
|
|
|
" print(\"+----------------------+\")\n",
|
|
|
|
|
" print('+----------------------+')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" plt.figure(self.canvas)\n",
|
|
|
|
|
" plt.plot(X_test, y_pred, color=line_color, label='Regression Line')\n",
|
|
|
|
|
" plt.axhline(y=b, color=\"purple\", linestyle='--', \n",
|
|
|
|
|
" label=\"b=%0.2f\" % b, zorder=-1 )\n",
|
|
|
|
|
" plt.axhline(y=b, color='purple', linestyle='--', \n",
|
|
|
|
|
" label='b=%0.2f' % b, zorder=-1 )\n",
|
|
|
|
|
" plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
|
|
|
|
|
" del y_pred, model\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" def export_image(self, base_filename = \"images/programmers-%s-%s.png\"):\n",
|
|
|
|
|
" def export_image(self, base_filename = 'images/programmers-%s-%s.png'):\n",
|
|
|
|
|
" plt.figure(self.canvas)\n",
|
|
|
|
|
" filename = base_filename % (self.language, self.country)\n",
|
|
|
|
|
" plt.savefig(filename.replace(' ', '-'), bbox_inches='tight')\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# the higher a is, the steeper the line gets\n",
|
|
|
|
|
"def log_base_a(x, a=1.07):\n",
|
|
|
|
|
" return np.log10(x)/np.log(a)"
|
|
|
|
@@ -281,15 +284,15 @@
|
|
|
|
|
"source": [
|
|
|
|
|
"\n",
|
|
|
|
|
"# expected python jobs\n",
|
|
|
|
|
"pyjobs = [\"Data scientist or machine learning specialist\",\n",
|
|
|
|
|
" \"Data or business analyst\",\n",
|
|
|
|
|
" \"Data engineer\",\n",
|
|
|
|
|
"pyjobs = ['Data scientist or machine learning specialist',\n",
|
|
|
|
|
" 'Data or business analyst',\n",
|
|
|
|
|
" 'Data engineer',\n",
|
|
|
|
|
"# \"DevOps specialist\",\n",
|
|
|
|
|
"# \"Developer, QA or test\"\n",
|
|
|
|
|
"]\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"python = Foo(so_df, \"Python\", jobs=pyjobs, n_rich_outliers=12, n_poor_outliers=2)\n",
|
|
|
|
|
"python.visualize(hue=\"DevType\", palette=[\"#dbdb32\", \"#34bf65\", \"#ac70e0\"])\n",
|
|
|
|
|
"python = Foo(so_df, 'Python', jobs=pyjobs, n_rich_outliers=12, n_poor_outliers=2)\n",
|
|
|
|
|
"python.visualize(hue='DevType', palette=['#dbdb32', '#34bf65', '#ac70e0'])\n",
|
|
|
|
|
"python.run_regression()\n",
|
|
|
|
|
"python.run_regression( x_transform=log_base_a, change_base=1.20, \n",
|
|
|
|
|
" x_shift=0, y_shift=-1.5e4, line_color='cyan', random=888)\n",
|
|
|
|
@@ -337,17 +340,17 @@
|
|
|
|
|
"source": [
|
|
|
|
|
"# expected C jobs\n",
|
|
|
|
|
"cjobs = [\n",
|
|
|
|
|
" \"Developer, embedded applications or devices\", \n",
|
|
|
|
|
" \"Developer, game or graphics\",\n",
|
|
|
|
|
" \"Hardware Engineer\" ,\n",
|
|
|
|
|
" 'Developer, embedded applications or devices', \n",
|
|
|
|
|
" 'Developer, game or graphics',\n",
|
|
|
|
|
" 'Hardware Engineer',\n",
|
|
|
|
|
" # \"Project manager\", \n",
|
|
|
|
|
" # \"Product manager\"\n",
|
|
|
|
|
"]\n",
|
|
|
|
|
"c = Foo(so_df, \"C\", jobs=cjobs, n_rich_outliers=30, n_poor_outliers=2)\n",
|
|
|
|
|
"c.visualize(hue=\"DevType\", palette=[\"#57e6da\",\"#d9e352\",\"#cc622d\"] ) \n",
|
|
|
|
|
"c = Foo(so_df, 'C', jobs=cjobs, n_rich_outliers=30, n_poor_outliers=2)\n",
|
|
|
|
|
"c.visualize(hue='DevType', palette=['#57e6da','#d9e352','#cc622d'] ) \n",
|
|
|
|
|
"c.run_regression()\n",
|
|
|
|
|
"c.run_regression(x_transform=log_base_a, change_base=1.3, \n",
|
|
|
|
|
" x_shift=2, y_shift=-5000, line_color=\"magenta\", random=555)\n",
|
|
|
|
|
" x_shift=2, y_shift=-5000, line_color='magenta', random=555)\n",
|
|
|
|
|
"c.export_image()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|