From cbd575697f4c4870202c6873381f7fbbd71016f9 Mon Sep 17 00:00:00 2001
From: scuti
Date: Sat, 19 Apr 2025 14:10:44 -0700
Subject: [PATCH] Squashed commit of the following:

commit e1691bb85b611c84ae9e4315523de1b79837ef2b
Author: scuti
Date:   Sat Apr 19 14:00:28 2025 -0700

    Created graph for job title and compensation

commit 50e00a42686f7135508ca08d1354a36012e839d7
Author: scuti
Date:   Sat Apr 19 06:38:16 2025 -0700

    Got visualization idea for annual compensation
---
 stackoverflow-survey.ipynb | 66 +++++++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb
index d94cb92..70af2cc 100644
--- a/stackoverflow-survey.ipynb
+++ b/stackoverflow-survey.ipynb
@@ -168,14 +168,70 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "e90cf119-c50d-468a-bc87-72dac41176ce",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "# print survey ans\n",
-    "employment_status = Counter(so_df[\"MainBranch\"])\n",
-    "print(employment_status)\n",
+    "# see how much money people are making\n",
     "\n",
-    "print(so_df[\"ConvertedCompYearly\"][])"
+    "def get_mean_by_category(df, category, key=\"ConvertedCompYearly\"):\n",
+    "    \"\"\"Return {category value: mean of `key`} for each distinct df[category].\n",
+    "\n",
+    "    Rows with a missing (NaN) category are averaged under a NaN key;\n",
+    "    an equality filter per unique value can never match NaN.\n",
+    "    \"\"\"\n",
+    "    grouped = df.groupby(category, sort=False, dropna=False)[key]\n",
+    "    return grouped.mean().to_dict()\n",
+    "\n",
+    "def show_me_the_money(df, saveto=None):\n",
+    "    \"\"\"Plot each respondent's yearly pay per job title over the title's mean.\n",
+    "\n",
+    "    df     -- rows with \"DevType\" and \"ConvertedCompYearly\" columns\n",
+    "    saveto -- optional image path; missing parent folders are created\n",
+    "    \"\"\"\n",
+    "    from pathlib import Path\n",
+    "\n",
+    "    key_x = \"ConvertedCompYearly\"\n",
+    "    key_y = \"DevType\"\n",
+    "\n",
+    "    means = get_mean_by_category(df, key_y)\n",
+    "    mean_df = pd.DataFrame(list(means.items()), columns=[key_y, key_x])\n",
+    "    mean_df = mean_df.dropna(subset=[key_y]).sort_values(by=key_x)\n",
+    "    # share one explicit order between both layers so every point\n",
+    "    # lands on the bar of its own title regardless of row order in df\n",
+    "    order = mean_df[key_y].tolist()\n",
+    "\n",
+    "    plt.figure(figsize=(14, 18))\n",
+    "    plt.axvline(x=1e5, color='red', linestyle='--', label=\"x = $100,000\")\n",
+    "    plt.axvline(x=1e6, color='lightgreen', linestyle='--', label=\"x = millionaire\")\n",
+    "    sb.barplot(x=key_x, y=key_y, data=mean_df, order=order,\n",
+    "               color='lavender', alpha=0.7, label=\"average compensation\")\n",
+    "    sb.stripplot(x=key_x, y=key_y, data=df, order=order,\n",
+    "                 size=3, jitter=True)\n",
+    "    plt.legend()  # the label= arguments above only show up in a legend\n",
+    "    if saveto is not None:\n",
+    "        Path(saveto).parent.mkdir(parents=True, exist_ok=True)\n",
+    "        plt.savefig(saveto, bbox_inches='tight')\n",
+    "\n",
+    "# print survey ans\n",
+    "#employment_status = Counter(so_df[\"MainBranch\"])\n",
+    "#print(employment_status)\n",
+    "\n",
+    "#employment_type = Counter(so_df[\"DevType\"])\n",
+    "#print(employment_type)\n",
+    "\n",
+    "key = \"ConvertedCompYearly\"\n",
+    "# answers = so_df[:-1][key].count()\n",
+    "# print(answers, \"people answered re: \", key)\n",
+    "df_no_na = so_df.dropna(subset=[key])\n",
+    "# leave the 15 highest earners out of the plot\n",
+    "indices = df_no_na[key].nlargest(15).index\n",
+    "\n",
+    "show_me_the_money(df_no_na.drop(indices), saveto=\"images/compensation-by-profession.png\")\n",
+    "# could also ask myself what portion of developers\n",
+    "# earn less than the mean compensation\n",
+    "# (what titles have high standard deviations in earnings)"
    ]
   },
   {