From cbd575697f4c4870202c6873381f7fbbd71016f9 Mon Sep 17 00:00:00 2001
From: scuti <scuti@tutamail.com>
Date: Sat, 19 Apr 2025 14:10:44 -0700
Subject: [PATCH] Squashed commit of the following:

commit e1691bb85b611c84ae9e4315523de1b79837ef2b
Author: scuti <scuti@tutamail.com>
Date:   Sat Apr 19 14:00:28 2025 -0700

    Created graph for job title and compensation

commit 50e00a42686f7135508ca08d1354a36012e839d7
Author: scuti <scuti@tutamail.com>
Date:   Sat Apr 19 06:38:16 2025 -0700

    Got visualization idea for annual compensation
---
 stackoverflow-survey.ipynb | 51 ++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb
index d94cb92..70af2cc 100644
--- a/stackoverflow-survey.ipynb
+++ b/stackoverflow-survey.ipynb
@@ -168,14 +168,55 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "e90cf119-c50d-468a-bc87-72dac41176ce",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "# print survey ans\n",
-    "employment_status = Counter(so_df[\"MainBranch\"])\n",
-    "print(employment_status)\n",
+    "# see how much money people are making\n",
     "\n",
-    "print(so_df[\"ConvertedCompYearly\"][])"
+    "def get_mean_by_category(df, category, key=\"ConvertedCompYearly\"):\n",
+    "    unique = df[category].unique()\n",
+    "    result = dict()\n",
+    "    for u in unique:\n",
+    "        mean = df[df[category] == u][key].mean()\n",
+    "        result[u] = mean\n",
+    "    return result\n",
+    "\n",
+    "def show_me_the_money(df, saveto=None):\n",
+    "    key_x = \"ConvertedCompYearly\"\n",
+    "    key_y = \"DevType\"\n",
+    "    \n",
+    "    means   = get_mean_by_category(df, key_y) \n",
+    "    mean_df = pd.DataFrame(means.items(), columns=[key_y, key_x])\n",
+    "\n",
+    "    plt.figure(figsize=(14,18)) \n",
+    "    plt.axvline(x=1e5, color='red', linestyle='--', label=\"x = $100,000\")\n",
+    "    plt.axvline(x=1e6, color='lightgreen', linestyle='--', label=\"x = millionaire\")\n",
+    "    sb.barplot(x=key_x, y=key_y, data=mean_df.sort_values(by=key_x), \\\n",
+    "               color='lavender', alpha=0.7, label=\"average compensation\")\n",
+    "    sb.stripplot(x=key_x, y=key_y, data=df, \\\n",
+    "                 size=3, jitter=True)\n",
+    "    if saveto is not None:\n",
+    "        plt.savefig(saveto, bbox_inches='tight')\n",
+    "    \n",
+    "# print survey answers\n",
+    "#employment_status = Counter(so_df[\"MainBranch\"])\n",
+    "#print(employment_status)\n",
+    "\n",
+    "#employment_type = Counter(so_df[\"DevType\"])\n",
+    "#print(employment_type)\n",
+    "\n",
+    "key = \"ConvertedCompYearly\"\n",
+    "#    answers = so_df[:-1][key].count()\n",
+    "#    print(answers, \"people answered re: \", key)\n",
+    "df_no_na = so_df.dropna(subset=[key])\n",
+    "indices  = df_no_na[key].nlargest(15).index\n",
+    "\n",
+    "show_me_the_money( df_no_na.drop(indices), saveto=\"images/compensation-by-profession.png\" )\n",
+    "# could also ask myself what portion of developers \n",
+    "# earn less than the mean compensation\n",
+    "# (what titles have high standard deviations in earnings)"
    ]
   },
   {