From 08eb095bf63759621fed215880056844e2c05f59 Mon Sep 17 00:00:00 2001
From: scuti <scuti@tutamail.com>
Date: Sun, 20 Apr 2025 20:32:27 -0700
Subject: [PATCH] Added chart for years of experience and earnings.

Can select developers by programming language.
Colorize dots by country or employment status.
---
 stackoverflow-survey.ipynb | 175 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 174 insertions(+), 1 deletion(-)

diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb
index cd96f03..94847a9 100644
--- a/stackoverflow-survey.ipynb
+++ b/stackoverflow-survey.ipynb
@@ -278,7 +278,180 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "acd193c3-eb73-498c-a8d4-c59c0eb5dcdb",
+   "id": "cdf21b1c-1316-422f-ad14-48150f80366c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# key   = \"DevType\"\n",
+    "# prof  = \"Developer, full-stack\"\n",
+    "\n",
+    "key   = \"MainBranch\"\n",
+    "prof = \"I am a developer by profession\"\n",
+    "col   = \"ConvertedCompYearly\"\n",
+    "\n",
+    "devs =  df_no_na[df_no_na[key] ==  prof ] \n",
+    "pd.set_option('display.float_format', '{:.2f}'.format)\n",
+    "devs.describe()[col]\n",
+    "\n",
+    "# who is reporting an income of only $1/yr?\n",
+    "# devs[devs[col] == 1.0]\n",
+    "\n",
+    "# who are the millionaires\n",
+    "# devs[devs[col] > 1e6]\n",
+    "\n",
+    "# who makes more than the mean\n",
+    "# devs[devs[col] > 76230.84]\n",
+    "\n",
+    "# who makes more than the median\n",
+    "# devs[devs[col] > 63316.00]\n",
+    "\n",
+    "# the ancient ones\n",
+    "so_df[so_df[\"YearsCodePro\"] == 'More than 50 years']\n",
+    "# should drop the 18-24 year old respondent: more than 50 years of professional coding is impossible at that age\n",
+    "# 55-64 years old\n",
+    "# 65 years or older"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e9b0c49-eac6-45e1-83f1-92813e734ef5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# draw count plot of developers based on age\n",
+    "\n",
+    "def visualize_devs(df, lang, key=\"Age\",):\n",
+    "    plt.figure()\n",
+    "    plt.xticks(rotation=45)\n",
+    "    # category order taken from the output of:\n",
+    "    # print(df[key].unique())\n",
+    "    order =  ['Under 18 years old', '18-24 years old',  \\\n",
+    "              '25-34 years old','35-44 years old',\\\n",
+    "              '45-54 years old', '55-64 years old',  \\\n",
+    "              '65 years or older', 'Prefer not to say']\n",
+    "    sb.countplot(x=key, data=df, order=order)\n",
+    "    title=\"Ages of %s Programmers\" % lang\n",
+    "    plt.title(title)\n",
+    "    filename= \"images/%s-of-%s-programmers.png\" % (key, lang)\n",
+    "    plt.savefig(filename, bbox_inches=\"tight\")\n",
+    "\n",
+    "def get_lang_devs(df, lang):\n",
+    "    col = \"LanguageHaveWorkedWith\"\n",
+    "    # substring match: will not work for single character languages (C, R)\n",
+    "    # and will conflate Java with JavaScript, Python with MicroPython\n",
+    "    return df[ df[col].str.contains(lang, na=False) ] \n",
+    "\n",
+    "def get_c_devs(df, lang=\"C\"):\n",
+    "    key = \"LanguageHaveWorkedWith\"\n",
+    "    cdevs = []\n",
+    "    for index, dev in df.iterrows():\n",
+    "        try:\n",
+    "            # split string into list\n",
+    "            langs_used = dev[key].split(';')\n",
+    "            if lang in langs_used:\n",
+    "                cdevs.append(dev)\n",
+    "        except AttributeError:\n",
+    "#            print(dev[key])\n",
+    "            pass\n",
+    "    return pd.DataFrame(cdevs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11a1b9fb-db48-4749-8d77-4241a99d7bad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "visualize_devs( get_c_devs(so_df) , \"C\")\n",
+    "\n",
+    "for lang in [\"Cobol\", \"Prolog\", \"Ada\", \"Python\"]:\n",
+    "    foo = get_lang_devs(so_df, lang)\n",
+    "    visualize_devs(foo, lang)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67b56700-5847-4af8-87ec-74249aa95749",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# focus on respondents who have given both their yearly compensation and years of professional coding\n",
+    "key  = \"ConvertedCompYearly\"\n",
+    "key2 = \"YearsCodePro\"\n",
+    "df = so_df.dropna(subset=[key, key2])\n",
+    "\n",
+    "criteria = {\"MainBranch\":\"I am a developer by profession\"}\n",
+    "\n",
+    "#print(df[\"Country\"].unique)\n",
+    "\n",
+    "# criteria[\"Country\"] = \"United States of America\"\n",
+    "for k in criteria:\n",
+    "    df = df[df[k] == criteria[k] ] \n",
+    "\n",
+    "jobs = None\n",
+    "# job titles expected for C developers\n",
+    "#jobs = [\"Developer, embedded applications or devices\", \n",
+    "#        \"Developer, game or graphics\",\n",
+    "#        \"Engineering manager\" , \n",
+    "#        \"Project manager\", \n",
+    "#        \"Product manager\"\n",
+    "#]\n",
+    "\n",
+    "# job titles expected for Python developers\n",
+    "#jobs = [\"Data scientist or machine learning specialist\",\n",
+    "#        \"Data or business analyst\",\n",
+    "#        \"Data engineer\",\n",
+    "#        \"DevOps specialist\",\n",
+    "#        \"Developer, QA or test\"\n",
+    "#]\n",
+    "\n",
+    "# keep rows where any column exactly matches one of the job titles (approach suggested by ChatGPT)\n",
+    "if jobs:\n",
+    "    df = df[df.isin(jobs).any(axis=1)]\n",
+    "\n",
+    "# tunable settings for the chart below\n",
+    "language = \"Cobol\"\n",
+    "legend = True\n",
+    "NUM_OF_TOO_RICH = 3\n",
+    "# alternative hue: \"Employment\"\n",
+    "hue = \"Country\"\n",
+    "\n",
+    "devs = None\n",
+    "if len(language) > 1:\n",
+    "    devs = get_lang_devs(df, language)\n",
+    "else:\n",
+    "    devs = get_c_devs(df, lang=language)\n",
+    "replacement_dict = {\n",
+    "    'Less than 1 year': '0.5',\n",
+    "    'More than 50 years': '51',\n",
+    "}\n",
+    "\n",
+    "# https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n",
+    "pd.options.mode.chained_assignment = None  # default='warn'\n",
+    "new_column = devs[key2].replace(replacement_dict)\n",
+    "devs[key2] = pd.to_numeric(new_column, errors='coerce')\n",
+    "# print( devs[key2].unique() )\n",
+    "\n",
+    "indices  = devs[key].nlargest(NUM_OF_TOO_RICH).index\n",
+    "devs = devs.drop(indices)\n",
+    "print( len (devs) )\n",
+    "\n",
+    "plt.figure()\n",
+    "plt.xticks(rotation=90)\n",
+    "sb.scatterplot(data=devs, x=key2, y=key, hue=hue, legend=legend)\n",
+    "plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
+    "plt.title(\"Annual Salary of %s Developers Over Years of Experience\" %language)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b954a811-e401-48dc-9ba4-263a5f2cf5c5",
    "metadata": {},
    "outputs": [],
    "source": []