From 08eb095bf63759621fed215880056844e2c05f59 Mon Sep 17 00:00:00 2001
From: scuti
Date: Sun, 20 Apr 2025 20:32:27 -0700
Subject: [PATCH] Added chart for years of experience and earnings. Can select
 developers by programming language. Colorize dots by country, employment
 status.

---
 stackoverflow-survey.ipynb | 182 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 181 insertions(+), 1 deletion(-)

diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb
index cd96f03..94847a9 100644
--- a/stackoverflow-survey.ipynb
+++ b/stackoverflow-survey.ipynb
@@ -278,7 +278,187 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "acd193c3-eb73-498c-a8d4-c59c0eb5dcdb",
+   "id": "cdf21b1c-1316-422f-ad14-48150f80366c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# key = \"DevType\"\n",
+    "# prof = \"Developer, full-stack\"\n",
+    "\n",
+    "key = \"MainBranch\"\n",
+    "prof = \"I am a developer by profession\"\n",
+    "col = \"ConvertedCompYearly\"\n",
+    "\n",
+    "devs = df_no_na[df_no_na[key] == prof]\n",
+    "pd.set_option('display.float_format', '{:.2f}'.format)\n",
+    "# a cell only displays its last expression, so print this summary\n",
+    "print(devs[col].describe())\n",
+    "\n",
+    "# who reports earning $1/yr?\n",
+    "# devs[devs[col] == 1.0]\n",
+    "\n",
+    "# who are the millionaires\n",
+    "# devs[devs[col] > 1e6]\n",
+    "\n",
+    "# who make more than the mean\n",
+    "# devs[devs[col] > 76230.84]\n",
+    "\n",
+    "# who make more than the median\n",
+    "# devs[devs[col] > 63316.00]\n",
+    "\n",
+    "# respondents answering 'More than 50 years' for YearsCodePro\n",
+    "so_df[so_df[\"YearsCodePro\"] == 'More than 50 years']\n",
+    "# should drop the 18-24 year old, whose answer cannot be accurate\n",
+    "# believable age groups for this answer:\n",
+    "# 55-64 years old\n",
+    "# 65 years or older"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e9b0c49-eac6-45e1-83f1-92813e734ef5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# count plots of developers grouped by a survey column (age by default)\n",
+    "\n",
+    "import os\n",
+    "import re\n",
+    "\n",
+    "LANG_COL = \"LanguageHaveWorkedWith\"\n",
+    "\n",
+    "def visualize_devs(df, lang, key=\"Age\"):\n",
+    "    \"\"\"Draw and save a count plot of df[key] for developers using lang.\n",
+    "\n",
+    "    The figure is written to images/KEY-of-LANG-programmers.png.\n",
+    "    \"\"\"\n",
+    "    plt.figure()\n",
+    "    plt.xticks(rotation=45)\n",
+    "    order = None\n",
+    "    title = \"%s of %s Programmers\" % (key, lang)\n",
+    "    if key == \"Age\":\n",
+    "        # from: print(df[key].unique())\n",
+    "        order = ['Under 18 years old', '18-24 years old',\n",
+    "                 '25-34 years old', '35-44 years old',\n",
+    "                 '45-54 years old', '55-64 years old',\n",
+    "                 '65 years or older', 'Prefer not to say']\n",
+    "        title = \"Ages of %s Programmers\" % lang\n",
+    "    sb.countplot(x=key, data=df, order=order)\n",
+    "    plt.title(title)\n",
+    "    # savefig does not create missing directories\n",
+    "    os.makedirs(\"images\", exist_ok=True)\n",
+    "    filename = \"images/%s-of-%s-programmers.png\" % (key, lang)\n",
+    "    plt.savefig(filename, bbox_inches=\"tight\")\n",
+    "\n",
+    "def get_lang_devs(df, lang):\n",
+    "    \"\"\"Return the rows of df whose language list contains exactly lang.\n",
+    "\n",
+    "    The column holds ';'-separated names and lang must equal one whole\n",
+    "    entry, so 'C' does not match 'C++' and 'Java' does not match\n",
+    "    'JavaScript'. Rows with a missing answer are left out.\n",
+    "    \"\"\"\n",
+    "    # re.escape keeps names such as 'C++' from being read as regex syntax\n",
+    "    pattern = \"(?:^|;)\" + re.escape(lang) + \"(?:;|$)\"\n",
+    "    return df[df[LANG_COL].str.contains(pattern, na=False)]\n",
+    "\n",
+    "def get_c_devs(df, lang=\"C\"):\n",
+    "    \"\"\"Return the rows of df that list lang (default 'C') as a language.\n",
+    "\n",
+    "    Kept for existing callers; it delegates to get_lang_devs.\n",
+    "    \"\"\"\n",
+    "    return get_lang_devs(df, lang)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11a1b9fb-db48-4749-8d77-4241a99d7bad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for lang in [\"C\", \"Cobol\", \"Prolog\", \"Ada\", \"Python\"]:\n",
+    "    visualize_devs(get_lang_devs(so_df, lang), lang)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67b56700-5847-4af8-87ec-74249aa95749",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# focus on people who have given both compensation and years of experience\n",
+    "key = \"ConvertedCompYearly\"\n",
+    "key2 = \"YearsCodePro\"\n",
+    "df = so_df.dropna(subset=[key, key2])\n",
+    "\n",
+    "criteria = {\"MainBranch\": \"I am a developer by profession\"}\n",
+    "\n",
+    "# print(df[\"Country\"].unique())\n",
+    "\n",
+    "# criteria[\"Country\"] = \"United States of America\"\n",
+    "for k, wanted in criteria.items():\n",
+    "    df = df[df[k] == wanted]\n",
+    "\n",
+    "jobs = None\n",
+    "# expected C jobs\n",
+    "#jobs = [\"Developer, embedded applications or devices\", \n",
+    "# \"Developer, game or graphics\",\n",
+    "# \"Engineering manager\" , \n",
+    "# \"Project manager\", \n",
+    "# \"Product manager\"\n",
+    "#]\n",
+    "\n",
+    "# expected python jobs\n",
+    "#jobs = [\"Data scientist or machine learning specialist\",\n",
+    "# \"Data or business analyst\",\n",
+    "# \"Data engineer\",\n",
+    "# \"DevOps specialist\",\n",
+    "# \"Developer, QA or test\"\n",
+    "#]\n",
+    "\n",
+    "# optional job filter; DevType is assumed to hold one job title per respondent\n",
+    "if jobs:\n",
+    "    df = df[df[\"DevType\"].isin(jobs)]\n",
+    "\n",
+    "# play with these\n",
+    "language = \"Cobol\"\n",
+    "legend = True\n",
+    "NUM_OF_TOO_RICH = 3\n",
+    "# \"Employment\"\n",
+    "hue = \"Country\"\n",
+    "\n",
+    "# copy so the numeric conversion below edits its own frame, not a slice of df\n",
+    "devs = get_lang_devs(df, language).copy()\n",
+    "replacement_dict = {\n",
+    "    'Less than 1 year': '0.5',\n",
+    "    'More than 50 years': '51',\n",
+    "}\n",
+    "years = devs[key2].replace(replacement_dict)\n",
+    "devs[key2] = pd.to_numeric(years, errors='coerce')\n",
+    "# print( devs[key2].unique() )\n",
+    "\n",
+    "# drop the NUM_OF_TOO_RICH highest earners before plotting\n",
+    "indices = devs[key].nlargest(NUM_OF_TOO_RICH).index\n",
+    "devs = devs.drop(indices)\n",
+    "print(len(devs))\n",
+    "\n",
+    "plt.figure()\n",
+    "plt.xticks(rotation=90)\n",
+    "sb.scatterplot(data=devs, x=key2, y=key, hue=hue, legend=legend)\n",
+    "if legend:\n",
+    "    # without a drawn legend there is nothing to reposition\n",
+    "    plt.legend(loc='lower center', bbox_to_anchor=(1.5, 0))\n",
+    "plt.title(\"Annual Salary of %s Developers Over Years of Experience\" % language)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b954a811-e401-48dc-9ba4-263a5f2cf5c5",
    "metadata": {},
    "outputs": [],
    "source": []