From f073019538bb1fbc75de0fae718a4bd18d33df99 Mon Sep 17 00:00:00 2001
From: scuti
Date: Sun, 20 Apr 2025 22:00:57 -0700
Subject: [PATCH] Created function to generate chart of salary over years of exp.

---
 stackoverflow-survey.ipynb | 129 +++++++++++++++++++++----------------
 1 file changed, 73 insertions(+), 56 deletions(-)

diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb
index 94847a9..ee85569 100644
--- a/stackoverflow-survey.ipynb
+++ b/stackoverflow-survey.ipynb
@@ -380,72 +380,89 @@
    "outputs": [],
    "source": [
     "\n",
-    "# focus on people who have given ...\n",
-    "key = \"ConvertedCompYearly\"\n",
-    "key2 = \"YearsCodePro\"\n",
-    "df = so_df.dropna(subset=[key, key2])\n",
+    "# foo (placeholder name): scatter-plot ConvertedCompYearly over YearsCodePro for the rows\n",
+    "# that get_lang_devs/get_c_devs select for `language`; drops the n_rich_outliers highest\n",
+    "# salaries and, when n_lowest > 0, marks the n_lowest-th smallest plotted salary with a line.\n",
+    "def foo(dataset, language, jobs=None, n_rich_outliers=0, n_lowest=0, hue=\"Country\", country=\"United States of America\"):\n",
+    "    # keep only rows that have both a salary and a years-of-experience value\n",
+    "    key = \"ConvertedCompYearly\"\n",
+    "    key2 = \"YearsCodePro\"\n",
+    "    df = dataset.dropna(subset=[key, key2])\n",
+    " \n",
+    "    criteria = {\"MainBranch\":\"I am a developer by profession\"}\n",
+    " \n",
+    "    #print(df[\"Country\"].unique)\n",
+    "    if country:\n",
+    "        criteria[\"Country\"] = country\n",
+    "    for k in criteria:\n",
+    "        df = df[df[k] == criteria[k] ] \n",
+    " \n",
+    "    # keep rows where any column equals one of the requested job titles\n",
+    "    if jobs:\n",
+    "        df = df[df.isin(jobs).any(axis=1)]\n",
+    " \n",
+    "    devs = None\n",
+    "    if len(language) > 1:\n",
+    "        devs = get_lang_devs(df, language)\n",
+    "    else:\n",
+    "        devs = get_c_devs(df, lang=language)\n",
+    "    replacement_dict = {\n",
+    "        'Less than 1 year': '0.5',\n",
+    "        'More than 50 years': '51',\n",
+    "    }\n",
+    " \n",
+    "    # https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n",
+    "    pd.options.mode.chained_assignment = None # default='warn'\n",
+    "    new_column = devs[key2].replace(replacement_dict)\n",
+    "    devs[key2] = pd.to_numeric(new_column, errors='coerce')\n",
+    "    # print( devs[key2].unique() )\n",
+    " \n",
+    "    indices = devs[key].nlargest(n_rich_outliers).index\n",
+    "    devs = devs.drop(indices)\n",
     "\n",
-    "criteria = {\"MainBranch\":\"I am a developer by profession\"}\n",
+    "    plt.figure()\n",
+    " \n",
+    "    if n_lowest > 0 and not devs.empty:\n",
+    "        # Mark the n_lowest-th smallest salary among the plotted rows;\n",
+    "        # iloc[-1] picks the last (largest) of the n smallest values.\n",
+    "        # Use devs, not df: df is the frame before the language filter and outlier trimming.\n",
+    "        lowest_nth = devs[key].nsmallest(n_lowest).iloc[-1]\n",
+    "        # Draw a dashed red horizontal line at that salary;\n",
+    "        # the legend label shows its value with two decimals.\n",
+    "        plt.axhline(y=lowest_nth, color='r', linestyle='--', label=\"y=%0.2f\" % lowest_nth )\n",
     "\n",
-    "#print(df[\"Country\"].unique)\n",
+    "# plt.xticks(rotation=90)\n",
+    "    sb.scatterplot(data=devs, x=key2, y=key, hue=hue)\n",
+    "    plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
+    "    title = \"Annual Salary of %s Developers Over Years of Experience\" %language\\\n",
+    "        + \"\\nsample size=%i\" % len (devs)\\\n",
+    "        + \"\\ncountry=%s\" %country\n",
+    "    plt.title(title)\n",
     "\n",
-    "# criteria[\"Country\"] = \"United States of America\"\n",
-    "for k in criteria:\n",
-    "    df = df[df[k] == criteria[k] ] \n",
-    "\n",
-    "jobs = None\n",
     "# expected C jobs\n",
-    "#jobs = [\"Developer, embedded applications or devices\", \n",
-    "# \"Developer, game or graphics\",\n",
-    "# \"Engineering manager\" , \n",
-    "# \"Project manager\", \n",
-    "# \"Product manager\"\n",
-    "#]\n",
+    "cjobs = [\"Developer, embedded applications or devices\", \n",
+    "         \"Developer, game or graphics\",\n",
+    "         \"Hardware Engineer\" ,\n",
+    "         # \"Project manager\", \n",
+    "         # \"Product manager\"\n",
+    "]\n",
     "\n",
     "# expected python jobs\n",
-    "#jobs = [\"Data scientist or machine learning specialist\",\n",
-    "# \"Data or business analyst\",\n",
-    "# \"Data engineer\",\n",
+    "pyjobs = [\"Data scientist or machine learning specialist\",\n",
+    "          \"Data or business analyst\",\n",
+    "          \"Data engineer\",\n",
     "# \"DevOps specialist\",\n",
     "# \"Developer, QA or test\"\n",
-    "#]\n",
+    "]\n",
     "\n",
-    "# chatgpt tells me about filtering with multiple strings\n",
-    "if jobs:\n",
-    "    df = df[df.isin(jobs).any(axis=1)]\n",
+    "jsjobs = [\"Developer, full-stack\",\n",
+    "          \"Developer, front-end\",\n",
+    "          \"Developer, mobile\"\n",
+    "]\n",
     "\n",
-    "# play with these\n",
-    "language = \"Cobol\"\n",
-    "legend = True\n",
-    "NUM_OF_TOO_RICH = 3\n",
-    "# \"Employment\"\n",
-    "hue = \"Country\"\n",
-    "\n",
-    "devs = None\n",
-    "if len(language) > 1:\n",
-    "    devs = get_lang_devs(df, language)\n",
-    "else:\n",
-    "    devs = get_c_devs(df, lang=language)\n",
-    "replacement_dict = {\n",
-    "    'Less than 1 year': '0.5',\n",
-    "    'More than 50 years': '51',\n",
-    "}\n",
-    "\n",
-    "# https://stackoverflow.com/questions/47443134/update-column-in-pandas-dataframe-without-warning\n",
-    "pd.options.mode.chained_assignment = None # default='warn'\n",
-    "new_column = devs[key2].replace(replacement_dict)\n",
-    "devs[key2] = pd.to_numeric(new_column, errors='coerce')\n",
-    "# print( devs[key2].unique() )\n",
-    "\n",
-    "indices = devs[key].nlargest(NUM_OF_TOO_RICH).index\n",
-    "devs = devs.drop(indices)\n",
-    "print( len (devs) )\n",
-    "\n",
-    "plt.figure()\n",
-    "plt.xticks(rotation=90)\n",
-    "sb.scatterplot(data=devs, x=key2, y=key, hue=hue, legend=legend)\n",
-    "plt.legend(loc='lower center', bbox_to_anchor=(1.5,0)) \n",
-    "plt.title(\"Annual Salary of %s Developers Over Years of Experience\" %language)\n"
+    "foo(so_df, \"Python\", jobs=pyjobs, hue=\"DevType\", n_rich_outliers=7, n_lowest=12)\n",
+    "foo(so_df, \"C\", jobs=cjobs, hue=\"DevType\", n_rich_outliers=3, n_lowest=5)\n",
+    "foo(so_df, \"JavaScript\", jobs=jsjobs, hue=\"DevType\", n_rich_outliers=6, country=\"Ukraine\")"
    ]
   },
   {