# See how much money people are making.
#
# NOTE(review): this cell relies on names created in other notebook cells:
# `pd` (pandas), `plt` (matplotlib.pyplot), `sb` (seaborn) and the survey
# DataFrame `so_df`.


def get_mean_by_category(df, category, key="ConvertedCompYearly"):
    """Return the mean of column ``key`` for each distinct value of ``category``.

    Args:
        df: DataFrame containing both the ``category`` and ``key`` columns.
        category: Name of the column whose values define the groups.
        key: Name of the numeric column to average.

    Returns:
        A dict mapping each non-missing category value to the mean of ``key``
        over the rows of that category, in order of first appearance in ``df``.
    """
    # A single groupby pass replaces one boolean-mask scan of the whole frame
    # per category. Rows whose category is missing are skipped: an equality
    # test against NaN never matches, so the per-category mask approach could
    # only ever report a meaningless NaN mean for that group.
    # `sort=False` keeps first-appearance order; `observed=True` only matters
    # for categorical columns, where it avoids entries for unused categories.
    grouped = df.groupby(category, sort=False, observed=True)[key]
    return grouped.mean().to_dict()


def show_me_the_money(df, saveto=None, key_x="ConvertedCompYearly",
                      key_y="DevType"):
    """Plot per-category mean bars with the individual data points on top.

    Draws a horizontal bar per ``key_y`` category showing the mean of
    ``key_x``, overlays every row of ``df`` as a jittered strip plot, and adds
    dashed vertical reference lines at x = 1e5 and x = 1e6.

    Args:
        df: DataFrame containing the ``key_x`` and ``key_y`` columns.
        saveto: Optional destination passed to ``plt.savefig``. When it is a
            path, missing parent directories are created first. Nothing is
            saved when ``None``.
        key_x: Numeric column plotted along the x axis.
        key_y: Categorical column plotted along the y axis.
    """
    import os  # local import: the notebook's import cell is not visible here

    means = get_mean_by_category(df, key_y, key=key_x)
    mean_df = pd.DataFrame(list(means.items()), columns=[key_y, key_x])
    mean_df = mean_df.sort_values(by=key_x)

    # Both layers must share one explicit category order; otherwise each plot
    # derives its own order from its own data and the raw points are not
    # guaranteed to sit on the row of the bar they belong to.
    order = mean_df[key_y].tolist()

    plt.figure(figsize=(14, 18))
    plt.axvline(x=1e5, color='red', linestyle='--', label="x = $100,000")
    plt.axvline(x=1e6, color='lightgreen', linestyle='--',
                label="x = millionaire")
    sb.barplot(x=key_x, y=key_y, data=mean_df, order=order,
               color='lavender', alpha=0.7, label="average compensation")
    sb.stripplot(x=key_x, y=key_y, data=df, order=order,
                 size=3, jitter=True)
    # The artists above carry labels; without a legend they are never shown.
    plt.legend()

    if saveto is not None:
        # savefig does not create directories; only attempt this for real
        # paths, since savefig also accepts file-like objects.
        if isinstance(saveto, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(saveto))
            if parent:
                os.makedirs(parent, exist_ok=True)
        plt.savefig(saveto, bbox_inches='tight')


key = "ConvertedCompYearly"

# Number of top earners removed before plotting.
# NOTE(review): presumably done so a few extreme values do not stretch the
# x axis -- confirm the intent and the choice of 15.
N_OUTLIERS = 15

# Keep only respondents who reported a compensation value.
df_no_na = so_df.dropna(subset=[key])
indices = df_no_na[key].nlargest(N_OUTLIERS).index

show_me_the_money(
    df_no_na.drop(indices),
    saveto="images/compensation-by-profession.png",
)

# TODO: what portion of developers earn less than the mean compensation?
# TODO: which titles have a high standard deviation in earnings?