diff --git a/stackoverflow-survey.ipynb b/stackoverflow-survey.ipynb index 34ecd6b..59f75fc 100644 --- a/stackoverflow-survey.ipynb +++ b/stackoverflow-survey.ipynb @@ -38,281 +38,6 @@ "# print(so_df[:3])\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "35b9727a-176c-4193-a1f9-a508aecd2d1c", - "metadata": {}, - "outputs": [], - "source": [ - "# get popularity of different programming languages\n", - "\n", - "#keys re: languages are:\n", - "#LanguageHaveWorkedWith,LanguageWantToWorkWith,LanguageAdmired,LanguageDesired\n", - "\n", - "# draw as strip chart\n", - "# https://seaborn.pydata.org/generated/seaborn.stripplot.html#seaborn.stripplot\n", - "\n", - "def get_langs(dataset, key=\"LanguageHaveWorkedWith\"):\n", - " lang_count = Counter()\n", - " assert(key in dataset.keys())\n", - " for response in dataset[key]:\n", - " if type(response) == str:\n", - " lang_count.update(response.split(';'))\n", - " langs_by_popularity = dict(\n", - " sorted(lang_count.items(), key=lambda item: item[1], reverse=True)\n", - " )\n", - " return langs_by_popularity\n", - "\n", - "def visualize_langs(langs, langs2, label1 = \"condition1\", label2 = \"condition2\", saveto=None):\n", - " DOT_COLOR1 = \"lightblue\"\n", - " DOT_COLOR2 = \"red\"\n", - " BG_COLOR = \"black\" \n", - " df = pd.DataFrame(langs.items(), columns=['Languages', 'Count'])\n", - " df2 = pd.DataFrame(langs2.items(), columns=['Languages', 'Count'])\n", - " \n", - " plt.figure(figsize=(10,15)) \n", - " \n", - " sb.stripplot(x='Count', y='Languages', data=df, \\\n", - " size=5, color=DOT_COLOR1, label=\"have worked with\", jitter=True)\n", - " sb.stripplot(x='Count', y='Languages', data=df2, \\\n", - " size=5, color=DOT_COLOR2, label=\"want to work with\", jitter=True)\n", - " \n", - " # chatgpt draws my legend\n", - " # Create custom legend handles to avoid duplicates\n", - " # color = 'w' means do not draw line bissecting point\n", - " blue_patch = plt.Line2D(\n", - " [0], [0], marker='o', color=BG_COLOR, \\\n", - " label=label1, markerfacecolor=DOT_COLOR1, markersize=10)\n", - " red_patch = plt.Line2D(\n", - " [0], [0], marker='o', color=BG_COLOR, \\\n", - " label=label2, markerfacecolor=DOT_COLOR2, markersize=10)\n", - " \n", - " # Show the legend with custom handles\n", - " plt.legend(handles=[blue_patch, red_patch], loc=\"center right\")\n", - " \n", - " plt.grid(axis='x', linestyle='--', alpha=0.75) \n", - " plt.title(\"%s vs %s\" % (label1, label2))\n", - " if saveto is not None:\n", - " plt.savefig(saveto, bbox_inches='tight')\n", - " del df, df2\n", - "\n", - "l1 = get_langs( so_df )\n", - "l2 = get_langs( so_df, \"LanguageWantToWorkWith\" )\n", - "visualize_langs(l1,l2, \n", - " label1=\"have worked with\", label2=\"want to work with\",\n", - " saveto=\"images/used-vs-want2use.png\")\n", - "\n", - "l3 = get_langs( so_df, \"LanguageAdmired\")\n", - "l4 = get_langs( so_df, \"LanguageWantToWorkWith\")\n", - "visualize_langs(l3, l4, \n", - " label1=\"admired\", label2=\"want to work with\",\n", - " saveto=\"images/admired-vs-want2use.png\")\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0bfdb92-378a-4452-91cc-4d21afd2d6cc", - "metadata": {}, - "outputs": [], - "source": [ - "# draw horizontal bar plot\n", - "# https://seaborn.pydata.org/examples/part_whole_bars.html\n", - "\n", - "# investigate extrinsic vs intrinsic motivation\n", - "def get_difference(dict1, dict2, proportion=False):\n", - " keys = dict1.keys()\n", - " result = dict()\n", - " for key in keys:\n", - " if proportion:\n", - " result[key] = round((dict1[key] - dict2[key])/dict2[key],2)\n", - " else:\n", - " result[key] = dict1[key] - dict2[key]\n", - " return result\n", - "\n", - "def visualize_diff(diff_dict, color=\"lightblue\", saveto=None):\n", - " diff_sorted = dict(\n", - " sorted(diff_dict.items(), key=lambda item: item[1], reverse=True)\n", - " )\n", - " KEY = \"Value\"\n", - " df = pd.DataFrame(diff_sorted.items(), columns=['Languages', 'Value'])\n", - " plt.figure(figsize=(15,20)) \n", - " sb.barplot(x=KEY, y='Languages', data=df, color=color)\n", - " DELTA = '\\u0394'\n", - " for index, value in enumerate(df[KEY]):\n", - " # chatgpt annotates my chart\n", - " # Position the text at the base of the bar\n", - " if value >= 0:\n", - " # Adjust the x position for positive values\n", - " plt.text(value, index, DELTA+str(value), va='center', ha=\"left\") \n", - " else:\n", - " # Adjust the x position for negative values\n", - " plt.text(value, index, DELTA+str(value), va='center', ha='right') \n", - " lowest = 0\n", - " offset = 0\n", - " positive_values = df[df[KEY] > 0][KEY]\n", - " if not positive_values.empty:\n", - " lowest = positive_values.min()\n", - " offset = list(positive_values).count(lowest) \n", - " if len(positive_values) < len(df):\n", - " # don't draw the line if every value is greater than 0_\n", - " plt.axhline(y=df[KEY].tolist().index(lowest) + (offset-0.5), \n", - " color='red', linestyle='--', zorder=-1)\n", - " if saveto is not None:\n", - " plt.savefig(saveto, bbox_inches='tight')\n", - " \n", - "motiv_diff = get_difference(l2, l1, proportion=True)\n", - "# print(motiv_diff)\n", - "visualize_diff(motiv_diff, saveto=\"images/delta.png\")\n", - "motiv_diff = get_difference(l2, l1)\n", - "visualize_diff(motiv_diff, saveto=\"images/delta-b.png\")\n", - "\n", - "# no clear description of what \"admired\" is\n", - "# in the schema\n", - "# but generally people want to use the languages\n", - "# they admire\n", - "\n", - "# determine level of hype\n", - "# hype = get_difference(l4, l3)\n", - "# print(hype)\n", - "# visualize_diff(hype, color=\"red\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6b1a935-eeda-416f-8adf-5e854d3aa066", - "metadata": {}, - "outputs": [], - "source": [ - "# do people fall out of love with langs\n", - "# the more they are used professionally?\n", - "\n", - "def visualize_favor(df, key_x, key_y, MAGIC_X=0, MAGIC_Y=0, title=str(), saveto=None):\n", - " plt.figure()\n", - " OFFSET = 1 # push text away from point slightly\n", - " for i in range(merged.shape[0]):\n", - " # label points that aren't un a cluster\n", - " if merged[key_x][i] > MAGIC_X or merged[key_y][i] > MAGIC_Y:\n", - " plt.text(merged[key_x].iloc[i]+OFFSET, \n", - " merged[key_y].iloc[i]+OFFSET, \n", - " merged[\"Language\"].iloc[i], \n", - " ha=\"left\",\n", - " size='medium')\n", - "\n", - " sb.scatterplot(data=merged, x=key_x, y=key_y, hue=\"Language\")\n", - " plt.legend(loc='lower left', bbox_to_anchor=(0, -1.25), ncol=3) \n", - " plt.title(title)\n", - " if saveto is not None:\n", - " plt.savefig(saveto, bbox_inches='tight')\n", - " pass\n", - "key_x = \"Users\"\n", - "key_y = \"Potential '\\u0394'Users\"\n", - "df1 = pd.DataFrame(l1.items(), columns=['Language', key_x])\n", - "df2 = pd.DataFrame(motiv_diff.items(), columns=['Language', key_y])\n", - "# chatgpt tells me how to combine df\n", - "merged = pd.merge(df1, df2[[\"Language\", key_y]], on='Language', how='left')\n", - "visualize_favor(merged, key_x, key_y, \n", - " MAGIC_X=5000, MAGIC_Y=2000, \n", - " saveto=\"images/favor.png\")\n", - "del df1, df2, merged" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90cf119-c50d-468a-bc87-72dac41176ce", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# see how much money are people making\n", - "\n", - "def get_mean_by_category(df, category, key=\"ConvertedCompYearly\"):\n", - " unique = df[category].unique()\n", - " result = dict()\n", - " for u in unique:\n", - " mean = df[df[category] == u][key].mean()\n", - " result[u] = mean\n", - " return result\n", - "\n", - "def show_me_the_money(df, saveto=None):\n", - " key_x = \"ConvertedCompYearly\"\n", - " key_y = \"DevType\"\n", - " \n", - " means = get_mean_by_category(df, key_y) \n", - " mean_df = pd.DataFrame(means.items(), columns=[key_y, key_x])\n", - "\n", - " plt.figure(figsize=(14,18)) \n", - " plt.axvline(x=1e5, color='red', linestyle='--', label=\"x = $100,000\")\n", - " plt.axvline(x=1e6, color='lightgreen', linestyle='--', label=\"x = millionaire\")\n", - " sb.barplot(x=key_x, y=key_y, data=mean_df.sort_values(by=key_x), \\\n", - " color='lavender', alpha=0.7, label=\"average compensation\")\n", - " sb.stripplot(x=key_x, y=key_y, data=df, \\\n", - " size=3, jitter=True)\n", - " if saveto is not None:\n", - " plt.savefig(saveto, bbox_inches='tight')\n", - " \n", - "# print survey ans\n", - "#employment_status = Counter(so_df[\"MainBranch\"])\n", - "#print(employment_status)\n", - "\n", - "#employment_type = Counter(so_df[\"DevType\"])\n", - "#print(employment_type)\n", - "\n", - "key = \"ConvertedCompYearly\"\n", - "# answers = so_df[:-1][key].count()\n", - "# print(answers, \"people answered re: \", key)\n", - "df_no_na = so_df.dropna(subset=[key])\n", - "indices = df_no_na[key].nlargest(15).index\n", - "\n", - "show_me_the_money( df_no_na.drop(indices), saveto=\"images/compensation-by-profession.png\" )\n", - "# could also ask myself what portion of developers \n", - "# earn less than the mean compensation\n", - "# (what titles have high standard deviations in earnings)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdf21b1c-1316-422f-ad14-48150f80366c", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# key = \"DevType\"\n", - "# prof = \"Developer, full-stack\"\n", - "\n", - "key = \"MainBranch\"\n", - "prof = \"I am a developer by profession\"\n", - "col = \"ConvertedCompYearly\"\n", - "\n", - "devs = df_no_na[df_no_na[key] == prof ] \n", - "pd.set_option('display.float_format', '{:.2f}'.format)\n", - "devs.describe()[col]\n", - "\n", - "# who the hell is making $1/yr \n", - "# devs[devs[col] == 1.0]\n", - "\n", - "# who are the millionaires\n", - "# devs[devs[col] > 1e6]\n", - "\n", - "# who make more than the mean\n", - "# devs[devs[col] > 76230.84]\n", - "\n", - "# who make more than the median\n", - "# devs[devs[col] > 63316.00]\n", - "\n", - "# the ancient ones\n", - "so_df[so_df[\"YearsCodePro\"] == 'More than 50 years']\n", - "# should drop the 18-24 year old who is either bullshitting or recalls a past life\n", - "# 55-64 years old\n", - "# 65 years or older" - ] - }, { "cell_type": "code", "execution_count": null, @@ -358,20 +83,6 @@ " return pd.DataFrame(cdevs)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "11a1b9fb-db48-4749-8d77-4241a99d7bad", - "metadata": {}, - "outputs": [], - "source": [ - "visualize_devs( get_c_devs(so_df) , \"C\")\n", - "\n", - "for lang in [\"Cobol\", \"Prolog\", \"Ada\", \"Python\"]:\n", - " foo = get_lang_devs(so_df, lang)\n", - " visualize_devs(foo, lang)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -379,7 +90,6 @@ "metadata": {}, "outputs": [], "source": [ - "##### import numpy as np\n", "\n", "from sklearn.linear_model import LinearRegression, LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", @@ -567,6 +277,314 @@ "js.export_image()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "11a1b9fb-db48-4749-8d77-4241a99d7bad", + "metadata": {}, + "outputs": [], + "source": [ + "visualize_devs( get_c_devs(so_df) , \"C\")\n", + "\n", + "for lang in [\"Cobol\", \"Prolog\", \"Ada\", \"Python\"]:\n", + " foo = get_lang_devs(so_df, lang)\n", + " visualize_devs(foo, lang)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35b9727a-176c-4193-a1f9-a508aecd2d1c", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "# get popularity of different programming languages\n", + "\n", + "#keys re: languages are:\n", + "#LanguageHaveWorkedWith,LanguageWantToWorkWith,LanguageAdmired,LanguageDesired\n", + "\n", + "# draw as strip chart\n", + "# https://seaborn.pydata.org/generated/seaborn.stripplot.html#seaborn.stripplot\n", + "\n", + "def get_langs(dataset, key=\"LanguageHaveWorkedWith\"):\n", + " lang_count = Counter()\n", + " assert(key in dataset.keys())\n", + " for response in dataset[key]:\n", + " if type(response) == str:\n", + " lang_count.update(response.split(';'))\n", + " langs_by_popularity = dict(\n", + " sorted(lang_count.items(), key=lambda item: item[1], reverse=True)\n", + " )\n", + " return langs_by_popularity\n", + "\n", + "def visualize_langs(langs, langs2, label1 = \"condition1\", label2 = \"condition2\", saveto=None):\n", + " DOT_COLOR1 = \"lightblue\"\n", + " DOT_COLOR2 = \"red\"\n", + " BG_COLOR = \"black\" \n", + " df = pd.DataFrame(langs.items(), columns=['Languages', 'Count'])\n", + " df2 = pd.DataFrame(langs2.items(), columns=['Languages', 'Count'])\n", + " \n", + " plt.figure(figsize=(10,15)) \n", + " \n", + " sb.stripplot(x='Count', y='Languages', data=df, \\\n", + " size=5, color=DOT_COLOR1, label=\"have worked with\", jitter=True)\n", + " sb.stripplot(x='Count', y='Languages', data=df2, \\\n", + " size=5, color=DOT_COLOR2, label=\"want to work with\", jitter=True)\n", + " \n", + " # chatgpt draws my legend\n", + " # Create custom legend handles to avoid duplicates\n", + " # color = 'w' means do not draw line bissecting point\n", + " blue_patch = plt.Line2D(\n", + " [0], [0], marker='o', color=BG_COLOR, \\\n", + " label=label1, markerfacecolor=DOT_COLOR1, markersize=10)\n", + " red_patch = plt.Line2D(\n", + " [0], [0], marker='o', color=BG_COLOR, \\\n", + " label=label2, markerfacecolor=DOT_COLOR2, markersize=10)\n", + " \n", + " # Show the legend with custom handles\n", + " plt.legend(handles=[blue_patch, red_patch], loc=\"center right\")\n", + " \n", + " plt.grid(axis='x', linestyle='--', alpha=0.75) \n", + " plt.title(\"%s vs %s\" % (label1, label2))\n", + " if saveto is not None:\n", + " plt.savefig(saveto, bbox_inches='tight')\n", + " del df, df2\n", + "\n", + "l1 = get_langs( so_df )\n", + "l2 = get_langs( so_df, \"LanguageWantToWorkWith\" )\n", + "visualize_langs(l1,l2, \n", + " label1=\"have worked with\", label2=\"want to work with\",\n", + " saveto=\"images/used-vs-want2use.png\")\n", + "\n", + "l3 = get_langs( so_df, \"LanguageAdmired\")\n", + "l4 = get_langs( so_df, \"LanguageWantToWorkWith\")\n", + "visualize_langs(l3, l4, \n", + " label1=\"admired\", label2=\"want to work with\",\n", + " saveto=\"images/admired-vs-want2use.png\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0bfdb92-378a-4452-91cc-4d21afd2d6cc", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "# draw horizontal bar plot\n", + "# https://seaborn.pydata.org/examples/part_whole_bars.html\n", + "\n", + "# investigate extrinsic vs intrinsic motivation\n", + "def get_difference(dict1, dict2, proportion=False):\n", + " keys = dict1.keys()\n", + " result = dict()\n", + " for key in keys:\n", + " if proportion:\n", + " result[key] = round((dict1[key] - dict2[key])/dict2[key],2)\n", + " else:\n", + " result[key] = dict1[key] - dict2[key]\n", + " return result\n", + "\n", + "def visualize_diff(diff_dict, color=\"lightblue\", saveto=None):\n", + " diff_sorted = dict(\n", + " sorted(diff_dict.items(), key=lambda item: item[1], reverse=True)\n", + " )\n", + " KEY = \"Value\"\n", + " df = pd.DataFrame(diff_sorted.items(), columns=['Languages', 'Value'])\n", + " plt.figure(figsize=(15,20)) \n", + " sb.barplot(x=KEY, y='Languages', data=df, color=color)\n", + " DELTA = '\\u0394'\n", + " for index, value in enumerate(df[KEY]):\n", + " # chatgpt annotates my chart\n", + " # Position the text at the base of the bar\n", + " if value >= 0:\n", + " # Adjust the x position for positive values\n", + " plt.text(value, index, DELTA+str(value), va='center', ha=\"left\") \n", + " else:\n", + " # Adjust the x position for negative values\n", + " plt.text(value, index, DELTA+str(value), va='center', ha='right') \n", + " lowest = 0\n", + " offset = 0\n", + " positive_values = df[df[KEY] > 0][KEY]\n", + " if not positive_values.empty:\n", + " lowest = positive_values.min()\n", + " offset = list(positive_values).count(lowest) \n", + " if len(positive_values) < len(df):\n", + " # don't draw the line if every value is greater than 0_\n", + " plt.axhline(y=df[KEY].tolist().index(lowest) + (offset-0.5), \n", + " color='red', linestyle='--', zorder=-1)\n", + " if saveto is not None:\n", + " plt.savefig(saveto, bbox_inches='tight')\n", + " \n", + "motiv_diff = get_difference(l2, l1, proportion=True)\n", + "# print(motiv_diff)\n", + "visualize_diff(motiv_diff, saveto=\"images/delta.png\")\n", + "motiv_diff = get_difference(l2, l1)\n", + "visualize_diff(motiv_diff, saveto=\"images/delta-b.png\")\n", + "\n", + "# no clear description of what \"admired\" is\n", + "# in the schema\n", + "# but generally people want to use the languages\n", + "# they admire\n", + "\n", + "# determine level of hype\n", + "# hype = get_difference(l4, l3)\n", + "# print(hype)\n", + "# visualize_diff(hype, color=\"red\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6b1a935-eeda-416f-8adf-5e854d3aa066", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "# do people fall out of love with langs\n", + "# the more they are used professionally?\n", + "\n", + "def visualize_favor(df, key_x, key_y, MAGIC_X=0, MAGIC_Y=0, title=str(), saveto=None):\n", + " plt.figure()\n", + " OFFSET = 1 # push text away from point slightly\n", + " for i in range(merged.shape[0]):\n", + " # label points that aren't un a cluster\n", + " if merged[key_x][i] > MAGIC_X or merged[key_y][i] > MAGIC_Y:\n", + " plt.text(merged[key_x].iloc[i]+OFFSET, \n", + " merged[key_y].iloc[i]+OFFSET, \n", + " merged[\"Language\"].iloc[i], \n", + " ha=\"left\",\n", + " size='medium')\n", + "\n", + " sb.scatterplot(data=merged, x=key_x, y=key_y, hue=\"Language\")\n", + " plt.legend(loc='lower left', bbox_to_anchor=(0, -1.25), ncol=3) \n", + " plt.title(title)\n", + " if saveto is not None:\n", + " plt.savefig(saveto, bbox_inches='tight')\n", + " pass\n", + "key_x = \"Users\"\n", + "key_y = \"Potential '\\u0394'Users\"\n", + "df1 = pd.DataFrame(l1.items(), columns=['Language', key_x])\n", + "df2 = pd.DataFrame(motiv_diff.items(), columns=['Language', key_y])\n", + "# chatgpt tells me how to combine df\n", + "merged = pd.merge(df1, df2[[\"Language\", key_y]], on='Language', how='left')\n", + "visualize_favor(merged, key_x, key_y, \n", + " MAGIC_X=5000, MAGIC_Y=2000, \n", + " saveto=\"images/favor.png\")\n", + "del df1, df2, merged" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90cf119-c50d-468a-bc87-72dac41176ce", + "metadata": { + "jupyter": { + "source_hidden": true + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# see how much money are people making\n", + "\n", + "def get_mean_by_category(df, category, key=\"ConvertedCompYearly\"):\n", + " unique = df[category].unique()\n", + " result = dict()\n", + " for u in unique:\n", + " mean = df[df[category] == u][key].mean()\n", + " result[u] = mean\n", + " return result\n", + "\n", + "def show_me_the_money(df, saveto=None):\n", + " key_x = \"ConvertedCompYearly\"\n", + " key_y = \"DevType\"\n", + " \n", + " means = get_mean_by_category(df, key_y) \n", + " mean_df = pd.DataFrame(means.items(), columns=[key_y, key_x])\n", + "\n", + " plt.figure(figsize=(14,18)) \n", + " plt.axvline(x=1e5, color='red', linestyle='--', label=\"x = $100,000\")\n", + " plt.axvline(x=1e6, color='lightgreen', linestyle='--', label=\"x = millionaire\")\n", + " sb.barplot(x=key_x, y=key_y, data=mean_df.sort_values(by=key_x), \\\n", + " color='lavender', alpha=0.7, label=\"average compensation\")\n", + " sb.stripplot(x=key_x, y=key_y, data=df, \\\n", + " size=3, jitter=True)\n", + " if saveto is not None:\n", + " plt.savefig(saveto, bbox_inches='tight')\n", + " \n", + "# print survey ans\n", + "#employment_status = Counter(so_df[\"MainBranch\"])\n", + "#print(employment_status)\n", + "\n", + "#employment_type = Counter(so_df[\"DevType\"])\n", + "#print(employment_type)\n", + "\n", + "key = \"ConvertedCompYearly\"\n", + "# answers = so_df[:-1][key].count()\n", + "# print(answers, \"people answered re: \", key)\n", + "df_no_na = so_df.dropna(subset=[key])\n", + "indices = df_no_na[key].nlargest(15).index\n", + "\n", + "show_me_the_money( df_no_na.drop(indices), saveto=\"images/compensation-by-profession.png\" )\n", + "# could also ask myself what portion of developers \n", + "# earn less than the mean compensation\n", + "# (what titles have high standard deviations in earnings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdf21b1c-1316-422f-ad14-48150f80366c", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "\n", + "# key = \"DevType\"\n", + "# prof = \"Developer, full-stack\"\n", + "\n", + "key = \"MainBranch\"\n", + "prof = \"I am a developer by profession\"\n", + "col = \"ConvertedCompYearly\"\n", + "\n", + "devs = df_no_na[df_no_na[key] == prof ] \n", + "pd.set_option('display.float_format', '{:.2f}'.format)\n", + "devs.describe()[col]\n", + "\n", + "# who the hell is making $1/yr \n", + "# devs[devs[col] == 1.0]\n", + "\n", + "# who are the millionaires\n", + "# devs[devs[col] > 1e6]\n", + "\n", + "# who make more than the mean\n", + "# devs[devs[col] > 76230.84]\n", + "\n", + "# who make more than the median\n", + "# devs[devs[col] > 63316.00]\n", + "\n", + "# the ancient ones\n", + "so_df[so_df[\"YearsCodePro\"] == 'More than 50 years']\n", + "# should drop the 18-24 year old who is either bullshitting or recalls a past life\n", + "# 55-64 years old\n", + "# 65 years or older" + ] + }, { "cell_type": "code", "execution_count": null,