Data retrieved from here: https://insights.stackoverflow.com/survey
from pathlib import Path
import pandas as pd
import requests
import zipfile
import io
# Render floats with two decimal places in displayed/printed DataFrames.
pd.set_option("display.precision", 2)

# Zip archive of the 2022 Stack Overflow developer survey and the name of the
# CSV member that is extracted from it into the working directory.
DATASET_URL = 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip'
DATASET_FILE = 'survey_results_public.csv'

# Download and unpack only when the CSV is not already on disk, so repeated
# runs reuse the local copy.
if not Path(DATASET_FILE).exists():
    # A timeout keeps a stalled connection from hanging the run forever, and
    # raise_for_status() surfaces an HTTP error here rather than letting an
    # error page reach zipfile as a confusing "not a zip file" failure.
    r = requests.get(DATASET_URL, timeout=60)
    r.raise_for_status()
    # The whole archive is held in memory; only the one CSV member is written.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extract(DATASET_FILE)

# Full survey table; the cells below read columns from it.
df = pd.read_csv(DATASET_FILE)
def _split_languages(answer):
    """Split one ';'-separated answer into a list; an empty answer gives []."""
    return answer.split(';') if answer != '' else []


# Missing answers are replaced by '' first, so every row ends up holding a
# list of language names (possibly empty) instead of a raw string or NaN.
df['LanguageHaveWorkedWith'] = (
    df['LanguageHaveWorkedWith'].fillna('').map(_split_languages)
)

# Every distinct language that appears in at least one respondent's list.
langs = {
    name
    for chosen in df['LanguageHaveWorkedWith']
    for name in chosen
}
# For each language, compute what fraction of the respondents who listed it
# fall into each demographic group. The denominator N is every respondent who
# listed the language, including those who left a demographic question blank.
# Rows are appended in the iteration order of `langs`.
div = []
for lang in langs:
    # Rows whose language list contains this language.
    dfl = df[df.LanguageHaveWorkedWith.map(lambda used: lang in used)]
    N = len(dfl)
    # Exact-match counts: only the literal answers 'No' (Trans) and 'Man'
    # (Gender) are counted; missing answers compare unequal and are skipped.
    cis = (dfl.Trans == 'No').sum()
    male = (dfl.Gender == 'Man').sum()
    # One regex covers both substrings; na=False makes unanswered rows count
    # as non-matches explicitly instead of relying on NaN coercion.
    white = dfl.Ethnicity.str.contains('White|European', na=False).sum()
    div.append({
        'lang': lang,
        'N': N,
        'white': white / N,
        'male': male / N,
        'cis': cis / N,
    })
div = pd.DataFrame(div)
# The sorted copy is not assigned, so `div` keeps its original row order; as
# the last expression of a notebook cell this only displays the table.
div.sort_values('N', ascending=False)
| | lang | N | white | male | cis |
---|---|---|---|---|---|
2 | JavaScript | 46443 | 0.60 | 0.90 | 0.94 |
41 | HTML/CSS | 39142 | 0.60 | 0.90 | 0.94 |
29 | SQL | 35127 | 0.63 | 0.91 | 0.95 |
19 | Python | 34155 | 0.61 | 0.90 | 0.94 |
10 | TypeScript | 24752 | 0.64 | 0.91 | 0.95 |
27 | Java | 23644 | 0.59 | 0.90 | 0.94 |
32 | Bash/Shell | 20656 | 0.71 | 0.90 | 0.94 |
39 | C# | 19883 | 0.67 | 0.92 | 0.94 |
5 | C++ | 16024 | 0.60 | 0.90 | 0.93 |
24 | PHP | 14827 | 0.59 | 0.90 | 0.94 |
36 | C | 13692 | 0.59 | 0.90 | 0.92 |
1 | PowerShell | 8575 | 0.71 | 0.91 | 0.94 |
8 | Go | 7922 | 0.64 | 0.91 | 0.93 |
4 | Rust | 6625 | 0.71 | 0.89 | 0.91 |
14 | Kotlin | 6507 | 0.60 | 0.91 | 0.94 |
31 | Dart | 4648 | 0.48 | 0.92 | 0.94 |
23 | Ruby | 4299 | 0.66 | 0.88 | 0.93 |
28 | Assembly | 3887 | 0.66 | 0.88 | 0.90 |
3 | Swift | 3489 | 0.62 | 0.90 | 0.94 |
18 | R | 3308 | 0.61 | 0.87 | 0.94 |
38 | VBA | 3185 | 0.65 | 0.90 | 0.93 |
6 | MATLAB | 2913 | 0.56 | 0.88 | 0.93 |
25 | Lua | 2867 | 0.69 | 0.87 | 0.89 |
12 | Groovy | 2357 | 0.70 | 0.90 | 0.94 |
22 | Delphi | 2311 | 0.72 | 0.94 | 0.93 |
40 | Scala | 1837 | 0.65 | 0.89 | 0.92 |
13 | Objective-C | 1698 | 0.61 | 0.90 | 0.92 |
15 | Perl | 1644 | 0.69 | 0.87 | 0.89 |
26 | Haskell | 1577 | 0.70 | 0.85 | 0.89 |
30 | Elixir | 1528 | 0.67 | 0.89 | 0.92 |
16 | Julia | 1084 | 0.63 | 0.86 | 0.88 |
20 | Clojure | 1070 | 0.69 | 0.89 | 0.91 |
0 | Solidity | 1031 | 0.46 | 0.90 | 0.91 |
9 | LISP | 932 | 0.70 | 0.85 | 0.87 |
17 | F# | 730 | 0.74 | 0.90 | 0.90 |
35 | Fortran | 646 | 0.69 | 0.86 | 0.87 |
11 | Erlang | 641 | 0.68 | 0.85 | 0.87 |
21 | APL | 504 | 0.45 | 0.73 | 0.72 |
34 | COBOL | 464 | 0.66 | 0.84 | 0.85 |
7 | SAS | 435 | 0.52 | 0.81 | 0.85 |
33 | OCaml | 422 | 0.66 | 0.83 | 0.83 |
37 | Crystal | 340 | 0.59 | 0.81 | 0.81 |
# For each demographic share, print the ten languages where it is lowest.
for k in ('white', 'male', 'cis'):
    lowest_ten = div.sort_values(k).head(10)
    print(f'lowest % of: {k}')
    print(lowest_ten)
    # Blank separator between the sections.
    print('\n')
lowest % of: white
        lang      N  white  male   cis
21       APL    504   0.45  0.73  0.72
0   Solidity   1031   0.46  0.90  0.91
31      Dart   4648   0.48  0.92  0.94
7        SAS    435   0.52  0.81  0.85
6     MATLAB   2913   0.56  0.88  0.93
24       PHP  14827   0.59  0.90  0.94
36         C  13692   0.59  0.90  0.92
27      Java  23644   0.59  0.90  0.94
37   Crystal    340   0.59  0.81  0.81
5        C++  16024   0.60  0.90  0.93


lowest % of: male
       lang     N  white  male   cis
21      APL   504   0.45  0.73  0.72
7       SAS   435   0.52  0.81  0.85
37  Crystal   340   0.59  0.81  0.81
33    OCaml   422   0.66  0.83  0.83
34    COBOL   464   0.66  0.84  0.85
9      LISP   932   0.70  0.85  0.87
26  Haskell  1577   0.70  0.85  0.89
11   Erlang   641   0.68  0.85  0.87
35  Fortran   646   0.69  0.86  0.87
16    Julia  1084   0.63  0.86  0.88


lowest % of: cis
       lang     N  white  male   cis
21      APL   504   0.45  0.73  0.72
37  Crystal   340   0.59  0.81  0.81
33    OCaml   422   0.66  0.83  0.83
7       SAS   435   0.52  0.81  0.85
34    COBOL   464   0.66  0.84  0.85
9      LISP   932   0.70  0.85  0.87
11   Erlang   641   0.68  0.85  0.87
35  Fortran   646   0.69  0.86  0.87
16    Julia  1084   0.63  0.86  0.88
26  Haskell  1577   0.70  0.85  0.89