Data source: the Stack Overflow Developer Survey (2022 results), retrieved from https://insights.stackoverflow.com/survey
from pathlib import Path
import pandas as pd
import requests
import zipfile
import io
# Show floating-point values with two decimals when DataFrames are displayed.
pd.set_option("display.precision", 2)

# Zip archive of the 2022 survey results and the CSV member read from it.
DATASET_URL = 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip'
DATASET_FILE = 'survey_results_public.csv'

# Download and unpack the archive only when the CSV is not already present in
# the working directory, so re-running does not fetch it again.
if not Path(DATASET_FILE).exists():
    # The timeout keeps a stalled connection from hanging the run forever.
    r = requests.get(DATASET_URL, timeout=60)
    # Fail with a clear HTTP error instead of handing an error page to
    # ZipFile, which would surface as a confusing BadZipFile.
    r.raise_for_status()
    # The archive is held in memory; the context manager closes it once the
    # single CSV member has been extracted.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extract(DATASET_FILE)

# One row per survey response.
df = pd.read_csv(DATASET_FILE)
def _split_languages(answer):
    """Split a ';'-separated answer into a list of names; '' gives []."""
    # ''.split(';') would yield [''], so the empty answer is special-cased.
    return answer.split(';') if answer != '' else []


# Missing answers are replaced by '' first, so every row ends up holding a
# (possibly empty) list of language names.
df['LanguageHaveWorkedWith'] = df.LanguageHaveWorkedWith.fillna('').map(_split_languages)

# Every distinct language named by at least one respondent.
langs = {name for answers in df.LanguageHaveWorkedWith for name in answers}
# For every language, compute the share of its users whose answers mark them
# as white/European, as a man, and as not transgender.
#
# All shares use N (every respondent who listed the language) as the
# denominator, so respondents who left a demographic question blank count
# against the share rather than being excluded.
div = []
for lang in langs:
    # Respondents whose parsed language list contains this language.
    dfl = df[df.LanguageHaveWorkedWith.map(lambda used: lang in used)]
    N = len(dfl)
    # Exact matches only: the answer must be exactly 'No' / 'Man'.
    cis = (dfl.Trans == 'No').sum()
    male = (dfl.Gender == 'Man').sum()
    # Substring match on the Ethnicity answer. na=False makes unanswered rows
    # count as "no match" explicitly, instead of relying on how `|` treats the
    # NaN values that str.contains would otherwise return.
    white = dfl.Ethnicity.str.contains('White|European', na=False).sum()
    div.append({
        'lang': lang,
        'N': N,
        'white': white/N,
        'male': male/N,
        'cis': cis/N
    })
div = pd.DataFrame(div)
# Bare expression: sort_values returns a new frame and leaves `div` itself
# unsorted; as the last statement of a notebook cell it only displays the
# table ordered by respondent count.
div.sort_values('N', ascending=False)
| | lang | N | white | male | cis |
|---|---|---|---|---|---|
| 2 | JavaScript | 46443 | 0.60 | 0.90 | 0.94 |
| 41 | HTML/CSS | 39142 | 0.60 | 0.90 | 0.94 |
| 29 | SQL | 35127 | 0.63 | 0.91 | 0.95 |
| 19 | Python | 34155 | 0.61 | 0.90 | 0.94 |
| 10 | TypeScript | 24752 | 0.64 | 0.91 | 0.95 |
| 27 | Java | 23644 | 0.59 | 0.90 | 0.94 |
| 32 | Bash/Shell | 20656 | 0.71 | 0.90 | 0.94 |
| 39 | C# | 19883 | 0.67 | 0.92 | 0.94 |
| 5 | C++ | 16024 | 0.60 | 0.90 | 0.93 |
| 24 | PHP | 14827 | 0.59 | 0.90 | 0.94 |
| 36 | C | 13692 | 0.59 | 0.90 | 0.92 |
| 1 | PowerShell | 8575 | 0.71 | 0.91 | 0.94 |
| 8 | Go | 7922 | 0.64 | 0.91 | 0.93 |
| 4 | Rust | 6625 | 0.71 | 0.89 | 0.91 |
| 14 | Kotlin | 6507 | 0.60 | 0.91 | 0.94 |
| 31 | Dart | 4648 | 0.48 | 0.92 | 0.94 |
| 23 | Ruby | 4299 | 0.66 | 0.88 | 0.93 |
| 28 | Assembly | 3887 | 0.66 | 0.88 | 0.90 |
| 3 | Swift | 3489 | 0.62 | 0.90 | 0.94 |
| 18 | R | 3308 | 0.61 | 0.87 | 0.94 |
| 38 | VBA | 3185 | 0.65 | 0.90 | 0.93 |
| 6 | MATLAB | 2913 | 0.56 | 0.88 | 0.93 |
| 25 | Lua | 2867 | 0.69 | 0.87 | 0.89 |
| 12 | Groovy | 2357 | 0.70 | 0.90 | 0.94 |
| 22 | Delphi | 2311 | 0.72 | 0.94 | 0.93 |
| 40 | Scala | 1837 | 0.65 | 0.89 | 0.92 |
| 13 | Objective-C | 1698 | 0.61 | 0.90 | 0.92 |
| 15 | Perl | 1644 | 0.69 | 0.87 | 0.89 |
| 26 | Haskell | 1577 | 0.70 | 0.85 | 0.89 |
| 30 | Elixir | 1528 | 0.67 | 0.89 | 0.92 |
| 16 | Julia | 1084 | 0.63 | 0.86 | 0.88 |
| 20 | Clojure | 1070 | 0.69 | 0.89 | 0.91 |
| 0 | Solidity | 1031 | 0.46 | 0.90 | 0.91 |
| 9 | LISP | 932 | 0.70 | 0.85 | 0.87 |
| 17 | F# | 730 | 0.74 | 0.90 | 0.90 |
| 35 | Fortran | 646 | 0.69 | 0.86 | 0.87 |
| 11 | Erlang | 641 | 0.68 | 0.85 | 0.87 |
| 21 | APL | 504 | 0.45 | 0.73 | 0.72 |
| 34 | COBOL | 464 | 0.66 | 0.84 | 0.85 |
| 7 | SAS | 435 | 0.52 | 0.81 | 0.85 |
| 33 | OCaml | 422 | 0.66 | 0.83 | 0.83 |
| 37 | Crystal | 340 | 0.59 | 0.81 | 0.81 |
# For each demographic share, print the ten languages where it is smallest.
for column in ('white', 'male', 'cis'):
    ascending_by_share = div.sort_values(column)
    print(f'lowest % of: {column}')
    print(ascending_by_share.head(10))
    # Blank lines separate one ranking from the next.
    print('\n')
lowest % of: white
lang N white male cis
21 APL 504 0.45 0.73 0.72
0 Solidity 1031 0.46 0.90 0.91
31 Dart 4648 0.48 0.92 0.94
7 SAS 435 0.52 0.81 0.85
6 MATLAB 2913 0.56 0.88 0.93
24 PHP 14827 0.59 0.90 0.94
36 C 13692 0.59 0.90 0.92
27 Java 23644 0.59 0.90 0.94
37 Crystal 340 0.59 0.81 0.81
5 C++ 16024 0.60 0.90 0.93
lowest % of: male
lang N white male cis
21 APL 504 0.45 0.73 0.72
7 SAS 435 0.52 0.81 0.85
37 Crystal 340 0.59 0.81 0.81
33 OCaml 422 0.66 0.83 0.83
34 COBOL 464 0.66 0.84 0.85
9 LISP 932 0.70 0.85 0.87
26 Haskell 1577 0.70 0.85 0.89
11 Erlang 641 0.68 0.85 0.87
35 Fortran 646 0.69 0.86 0.87
16 Julia 1084 0.63 0.86 0.88
lowest % of: cis
lang N white male cis
21 APL 504 0.45 0.73 0.72
37 Crystal 340 0.59 0.81 0.81
33 OCaml 422 0.66 0.83 0.83
7 SAS 435 0.52 0.81 0.85
34 COBOL 464 0.66 0.84 0.85
9 LISP 932 0.70 0.85 0.87
11 Erlang 641 0.68 0.85 0.87
35 Fortran 646 0.69 0.86 0.87
16 Julia 1084 0.63 0.86 0.88
26 Haskell 1577 0.70 0.85 0.89