from pathlib import Path
import pandas as pd
import requests
import zipfile
import io

# Render floating-point values with two decimal places in displayed frames.
pd.options.display.precision = 2


# Location of the zipped survey data and the name of the CSV member inside it.
DATASET_URL = 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip'
DATASET_FILE = 'survey_results_public.csv'

# Download and unpack the CSV only when it is not already present in the
# current working directory, so repeated runs reuse the local copy.
if not Path(DATASET_FILE).exists():
    # Without a timeout requests waits forever on a stalled connection; the
    # value bounds the connect and per-read waits, not the total download time.
    response = requests.get(DATASET_URL, timeout=60)
    # Fail with a clear HTTP error instead of handing an error page to zipfile.
    response.raise_for_status()
    # The whole archive is held in memory; the context manager closes it
    # once the single CSV member has been extracted.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extract(DATASET_FILE)


# Load the survey responses from the local CSV.
df = pd.read_csv(DATASET_FILE)

# LanguageHaveWorkedWith holds ';'-separated names; convert each cell into a
# list of names, with missing answers becoming an empty list.
raw_answers = df['LanguageHaveWorkedWith'].fillna('')
df['LanguageHaveWorkedWith'] = raw_answers.map(
    lambda answer: answer.split(';') if answer else []
)

# Every distinct language name that appears in any response.
langs = {name for used in df['LanguageHaveWorkedWith'] for name in used}


# Build one summary row per language: the number of respondents listing it (N)
# and the share of those respondents matching each demographic filter.
# NOTE(review): every share is divided by N, i.e. all rows for the language;
# rows with a missing Trans/Gender/Ethnicity value (if any) still count
# toward N but never toward the numerator -- confirm this is intended.
records = []
for lang in langs:
    uses_lang = df.LanguageHaveWorkedWith.map(lambda known: lang in known)
    dfl = df[uses_lang]
    N = len(dfl)

    # Ethnicity matches when the text contains either keyword.
    ethnicity = dfl.Ethnicity
    is_white = ethnicity.str.contains('White') | ethnicity.str.contains('European')

    records.append(dict(
        lang=lang,
        N=N,
        white=is_white.sum() / N,
        # Gender and Trans use exact equality against a single answer value.
        male=(dfl.Gender == 'Man').sum() / N,
        cis=(dfl.Trans == 'No').sum() / N,
    ))

div = pd.DataFrame(records)
# Bare expression: shown as the cell output when run in a notebook; it returns
# a sorted copy and leaves `div` itself in its original order.
div.sort_values('N', ascending=False)


# For each demographic share, print the ten languages with the smallest value.
for k in ('white', 'male', 'cis'):
    lowest_ten = div.sort_values(k).head(10)
    print(f'lowest % of: {k}')
    print(lowest_ten)
    print('\n')

lowest % of: white
        lang      N  white  male   cis
21       APL    504   0.45  0.73  0.72
0   Solidity   1031   0.46  0.90  0.91
31      Dart   4648   0.48  0.92  0.94
7        SAS    435   0.52  0.81  0.85
6     MATLAB   2913   0.56  0.88  0.93
24       PHP  14827   0.59  0.90  0.94
36         C  13692   0.59  0.90  0.92
27      Java  23644   0.59  0.90  0.94
37   Crystal    340   0.59  0.81  0.81
5        C++  16024   0.60  0.90  0.93


lowest % of: male
       lang     N  white  male   cis
21      APL   504   0.45  0.73  0.72
7       SAS   435   0.52  0.81  0.85
37  Crystal   340   0.59  0.81  0.81
33    OCaml   422   0.66  0.83  0.83
34    COBOL   464   0.66  0.84  0.85
9      LISP   932   0.70  0.85  0.87
26  Haskell  1577   0.70  0.85  0.89
11   Erlang   641   0.68  0.85  0.87
35  Fortran   646   0.69  0.86  0.87
16    Julia  1084   0.63  0.86  0.88


lowest % of: cis
       lang     N  white  male   cis
21      APL   504   0.45  0.73  0.72
37  Crystal   340   0.59  0.81  0.81
33    OCaml   422   0.66  0.83  0.83
7       SAS   435   0.52  0.81  0.85
34    COBOL   464   0.66  0.84  0.85
9      LISP   932   0.70  0.85  0.87
11   Erlang   641   0.68  0.85  0.87
35  Fortran   646   0.69  0.86  0.87
16    Julia  1084   0.63  0.86  0.88
26  Haskell  1577   0.70  0.85  0.89

Analysis of Diversity in Programming Language Communities

Data Loading

Data Processing

Data Analysis

	lang	N	white	male	cis
2	JavaScript	46443	0.60	0.90	0.94
41	HTML/CSS	39142	0.60	0.90	0.94
29	SQL	35127	0.63	0.91	0.95
19	Python	34155	0.61	0.90	0.94
10	TypeScript	24752	0.64	0.91	0.95
27	Java	23644	0.59	0.90	0.94
32	Bash/Shell	20656	0.71	0.90	0.94
39	C#	19883	0.67	0.92	0.94
5	C++	16024	0.60	0.90	0.93
24	PHP	14827	0.59	0.90	0.94
36	C	13692	0.59	0.90	0.92
1	PowerShell	8575	0.71	0.91	0.94
8	Go	7922	0.64	0.91	0.93
4	Rust	6625	0.71	0.89	0.91
14	Kotlin	6507	0.60	0.91	0.94
31	Dart	4648	0.48	0.92	0.94
23	Ruby	4299	0.66	0.88	0.93
28	Assembly	3887	0.66	0.88	0.90
3	Swift	3489	0.62	0.90	0.94
18	R	3308	0.61	0.87	0.94
38	VBA	3185	0.65	0.90	0.93
6	MATLAB	2913	0.56	0.88	0.93
25	Lua	2867	0.69	0.87	0.89
12	Groovy	2357	0.70	0.90	0.94
22	Delphi	2311	0.72	0.94	0.93
40	Scala	1837	0.65	0.89	0.92
13	Objective-C	1698	0.61	0.90	0.92
15	Perl	1644	0.69	0.87	0.89
26	Haskell	1577	0.70	0.85	0.89
30	Elixir	1528	0.67	0.89	0.92
16	Julia	1084	0.63	0.86	0.88
20	Clojure	1070	0.69	0.89	0.91
0	Solidity	1031	0.46	0.90	0.91
9	LISP	932	0.70	0.85	0.87
17	F#	730	0.74	0.90	0.90
35	Fortran	646	0.69	0.86	0.87
11	Erlang	641	0.68	0.85	0.87
21	APL	504	0.45	0.73	0.72
34	COBOL	464	0.66	0.84	0.85
7	SAS	435	0.52	0.81	0.85
33	OCaml	422	0.66	0.83	0.83
37	Crystal	340	0.59	0.81	0.81