PronounsPage/census/analyse.py

from typing import Union, List, Dict
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import os
from datetime import datetime
import json
from argparse import ArgumentParser
import shutil
year = datetime.now().year
projectDir = Path(__file__).parent
inputDir = projectDir / 'input'
outputDir = projectDir.parent / 'locale' / 'pl' / 'docs' / f'spis-{year}'
openFigs = False
colours = ['#c71585']
colours_multi = ['#c71585', '#8b0f7a', '#15c79c', '#20a0d7']
pd.options.mode.chained_assignment = None
fontFamily = 'Nunito, "Open Sans", sans-serif'
graphHead = '''
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Nunito:wght@500&display=swap" rel="stylesheet">
'''
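# Maps answers to the transition-related questions onto a sentiment score:
# 1 = has transitioned / is transitioning / intends to, 0 = undecided or declined to answer,
# -1 = does not intend to or has detransitioned. The keys must match the answer strings
# in the exported CSV; their insertion order is also used to order the bars in the graphs.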
transition_sentiment = {
    'już przeszł_m proces': 1,
    'jestem w trakcie': 1,
    'stosuję microdosing': 1,
    'w części miejsc tak, w części nie': 1,
    'zamierzam zacząć w przyszłości': 1,
    'zamierzam przejść w przyszłości': 1,
    'już przeszł_m proces zmiany na opcję neutraną': 1,
    'tak': 1,
    'jestem w trakcie procesu zmiany na opcję neutraną': 1,
    'jestem w trakcie procesu zmiany na drugi znacznik binarny': 1,
    'zamierzam przejść w przyszłości, tylko jeśli w Polsce będzie możliwe usunięcie znacznika': 1,
    'zamierzam przejść w przyszłości, tylko jeśli w Polsce będzie dostępna opcja neutralna': 1,
    'zamierzam przejść w przyszłości, na znacznik binarny': 1,
    'już przeszł_m proces zmiany na drugi znacznik binarny': 1,
    'przeszł_m część z zabiegów, które chcę przejść': 1,
    'przyjmuję hormony': 1,
    'przechodzę, we wszystkich obszarach życia': 1,
    'przeszł_m już wszystkie zabiegi, które chcę przejść': 1,
    'jeszcze nie wiem': 0,
    'nie chcę odpowiadać': 0,
    'nie zamierzam': -1,
    'przechodzę/przeszł_m detranzycję': -1,
    'przechodzę/przeszł_m <u>de</u>tranzycję': -1,
}

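# Converts raw answer counts into percentages of the given subset, keeping the raw
# counts alongside them under '<answer>__count' keys.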
def calculate_percentages(df: pd.DataFrame, counts: pd.Series) -> pd.Series:
    percentages = counts.apply(lambda x: round(100 * x / len(df), 1))
    return pd.Series({
        **{k: percentages[k] for k in percentages.index},
        **{f"{k}__count": counts[k] for k in counts.index}
    })

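# Extracts a multiple-choice (checkbox) question: selects its answer columns by regex,
# cleans up the column labels, and returns per-answer percentages and counts.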
def extract_question(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False,
    remove_underscores: bool = True
) -> pd.Series:
    print('Extracting question', question_number)
    question_series = df.filter(regex='^%s_%s(?!_writein)' % (
        question_number,
        ('' if include_answers else 'aggr_') if include_aggregates else '(?!aggr)'
    ))
    question_series.columns = [
        c[len(str(question_number)) + 1:]
            .replace('aggr_', 'łącznie: ')
            .replace('_', ' ' if remove_underscores else '_')
            .replace('łącznie: trans_', 'łącznie: trans*')
        for c in question_series.columns
    ]
    return calculate_percentages(df, question_series.sum())

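# Extracts a single-choice question: counts how many respondents picked each answer
# in the '<n>_' column and returns percentages and counts.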
def extract_question_single(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
    print('Extracting question', question_number)
    question_series = df.filter(regex='^%s_%s(?!_writein)' % (
        question_number,
        ('' if include_answers else 'aggr_') if include_aggregates else '(?!aggr)'
    ))
    return calculate_percentages(df, question_series.groupby(f'{question_number}_')[f'{question_number}_'].count())

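# Collapses a single-choice question into positive / neutral / negative buckets
# using a sentiment map such as transition_sentiment above.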
def extract_question_single_sentiment(
    df: pd.DataFrame,
    question_number: int,
    sentiment_map: Dict[str, int],
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
    question_series = extract_question_single(df, question_number, include_answers, include_aggregates)
    question_series.index = question_series.index.map(sentiment_map)
    question_series = question_series.groupby(question_series.index).sum()
    sentiment_series = pd.Series(
        data=[
            round(question_series[1], 1),
            round(question_series[0], 1),
            round(question_series[-1], 1),
            round(question_series[0] + question_series[-1], 1)
        ],
        index=['positive', 'neutral', 'negative', 'non-positive'],
    )
    return sentiment_series

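# Replaces the index labels of a Series or DataFrame with the given list.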
def rename_index(data: Union[pd.DataFrame, pd.Series], new_index: List[str]) -> Union[pd.DataFrame, pd.Series]:
    if type(data) is pd.Series:
        data = data.copy()
        data.index = new_index
        return data
    return data.set_axis(labels=new_index, axis=0)

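# Renders a bar chart for one question (or a comparison of groups) and writes it as a
# standalone HTML file under outputDir/<group>/<name>.html, injecting the font preloads
# from graphHead into the generated page.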
def generate_graph(
    data: Union[pd.DataFrame, pd.Series],
    group: str,
    name: str,
    title: str,
    show: bool = False
):
    print('Generating graph', group, name)
    if isinstance(data, pd.DataFrame):
        data = data.loc[[idx for idx in data.index if not str(idx).endswith('__count')]]
    elif isinstance(data, pd.Series):
        data = data[[not str(k).endswith('__count') for k in data.index]].reindex()
    is_multi = type(data) is pd.DataFrame and len(data.columns) > 1
    if all([type(c) is str and c.startswith('łącznie: ') for c in data.index]):
        data = rename_index(data, [c.replace('łącznie: ', '') for c in data.index])
    if 'nic, używam imienia nadanego mi przez rodziców' in data.index:
        data = rename_index(data, [
            'nadane ale przeciwna płeć',
            'nadane ale wersja unisex',
            'nadane bez zmian',
            'wybrane rzeczownik',
            'wybrane binarne',
            'wybrane unisex',
        ])
    if 'zamierzam zacząć w przyszłości' in data.index:
        data = data.reindex(transition_sentiment.keys()).dropna()
    fig = px.bar(
        data,
        color_discrete_sequence=colours_multi if is_multi else colours,
        barmode='group',
        template='plotly_white',
    )
    fig.update_layout(
        showlegend=is_multi,
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title=''),
        title=title,
        xaxis=None,
        yaxis=None,
        font=dict(family=fontFamily, size=14),
    )
    # i can't believe i have to do that because a professor can't read a graph 🤦
    if name == 'age':
        fig.update_layout(
            xaxis=dict(tickvals=[13, 20, 30, 40, 50, 60, 70]),
        )
    for trace in fig.select_traces():
        trace.update(
            hovertemplate='%{x}<br>%{y:.2f}%' + ('<br>%{meta}' if is_multi else '') + '<extra></extra>',
            meta=trace.offsetgroup,
            hoverlabel_font=dict(family=fontFamily, size=12),
            visible='legendonly' if trace.name == 'Ogół' else True,
        )
    file_path = outputDir / group / (name + '.html')
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')
    with open(file_path, 'r') as fr:
        content = fr.read().replace('</head>', graphHead + '</head>')
    with open(file_path, 'w') as fw:
        fw.write(content)

def percent(value: int, size: int, precision: int = 2) -> float:
    return round(100 * value / size, precision)

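# Recreates a directory from scratch (removes it first if it already exists).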
def ensure_empty_dir(dir: Path) -> Path:
    if os.path.exists(dir):
        shutil.rmtree(dir)
    os.makedirs(dir, exist_ok=True)
    return dir

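# Runs the full analysis for one respondent group: computes summary stats and per-question
# breakdowns, compares them against previous years' stats.json files (diffs), and writes
# outputDir/<group>/stats.json.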
def analyse(group: str, df: pd.DataFrame, full_df: pd.DataFrame, echo: bool = False, diffs: List[int] = None):
    ensure_empty_dir(outputDir / group)

    df['18_'] = df['18_'].str.rstrip('.')
    df['25_'] = df['25_'].str.replace('<u>', '').str.replace('</u>', '')
    df['28_'] = df['28_'].str.replace('<u>', '').str.replace('</u>', '')
    df['29_'] = df['29_'].str.replace('<u>', '').str.replace('</u>', '')
    df['30_'] = df['30_'].str.replace('<u>', '').str.replace('</u>', '')

    df_plural = df[df['7_aggr_mnogie'] == 1]
    df_neuter = df[df['7_rodzaj neutralny'] == 1]
    df_transition = df[df['24_'] == 'tak']
    df_english = df[df['19_nie znam / nie używam angielskiego'] != 1].drop('19_nie znam / nie używam angielskiego', axis=1)
    df_attraction_split = df[df['21_'] == 'tak']

    stats = {
        'size': len(df),
        'size_subset_percent': percent(len(df), len(full_df)),
        'size_plural': len(df_plural),
        'size_plural_percent': percent(len(df_plural), len(df)),
        'size_neuter': len(df_neuter),
        'size_neuter_percent': percent(len(df_neuter), len(df)),
        'size_transition': len(df_transition),
        'size_transition_subset_percent': percent(len(df_transition), len(df)),
        'size_english': len(df_english),
        'age': pd.Series(build_ages_histogram(df)),
        'ageStats': {
            'avg': round(df['age'].mean(), 1),
            'median': round(df['age'].median(), 1),
            'std': round(df['age'].std(), 1),
            'adults': percent(len(df[df['age'] >= 18]), len(df)),
            'adults_count': len(df[df['age'] >= 18]),
            'under_30': percent(len(df[df['age'] < 30]), len(df)),
            'under_30_count': len(df[df['age'] < 30]),
            'over_30': percent(len(df[df['age'] >= 30]), len(df)),
            'over_30_count': len(df[df['age'] >= 30]),
            'under_25': percent(len(df[df['age'] < 25]), len(df)),
            'under_25_count': len(df[df['age'] < 25]),
            'over_25': percent(len(df[df['age'] >= 25]), len(df)),
            'over_25_count': len(df[df['age'] >= 25]),
        },
        'neuter': extract_question(df, 6),
        'neuterByUsers': extract_question(df_neuter, 6),
        'pronounGroups': extract_question(df, 7),
        'pronounGroupsAggr': extract_question(df, 7, include_answers=False, include_aggregates=True),
        'pluralNouns': extract_question_single(df_plural, 8),
        'pluralNonGendered': extract_question_single(df_plural, 9),
        'pronouns': extract_question(df, 10),
        'pronounsAggr': extract_question(df, 10, include_answers=False, include_aggregates=True),
        'nouns': extract_question(df, 11),
        'honorifics': extract_question(df, 12, include_aggregates=True),
        'obstacles': extract_question(df, 13),
        'groups': extract_question(df, 14),
        'reasons': extract_question(df, 15),
        'namesGender': extract_question_single(df, 16),  # TODO missing aggregate
        'names': extract_question_single(df, 17),
        'namesAggr': extract_question(df, 17, include_answers=False, include_aggregates=True),
        'namesDeclension': extract_question_single(df, 18),
        'english': extract_question(df, 19, include_aggregates=True),  # TODO use df_english
        'labelsGender': extract_question(df, 20, include_aggregates=True, remove_underscores=False),
        'labelsAttractionSplit': extract_question_single(df, 21),
        'labelsSexuality': extract_question(df, 22, include_aggregates=True, remove_underscores=False),
        'labelsRomantic': extract_question(df_attraction_split, 23, include_aggregates=True, remove_underscores=False),
        'transtionAnswered': extract_question_single(df, 24),
        'transitionSocial': extract_question_single(df_transition, 25),
        'transitionSocialSentiment': extract_question_single_sentiment(df_transition, 25, sentiment_map=transition_sentiment),
        'transitionName': extract_question_single(df_transition, 26),
        'transitionNameSentiment': extract_question_single_sentiment(df_transition, 26, sentiment_map=transition_sentiment),
        'transitionMarker': extract_question_single(df_transition, 27),
        'transitionMarkerSentiment': extract_question_single_sentiment(df_transition, 27, sentiment_map=transition_sentiment),
        'transitionPhysical': extract_question_single(df_transition, 28),
        'transitionPhysicalSentiment': extract_question_single_sentiment(df_transition, 28, sentiment_map=transition_sentiment),
        'transitionHormonal': extract_question_single(df_transition, 29),
        'transitionHormonalSentiment': extract_question_single_sentiment(df_transition, 29, sentiment_map=transition_sentiment),
        'transitionSurgical': extract_question_single(df_transition, 30),
        'transitionSurgicalSentiment': extract_question_single_sentiment(df_transition, 30, sentiment_map=transition_sentiment),
    }

    stats_json = {
        k: v.to_dict() if type(v) is pd.Series else v
        for k, v
        in stats.items()
    }

    stats_json['diff'] = {}
    for prev_year in (diffs or []):
        prev_year_key = f'spis-{prev_year}'
        file_path = outputDir.parent / prev_year_key / group / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r') as f:
            prev_stats = json.load(f)
        stats_json['diff'][prev_year_key] = {}
        for k, v in stats_json.items():
            if type(v) != dict or k == 'diff' or k not in prev_stats:
                continue
            stats_json['diff'][prev_year_key][k] = {}
            for kk, vv in v.items():
                if kk not in prev_stats[k]:
                    continue
                stats_json['diff'][prev_year_key][k][kk] = round(vv - prev_stats[k][kk], 1)

    stats_json = json.dumps(stats_json, indent=4)
    if echo:
        print('--- Group: %s ---' % group)
        print(stats_json)
    with open(outputDir / group / 'stats.json', 'w') as f:
        f.write(stats_json + '\n')
    return stats

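# Builds a histogram of respondent ages as percentages of the valid (positive) age values.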
def build_ages_histogram(df: pd.DataFrame) -> pd.Series:
    ages = [int(a) for a in df['age'].to_list() if a > 0]
    ages_hist = {i: 0 for i in range(min(ages), max(ages) + 1)}
    for age in ages:
        ages_hist[age] += 1
    s = len(ages)
    return pd.Series({
        age: percent(count, s, 3)
        for age, count
        in ages_hist.items()
    })

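# Draws the year-by-year trend line chart from the per-year percentages passed in
# (hardcoded in the __main__ block below) and writes it to outputDir/year_by_year.html.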
def generate_yearly_comparison(data: dict, show: bool = False):
    df = pd.DataFrame(data)

    # Reshape to long format
    df_long = df.melt(id_vars="Form", var_name="Year", value_name="Percentage")

    # Plot
    fig = px.line(
        df_long,
        x="Year", y="Percentage", color="Form",
        title="Trendy na przestrzeni lat",
        template="plotly_white",
        markers=True,
        # color_discrete_sequence=colours_multi,
    )
    fig.update_traces(line=dict(width=4), marker=dict(size=8))
    for trace in fig.select_traces():
        trace.update(
            hovertemplate='%{x}<br>%{y:.2f}%',  # + '<extra></extra>',
        )

    df_last_year = df_long[df_long["Year"] == df_long["Year"].max()]
    for _, row in df_last_year.iterrows():
        if row["Form"] == "Neutratywy":
            position = "bottom left"
        else:
            position = "top left"
        fig.add_trace(go.Scatter(
            x=[row["Year"]],
            y=[row["Percentage"]],
            text=["<b>" + row["Form"] + "</b>"],
            mode="text",
            textposition=position,
            showlegend=False,
            hoverinfo="skip",
            textfont=dict(family=fontFamily, size=16),
        ))

    fig.update_layout(
        xaxis=None,
        yaxis_title="Procent (%)",
        showlegend=False,
        # legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title=''),
        font=dict(family=fontFamily, size=14),
    )

    file_path = outputDir / 'year_by_year.html'
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')

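# Entry point: loads the survey export, keeps only respondents who answered question 0
# with 'osobą niebinarną' or 'nie wiem', derives age from question 3, then generates the
# yearly comparison chart, per-group analyses, comparison graphs, year-by-year graphs,
# and the write-in exports.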
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('-s', '--show', dest='show', default=False, nargs='?', const=True)
    parser.add_argument('-e', '--echo', dest='echo', default=False, nargs='?', const=True)
    args = parser.parse_args()
    if args.show:
        openFigs = True

    df = pd.read_csv(inputDir / 'export.csv')
    df = df[df['0_'].isin(['osobą niebinarną', 'nie wiem'])]
    df.loc[:, 'age'] = year - df['3_']
    df.loc[df['age'] > 100, 'age'] = None

    diffs = [2021, 2022, 2023, 2024]

    generate_yearly_comparison({
        "Form": [
            "Rodzaj neutralny",
            "Rodzaj postpłciowy",
            "Wyłącznie formy binarne",
            "Wyłącznie formy niebinarne",
            "Neutratywy",
            "Osobatywy"
        ],
        "2021": [25.4, 8, 53.6, 8.4, 12, 66.7],
        "2022": [43.1, 11.8, 15.7, 15.5, 14.3, 65.4],
        "2023": [48, 9.3, 15.6, 18.2, 11.7, 67.5],
        "2024": [45.9, 6.2, 18, 19.4, 12.3, 68],
        "2025": [52.3, 8.6, 15.5, 18.9, 15.4, 74.4]
    })

    stats = {
        'general': analyse('general', df, df, args.echo, diffs),
        'location_poland': analyse('location_poland', df[df['4_'] == 'w Polsce'], df, args.echo, diffs),
        'location_abroad': analyse('location_abroad', df[df['4_'] == 'za granicą'], df, args.echo, diffs),
        'agab_f': analyse('agab_f', df[df['1_'] == 'żeńską'], df, args.echo, diffs),
        'agab_m': analyse('agab_m', df[df['1_'] == 'męską'], df, args.echo, diffs),
        # 'agab_x': analyse('agab_x', df[df['1_'] == 'inną (w jurysdykcjach, gdzie to możliwe)'], df, args.echo, diffs),
        'younger': analyse('younger', df[df['age'] < 25], df, args.echo, diffs),
        'older': analyse('older', df[df['age'] >= 25], df, args.echo, diffs),
        'name_f': analyse('name_f', df[df['16_'] == 'żeńskie'], df, args.echo, diffs),
        'name_m': analyse('name_m', df[df['16_'] == 'męskie'], df, args.echo, diffs),
        'name_n': analyse('name_n', df[df['16_'] == 'neutralne płciowo'], df, args.echo, diffs),
    }

    comparisons = {
        'by_location': {
            'general': 'Ogół',
            'location_poland': 'Polska',
            'location_abroad': 'Zagranica',
        },
        'by_agab': {
            'general': 'Ogół',
            'agab_f': 'AFAB',
            'agab_m': 'AMAB',
        },
        'by_age': {
            'general': 'Ogół',
            'younger': 'Grupa młodsza',
            'older': 'Grupa starsza',
        },
        'by_name': {
            'name_f': 'Imię żeńskie',
            'name_m': 'Imię męskie',
            'name_n': 'Imię neutralne płciowo',
        },
    }

    graphs = {
        'age': 'Wiek osób respondenckich',
        'neuter': 'Preferowana nazwa rodzaju gramatycznego',
        'neuterByUsers': 'Preferowana nazwa rodzaju gramatycznego wśród osób go używających',
        'pronounGroups': 'Rodzaj gramatyczny używany w mowie',
        'pronouns': 'Zaimki używane w piśmie',
        'pluralNouns': 'Czy rzeczowniki również w liczbie mnogiej?',
        'pluralNonGendered': 'Czy nieupłciowione formy również w liczbie mnogiej?',
        'pronounsAggr': 'Zaimki używane w piśmie (zgrupowane)',
        'nouns': 'Rzeczowniki',
        'honorifics': 'Formy grzecznościowe',
        'obstacles': 'Dlaczego nie formy niebinarne?',
        'reasons': 'Co wpływa na wybór form?',
        'groups': 'Formy do opisu grup mieszanych',
        'namesGender': 'Upłciowienie używanego imienia',
        'names': 'Wybór używanego imienia',
        'namesDeclension': 'Odmiana używanego imienia',
        'english': 'Zaimki w języku angielskim',
        'labelsGender': 'Etykietki opisujące płeć',
        'labelsSexuality': 'Etykietki opisujące orientację seksualną',
        'labelsRomantic': 'Etykietki opisujące orientację romantyczną',
        'transitionSocial': 'Tranzycja społeczna',
        'transitionName': 'Tranzycja prawna imię',
        'transitionMarker': 'Tranzycja prawna znacznik płci',
        'transitionPhysical': 'Tranzycja medyczna bindery, packery, itp.',
        'transitionHormonal': 'Tranzycja medyczna hormony',
        'transitionSurgical': 'Tranzycja medyczna zmiany chirurgiczne',
    }

    for group, group_stats in stats.items():
        for graph, graph_label in graphs.items():
            generate_graph(group_stats[graph], group, graph, graph_label)

    for comparison_key, comparison_groups in comparisons.items():
        ensure_empty_dir(outputDir / comparison_key)
        for graph, graph_label in graphs.items():
            if comparison_key == 'by_name' and graph != 'namesDeclension':
                continue
            data = pd.DataFrame({
                groupLabel: stats[group][graph]
                for group, groupLabel
                in comparison_groups.items()
            })
            generate_graph(data, comparison_key, graph, graph_label)

    by_year = {}
    for prev_year in [*diffs, year]:
        file_path = outputDir.parent / f'spis-{prev_year}' / 'general' / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r') as f:
            by_year[prev_year] = json.load(f)

    ensure_empty_dir(outputDir / 'by_year')
    for graph, graph_label in graphs.items():
        data = pd.DataFrame({
            column_year: year_data[graph]
            for column_year, year_data
            in by_year.items()
            if graph in year_data
        })
        generate_graph(data, 'by_year', graph, graph_label)

    write_ins_dir = outputDir / 'write_ins'
    write_ins_dir.mkdir(parents=True, exist_ok=True)
    for column in df.columns:
        if not column.endswith('__writein') and column != '31_':
            continue
        print(f'Extracting write-ins for question {column}')
        writeins = df[column].dropna().value_counts().reset_index()
        writeins.columns = ['write-in', 'count']
        writeins = writeins[['count', 'write-in']]
        writeins = writeins.sort_values(by=['count'], ascending=False)
        writeins.to_csv(write_ins_dir / f'{column}.tsv', index=False, sep='\t')