from typing import Union, List, Dict
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import os
from datetime import datetime
import json
from argparse import ArgumentParser
import shutil
# Census edition being processed: used to derive respondents' age and the output folder name.
year = datetime.now().year

projectDir = Path(__file__).parent
inputDir = projectDir / 'input'
outputDir = projectDir.parent / 'locale' / 'pl' / 'docs' / f'spis-{year}'

# When True every generated figure is opened in the browser (set by the --show flag).
openFigs = False

# Bar colours for single-series and multi-series charts respectively.
colours = ['#c71585']
colours_multi = ['#c71585', '#8b0f7a', '#15c79c', '#20a0d7']

# Silence pandas' chained-assignment warning; analyse() assigns columns on filtered frames.
pd.options.mode.chained_assignment = None

fontFamily = 'Nunito, "Open Sans", sans-serif'

# Snippet that generate_graph() splices into every exported HTML file.
# NOTE(review): currently just a newline — presumably markup was lost; confirm.
graphHead = '''
'''

# Sentiment score of each answer to the transition questions:
# 1 = done / in progress / planned, 0 = undecided or undisclosed, -1 = not planned / detransition.
# The insertion order doubles as the display order of the transition charts.
# NOTE(review): 'neutraną' looks like a typo for 'neutralną', but the keys have to match
# the survey answers verbatim — verify against the export before touching them.
transition_sentiment = {
    'już przeszł_m proces': 1,
    'jestem w trakcie': 1,
    'stosuję microdosing': 1,
    'w części miejsc tak, w części nie': 1,
    'zamierzam zacząć w przyszłości': 1,
    'zamierzam przejść w przyszłości': 1,
    'już przeszł_m proces zmiany na opcję neutraną': 1,
    'tak': 1,
    'jestem w trakcie procesu zmiany na opcję neutraną': 1,
    'jestem w trakcie procesu zmiany na drugi znacznik binarny': 1,
    'zamierzam przejść w przyszłości, tylko jeśli w Polsce będzie możliwe usunięcie znacznika': 1,
    'zamierzam przejść w przyszłości, tylko jeśli w Polsce będzie dostępna opcja neutralna': 1,
    'zamierzam przejść w przyszłości, na znacznik binarny': 1,
    'już przeszł_m proces zmiany na drugi znacznik binarny': 1,
    'przeszł_m część z zabiegów, które chcę przejść': 1,
    'przyjmuję hormony': 1,
    'przechodzę, we wszystkich obszarach życia': 1,
    'przeszł_m już wszystkie zabiegi, które chcę przejść': 1,
    'jeszcze nie wiem': 0,
    'nie chcę odpowiadać': 0,
    'nie zamierzam': -1,
    'przechodzę/przeszł_m detranzycję': -1,
}
def calculate_percentages(df: pd.DataFrame, counts: pd.Series) -> pd.Series:
    """Turn absolute answer counts into percentages of the number of rows in ``df``.

    Args:
        df: Frame whose row count is the 100% base.
        counts: Absolute counts indexed by answer label.

    Returns:
        A Series that holds, for every label, the percentage (one decimal place)
        under the label itself and the raw count under ``"<label>__count"``.
        When ``df`` has no rows every percentage is 0.0.
    """
    total = len(df)

    def as_percentage(count):
        # An empty group has nothing to divide by; report 0% instead of raising ZeroDivisionError.
        if total == 0:
            return 0.0
        return round(100 * count / total, 1)

    percentages = counts.apply(as_percentage)
    return pd.Series({
        **{k: percentages[k] for k in percentages.index},
        **{f"{k}__count": counts[k] for k in counts.index},
    })
def extract_question(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False,
    remove_underscores: bool = True
) -> pd.Series:
    """Summarise a question stored as one column per answer.

    Columns named ``<question_number>_<answer>`` are selected (write-in columns are
    always skipped), relabelled for display, summed column-wise and converted to
    percentages of ``len(df)``.

    Args:
        df: Responses of the analysed group.
        question_number: Numeric prefix of the question's columns.
        include_answers: Only consulted when ``include_aggregates`` is set; when
            False just the ``aggr_`` columns are kept.
        include_aggregates: Keep the ``aggr_`` columns (relabelled "łącznie: ...").
        remove_underscores: Replace underscores in labels with spaces.

    Returns:
        The percentages and ``__count`` entries produced by ``calculate_percentages``.
    """
    print('Extracting question', question_number)

    if not include_aggregates:
        infix = '(?!aggr)'
    elif include_answers:
        infix = ''
    else:
        infix = 'aggr_'
    selected = df.filter(regex='^%s_%s(?!_writein)' % (question_number, infix))

    prefix_length = len(str(question_number)) + 1  # "<number>_"
    underscore_substitute = ' ' if remove_underscores else '_'
    labels = []
    for column in selected.columns:
        label = column[prefix_length:]
        label = label.replace('aggr_', 'łącznie: ')
        label = label.replace('_', underscore_substitute)
        # Only reachable when underscores are kept: the trailing one becomes an asterisk.
        label = label.replace('łącznie: trans_', 'łącznie: trans*')
        labels.append(label)
    selected.columns = labels

    return calculate_percentages(df, selected.sum())
def extract_question_single(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
    """Summarise a single-choice question stored in the column ``"<question_number>_"``.

    The question's columns are selected with the same pattern as in
    ``extract_question``; the rows are then counted per distinct value of the
    ``"<question_number>_"`` column and converted to percentages of ``len(df)``.

    Args:
        df: Responses of the analysed group.
        question_number: Numeric prefix of the question's columns.
        include_answers: Only consulted when ``include_aggregates`` is set.
        include_aggregates: Keep the ``aggr_`` columns in the selection.

    Returns:
        The percentages and ``__count`` entries produced by ``calculate_percentages``.
    """
    print('Extracting question', question_number)

    if include_aggregates:
        infix = '' if include_answers else 'aggr_'
    else:
        infix = '(?!aggr)'
    pattern = '^%s_%s(?!_writein)' % (question_number, infix)

    answer_column = f'{question_number}_'
    matching = df.filter(regex=pattern)
    answer_counts = matching.groupby(answer_column)[answer_column].count()
    return calculate_percentages(df, answer_counts)
def extract_question_single_sentiment(
    df: pd.DataFrame,
    question_number: int,
    sentiment_map: Dict[str, int],
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
    """Collapse the answers of a single-choice question into sentiment classes.

    Args:
        df: Responses of the analysed group.
        question_number: Numeric prefix of the question's column.
        sentiment_map: Answer text -> 1 (positive), 0 (neutral) or -1 (negative).
        include_answers: Forwarded to ``extract_question_single``.
        include_aggregates: Forwarded to ``extract_question_single``.

    Returns:
        A Series indexed ``positive``, ``neutral``, ``negative`` and ``non-positive``
        (neutral + negative) with the summed percentages, one decimal place.
        A class that no respondent picked is reported as 0.0.
    """
    question_series = extract_question_single(df, question_number, include_answers, include_aggregates)

    # Labels absent from the map — including every "<answer>__count" entry — become NaN
    # and are dropped by groupby, so only mapped percentages are summed.
    question_series.index = question_series.index.map(sentiment_map)
    by_sentiment = question_series.groupby(question_series.index).sum()

    def share(score: int) -> float:
        # Small groups may lack a whole sentiment class; treat it as 0 instead of raising KeyError.
        if score in by_sentiment.index:
            return float(by_sentiment.loc[score])
        return 0.0

    positive, neutral, negative = share(1), share(0), share(-1)
    return pd.Series(
        data=[
            round(positive, 1),
            round(neutral, 1),
            round(negative, 1),
            round(neutral + negative, 1),
        ],
        index=['positive', 'neutral', 'negative', 'non-positive'],
    )
def rename_index(data: Union[pd.DataFrame, pd.Series], new_index: List[str]) -> Union[pd.DataFrame, pd.Series]:
    """Return ``data`` relabelled with ``new_index`` along its rows; the input is left untouched.

    Args:
        data: Series or DataFrame to relabel.
        new_index: Replacement row labels, one per existing row.

    Returns:
        A relabelled copy. For a plain Series the labels are additionally stored
        in a ``columns`` attribute on the copy.
    """
    if type(data) is not pd.Series:
        return data.set_axis(labels=new_index, axis=0)

    relabelled = data.copy()
    relabelled.columns = new_index
    relabelled.index = new_index
    return relabelled
def generate_graph(
    data: Union[pd.DataFrame, pd.Series],
    group: str,
    name: str,
    title: str,
    show: bool = False
):
    """Render ``data`` as a bar chart and save it as ``<outputDir>/<group>/<name>.html``.

    Args:
        data: Percentages to plot, indexed by answer label. A Series gives a single
            series of bars; a DataFrame with more than one column gives grouped bars
            with a legend. Entries whose label ends in ``__count`` are not plotted.
        group: Sub-directory of ``outputDir`` that receives the file.
        name: File name without extension; ``'age'`` additionally gets fixed x-axis ticks.
        title: Chart title.
        show: Open the figure in the browser once written (also forced by ``openFigs``).
    """
    print('Generating graph', group, name)

    # Only percentages are charted; the raw "<answer>__count" entries are dropped.
    if isinstance(data, pd.DataFrame):
        data = data.loc[[idx for idx in data.index if not str(idx).endswith('__count')]]
    elif isinstance(data, pd.Series):
        data = data[[not str(k).endswith('__count') for k in data.index]]

    is_multi = isinstance(data, pd.DataFrame) and len(data.columns) > 1

    # A chart made solely of aggregates does not need the "łącznie: " prefix on every bar.
    if all(isinstance(c, str) and c.startswith('łącznie: ') for c in data.index):
        data = rename_index(data, [c.replace('łącznie: ', '') for c in data.index])

    # Shorter labels for the name-choice question.
    # NOTE(review): applied by position, so this assumes exactly six rows in this order — confirm.
    if 'nic, używam imienia nadanego mi przez rodziców' in data.index:
        data = rename_index(data, [
            'nadane – ale przeciwna płeć',
            'nadane – ale wersja unisex',
            'nadane – bez zmian',
            'wybrane – rzeczownik',
            'wybrane – binarne',
            'wybrane – unisex',
        ])

    # Transition charts follow the order of transition_sentiment; rows left with NaN are dropped.
    if 'zamierzam zacząć w przyszłości' in data.index:
        data = data.reindex(transition_sentiment.keys()).dropna()

    fig = px.bar(
        data,
        color_discrete_sequence=colours_multi if is_multi else colours,
        barmode='group',
        template='plotly_white',
    )
    fig.update_layout(
        showlegend=is_multi,
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title=''),
        title=title,
        xaxis=None,
        yaxis=None,
        font=dict(family=fontFamily, size=14),
    )
    # Pin explicit tick values on the age histogram so the age axis is easier to read.
    if name == 'age':
        fig.update_layout(
            xaxis=dict(tickvals=[13, 20, 30, 40, 50, 60, 70]),
        )

    # NOTE(review): the template literal used to be split across physical lines (a syntax
    # error). Line breaks are now explicit <br> tags and <extra></extra> hides Plotly's
    # secondary hover box — confirm this matches the intended markup.
    hover_lines = ['%{x}', '%{y:.2f}%']
    if is_multi:
        hover_lines.append('%{meta}')
    hover_template = '<br>'.join(hover_lines) + '<extra></extra>'

    for trace in fig.select_traces():
        trace.update(
            hovertemplate=hover_template,
            meta=trace.offsetgroup,
            hoverlabel_font=dict(family=fontFamily, size=12),
            # The overall series starts hidden; it can be toggled from the legend.
            visible='legendonly' if trace.name == 'Ogół' else True,
        )

    file_path = outputDir / group / (name + '.html')
    file_path.parent.mkdir(parents=True, exist_ok=True)
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')

    # Splice graphHead into the document head. The exported page declares UTF-8, so it is
    # read and written with that encoding explicitly rather than the platform default.
    # An empty search pattern would insert the snippet between every character, hence
    # the explicit </head> anchor.
    with open(file_path, 'r', encoding='utf-8') as fr:
        content = fr.read().replace('</head>', graphHead + '</head>', 1)
    with open(file_path, 'w', encoding='utf-8') as fw:
        fw.write(content)
def percent(value: int, size: int, precision: int = 2) -> float:
    """Return ``value`` as a percentage of ``size``, rounded to ``precision`` decimals.

    An empty base (``size == 0``) yields 0.0 instead of raising ZeroDivisionError,
    so statistics of an empty group can still be produced.
    """
    if size == 0:
        return 0.0
    return round(100 * value / size, precision)
def ensure_empty_dir(dir: Path) -> Path:
    """Make ``dir`` exist and be empty, discarding whatever it held before.

    Args:
        dir: Directory to (re)create; missing parents are created as well.

    Returns:
        The same path, for convenient chaining.
    """
    already_present = os.path.exists(dir)
    if already_present:
        # Start from a clean slate: remove the whole tree, then recreate it below.
        shutil.rmtree(dir)

    os.makedirs(dir, exist_ok=True)
    return dir
def _diff_with_previous(current: dict, previous: dict) -> dict:
    """Return the change of every dict-valued stat in ``current`` against ``previous``, one decimal place."""
    diff = {}
    for key, values in current.items():
        if not isinstance(values, dict) or key == 'diff':
            continue
        previous_values = previous.get(key)
        if not isinstance(previous_values, dict):
            continue
        diff[key] = {}
        for entry, value in values.items():
            # Keys loaded from JSON are always strings, while freshly computed stats may be
            # keyed by ints (the age histogram) — fall back to the string form of the key.
            previous_entry = entry if entry in previous_values else str(entry)
            if previous_entry not in previous_values:
                continue
            diff[key][entry] = round(value - previous_values[previous_entry], 1)
    return diff


def analyse(group: str, df: pd.DataFrame, full_df: pd.DataFrame, echo: bool = False, diffs: Union[List[int], None] = None):
    """Compute all statistics of one respondent group and write them to ``<outputDir>/<group>/stats.json``.

    Args:
        group: Name of the group; also the name of its (emptied and recreated) output directory.
        df: Responses belonging to the group. The caller's frame is not modified.
        full_df: All responses; only its length is used, as the base of ``size_subset_percent``.
        echo: Also print the resulting JSON.
        diffs: Earlier editions (years) to diff against; an edition without a
            ``stats.json`` for this group is skipped.

    Returns:
        The statistics as a dict of scalars, dicts and pandas Series (before JSON conversion).
    """
    ensure_empty_dir(outputDir / group)

    # Work on a private copy so the clean-up below never leaks into the caller's frame.
    df = df.copy()
    df['18_'] = df['18_'].str.rstrip('.')
    # NOTE(review): with empty patterns these replacements are no-ops as written —
    # presumably they once stripped markup from the answers; confirm the intended patterns.
    for column in ('25_', '28_', '29_', '30_'):
        df[column] = df[column].str.replace('', '').str.replace('', '')

    df_plural = df[df['7_aggr_mnogie'] == 1]
    df_neuter = df[df['7_rodzaj neutralny'] == 1]
    df_transition = df[df['24_'] == 'tak']
    df_english = df[df['19_nie znam / nie używam angielskiego'] != 1].drop('19_nie znam / nie używam angielskiego', axis=1)
    df_attraction_split = df[df['21_'] == 'tak']

    age = df['age']
    age_stats = {
        'avg': round(age.mean(), 1),
        'median': round(age.median(), 1),
        'std': round(age.std(), 1),
    }
    # Each age bracket contributes its share of the group followed by its head count.
    for label, bracket in (
        ('adults', age >= 18),
        ('under_30', age < 30),
        ('over_30', age >= 30),
        ('under_25', age < 25),
        ('over_25', age >= 25),
    ):
        bracket_size = int(bracket.sum())
        age_stats[label] = percent(bracket_size, len(df))
        age_stats[f'{label}_count'] = bracket_size

    stats = {
        'size': len(df),
        'size_subset_percent': percent(len(df), len(full_df)),
        'size_plural': len(df_plural),
        'size_plural_percent': percent(len(df_plural), len(df)),
        'size_neuter': len(df_neuter),
        'size_neuter_percent': percent(len(df_neuter), len(df)),
        'size_transition': len(df_transition),
        'size_transition_subset_percent': percent(len(df_transition), len(df)),
        'size_english': len(df_english),
        'age': build_ages_histogram(df),
        'ageStats': age_stats,
        'neuter': extract_question(df, 6),
        'neuterByUsers': extract_question(df_neuter, 6),
        'pronounGroups': extract_question(df, 7),
        'pronounGroupsAggr': extract_question(df, 7, include_answers=False, include_aggregates=True),
        'pluralNouns': extract_question_single(df_plural, 8),
        'pluralNonGendered': extract_question_single(df_plural, 9),
        'pronouns': extract_question(df, 10),
        'pronounsAggr': extract_question(df, 10, include_answers=False, include_aggregates=True),
        'nouns': extract_question(df, 11),
        'honorifics': extract_question(df, 12, include_aggregates=True),
        'obstacles': extract_question(df, 13),
        'groups': extract_question(df, 14),
        'reasons': extract_question(df, 15),
        'namesGender': extract_question_single(df, 16),  # TODO missing aggregate
        'names': extract_question_single(df, 17),
        'namesAggr': extract_question(df, 17, include_answers=False, include_aggregates=True),
        'namesDeclension': extract_question_single(df, 18),
        'english': extract_question(df, 19, include_aggregates=True),  # TODO use df_english
        'labelsGender': extract_question(df, 20, include_aggregates=True, remove_underscores=False),
        'labelsAttractionSplit': extract_question_single(df, 21),
        'labelsSexuality': extract_question(df, 22, include_aggregates=True, remove_underscores=False),
        'labelsRomantic': extract_question(df_attraction_split, 23, include_aggregates=True, remove_underscores=False),
        'transtionAnswered': extract_question_single(df, 24),
        'transitionSocial': extract_question_single(df_transition, 25),
        'transitionSocialSentiment': extract_question_single_sentiment(df_transition, 25, sentiment_map=transition_sentiment),
        'transitionName': extract_question_single(df_transition, 26),
        'transitionNameSentiment': extract_question_single_sentiment(df_transition, 26, sentiment_map=transition_sentiment),
        'transitionMarker': extract_question_single(df_transition, 27),
        'transitionMarkerSentiment': extract_question_single_sentiment(df_transition, 27, sentiment_map=transition_sentiment),
        'transitionPhysical': extract_question_single(df_transition, 28),
        'transitionPhysicalSentiment': extract_question_single_sentiment(df_transition, 28, sentiment_map=transition_sentiment),
        'transitionHormonal': extract_question_single(df_transition, 29),
        'transitionHormonalSentiment': extract_question_single_sentiment(df_transition, 29, sentiment_map=transition_sentiment),
        'transitionSurgical': extract_question_single(df_transition, 30),
        'transitionSurgicalSentiment': extract_question_single_sentiment(df_transition, 30, sentiment_map=transition_sentiment),
    }

    # Series are not JSON-serialisable; convert them to plain dicts.
    stats_json = {
        k: v.to_dict() if isinstance(v, pd.Series) else v
        for k, v
        in stats.items()
    }

    stats_json['diff'] = {}
    for prev_year in (diffs or []):
        prev_year_key = f'spis-{prev_year}'
        file_path = outputDir.parent / prev_year_key / group / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            prev_stats = json.load(f)
        stats_json['diff'][prev_year_key] = _diff_with_previous(stats_json, prev_stats)

    serialised = json.dumps(stats_json, indent=4)
    if echo:
        print('--- Group: %s ---' % group)
        print(serialised)
    with open(outputDir / group / 'stats.json', 'w', encoding='utf-8') as f:
        f.write(serialised + '\n')

    return stats
def build_ages_histogram(df: pd.DataFrame) -> pd.Series:
    """Build the age distribution of the group as percentages.

    Only positive values of the ``age`` column are counted (missing ages are skipped).

    Args:
        df: Responses with an ``age`` column.

    Returns:
        A Series indexed by every integer age from the youngest to the oldest
        respondent (ages nobody reported get 0), holding the share of respondents
        of that age rounded to three decimals. Empty when no valid age is present.
    """
    ages = [int(a) for a in df['age'].to_list() if a > 0]
    if not ages:
        # min()/max() would raise ValueError on an empty group.
        return pd.Series(dtype=float)

    # Pre-fill the whole range so that gaps show up as explicit zeros.
    ages_hist = dict.fromkeys(range(min(ages), max(ages) + 1), 0)
    for age in ages:
        ages_hist[age] += 1

    total = len(ages)
    return pd.Series({
        age: percent(count, total, 3)
        for age, count
        in ages_hist.items()
    })
def generate_yearly_comparison(data: dict, show: bool = False):
    """Plot year-over-year trends as a line chart saved to ``<outputDir>/year_by_year.html``.

    Args:
        data: Column-oriented table: a ``"Form"`` list naming each series plus one
            list of percentages per year, keyed by the year.
        show: Open the figure in the browser once written (also forced by ``openFigs``).
    """
    df = pd.DataFrame(data)
    # Long format: one row per (form, year) pair.
    df_long = df.melt(id_vars="Form", var_name="Year", value_name="Percentage")

    fig = px.line(
        df_long,
        x="Year", y="Percentage", color="Form",
        title="Trendy na przestrzeni lat",
        template="plotly_white",
        markers=True,
    )
    # The hover template used to be split across physical lines (a syntax error);
    # the line break is now an explicit <br> tag.
    fig.update_traces(
        line=dict(width=4),
        marker=dict(size=8),
        hovertemplate='%{x}<br>%{y:.2f}%',
    )

    # The legend is hidden, so each line is labelled next to its most recent data point.
    df_last_year = df_long[df_long["Year"] == df_long["Year"].max()]
    for _, row in df_last_year.iterrows():
        # This one label goes below its point — presumably to keep it clear of a neighbour.
        position = "bottom left" if row["Form"] == "Neutratywy" else "top left"
        fig.add_trace(go.Scatter(
            x=[row["Year"]],
            y=[row["Percentage"]],
            text=[row["Form"]],
            mode="text",
            textposition=position,
            showlegend=False,
            hoverinfo="skip",
            textfont=dict(family=fontFamily, size=16),
        ))

    fig.update_layout(
        xaxis=None,
        yaxis_title="Procent (%)",
        showlegend=False,
        font=dict(family=fontFamily, size=14),
    )

    # The edition's folder may not exist yet when this is the first output written.
    outputDir.mkdir(parents=True, exist_ok=True)
    file_path = outputDir / 'year_by_year.html'
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')
if __name__ == '__main__':
    # Command line: -s/--show opens every generated figure, -e/--echo prints each group's stats JSON.
    # nargs='?' with const=True lets either flag be passed bare.
    parser = ArgumentParser()
    parser.add_argument('-s', '--show', dest='show', default=False, nargs='?', const=True)
    parser.add_argument('-e', '--echo', dest='echo', default=False, nargs='?', const=True)
    args = parser.parse_args()
    if args.show:
        openFigs = True

    # Load the export and keep only respondents who gave one of these two answers to question 0.
    df = pd.read_csv(inputDir / 'export.csv')
    df = df[df['0_'].isin(['osobą niebinarną', 'nie wiem'])]
    # Age is derived from column '3_' (presumably the year of birth — confirm);
    # ages above 100 are blanked out.
    df.loc[:, 'age'] = year - df['3_']
    df.loc[df['age'] > 100, 'age'] = None

    # Earlier editions that the current statistics are compared against.
    diffs = [2021, 2022, 2023, 2024]

    # Hand-maintained headline percentages per form and edition for the trend chart.
    generate_yearly_comparison({
        "Form": [
            "Rodzaj neutralny",
            "Rodzaj postpłciowy",
            "Wyłącznie formy binarne",
            "Wyłącznie formy niebinarne",
            "Neutratywy",
            "Osobatywy"
        ],
        "2021": [25.4, 8, 53.6, 8.4, 12, 66.7],
        "2022": [43.1, 11.8, 15.7, 15.5, 14.3, 65.4],
        "2023": [48, 9.3, 15.6, 18.2, 11.7, 67.5],
        "2024": [45.9, 6.2, 18, 19.4, 12.3, 68],
        "2025": [52.3, 8.6, 15.5, 18.9, 15.4, 74.4]
    })

    # Statistics of every respondent group; the full frame is always passed along as the
    # base for the group's share of all respondents.
    stats = {
        'general': analyse('general', df, df, args.echo, diffs),
        'location_poland': analyse('location_poland', df[df['4_'] == 'w Polsce'], df, args.echo, diffs),
        'location_abroad': analyse('location_abroad', df[df['4_'] == 'za granicą'], df, args.echo, diffs),
        'agab_f': analyse('agab_f', df[df['1_'] == 'żeńską'], df, args.echo, diffs),
        'agab_m': analyse('agab_m', df[df['1_'] == 'męską'], df, args.echo, diffs),
        # 'agab_x': analyse('agab_x', df[df['1_'] == 'inną (w jurysdykcjach, gdzie to możliwe)'], df, args.echo, diffs),
        'younger': analyse('younger', df[df['age'] < 25], df, args.echo, diffs),
        'older': analyse('older', df[df['age'] >= 25], df, args.echo, diffs),
        'name_f': analyse('name_f', df[df['16_'] == 'żeńskie'], df, args.echo, diffs),
        'name_m': analyse('name_m', df[df['16_'] == 'męskie'], df, args.echo, diffs),
        'name_n': analyse('name_n', df[df['16_'] == 'neutralne płciowo'], df, args.echo, diffs),
    }

    # Comparison charts: output directory -> {group key: legend label}.
    comparisons = {
        'by_location': {
            'general': 'Ogół',
            'location_poland': 'Polska',
            'location_abroad': 'Zagranica',
        },
        'by_agab': {
            'general': 'Ogół',
            'agab_f': 'AFAB',
            'agab_m': 'AMAB',
        },
        'by_age': {
            'general': 'Ogół',
            'younger': 'Grupa młodsza',
            'older': 'Grupa starsza',
        },
        'by_name': {
            'name_f': 'Imię żeńskie',
            'name_m': 'Imię męskie',
            'name_n': 'Imię neutralne płciowo',
        },
    }

    # Charts to render: key in the stats dict -> chart title.
    graphs = {
        'age': 'Wiek osób respondenckich',
        'neuter': 'Preferowana nazwa rodzaju gramatycznego',
        'neuterByUsers': 'Preferowana nazwa rodzaju gramatycznego wśród osób go używających',
        'pronounGroups': 'Rodzaj gramatyczny używany w mowie',
        'pronouns': 'Zaimki używane w piśmie',
        'pluralNouns': 'Czy rzeczowniki również w liczbie mnogiej?',
        'pluralNonGendered': 'Czy nieupłciowione formy również w liczbie mnogiej?',
        'pronounsAggr': 'Zaimki używane w piśmie (zgrupowane)',
        'nouns': 'Rzeczowniki',
        'honorifics': 'Formy grzecznościowe',
        'obstacles': 'Dlaczego nie formy niebinarne?',
        'reasons': 'Co wpływa na wybór form?',
        'groups': 'Formy do opisu grup mieszanych',
        'namesGender': 'Upłciowienie używanego imienia',
        'names': 'Wybór używanego imienia',
        'namesDeclension': 'Odmiana używanego imienia',
        'english': 'Zaimki w języku angielskim',
        'labelsGender': 'Etykietki opisujące płeć',
        'labelsSexuality': 'Etykietki opisujące orientację seksualną',
        'labelsRomantic': 'Etykietki opisujące orientację romantyczną',
        'transitionSocial': 'Tranzycja społeczna',
        'transitionName': 'Tranzycja prawna – imię',
        'transitionMarker': 'Tranzycja prawna – znacznik płci',
        'transitionPhysical': 'Tranzycja medyczna – bindery, packery, itp.',
        'transitionHormonal': 'Tranzycja medyczna – hormony',
        'transitionSurgical': 'Tranzycja medyczna – zmiany chirurgiczne',
    }

    # One chart per (group, graph) pair.
    for group, group_stats in stats.items():
        for graph, graph_label in graphs.items():
            generate_graph(group_stats[graph], group, graph, graph_label)

    # Side-by-side charts: one DataFrame column per compared group.
    for comparison_key, comparison_groups in comparisons.items():
        ensure_empty_dir(outputDir / comparison_key)
        for graph, graph_label in graphs.items():
            # The by-name comparison is only produced for the name declension chart.
            if comparison_key == 'by_name' and graph != 'namesDeclension':
                continue
            data = pd.DataFrame({
                groupLabel: stats[group][graph]
                for group, groupLabel
                in comparison_groups.items()
            })
            generate_graph(data, comparison_key, graph, graph_label)

    # Year-by-year charts, built from the 'general' stats.json of every edition found on disk
    # (including the one just written for the current year).
    by_year = {}
    for prev_year in [*diffs, year]:
        file_path = outputDir.parent / f'spis-{prev_year}' / 'general' / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r') as f:
            by_year[prev_year] = json.load(f)
    ensure_empty_dir(outputDir / 'by_year')
    for graph, graph_label in graphs.items():
        # Editions that lack this stat are simply left out of the chart.
        data = pd.DataFrame({
            column_year: year_data[graph]
            for column_year, year_data
            in by_year.items()
            if graph in year_data
        })
        generate_graph(data, 'by_year', graph, graph_label)

    # Dump the free-text answers (write-in columns plus column '31_') as frequency tables,
    # one TSV per column, most frequent first. Unlike the chart folders this directory is
    # not emptied beforehand.
    write_ins_dir = outputDir / 'write_ins'
    write_ins_dir.mkdir(parents=True, exist_ok=True)
    for column in df.columns:
        if not column.endswith('__writein') and column != '31_':
            continue
        print(f'Extracting write-ins for question {column}')
        writeins = df[column].dropna().value_counts().reset_index()
        writeins.columns = ['write-in', 'count']
        writeins = writeins[['count', 'write-in']]
        writeins = writeins.sort_values(by=['count'], ascending=False)
        writeins.to_csv(write_ins_dir / f'{column}.tsv', index=False, sep='\t')