"""Analyse the yearly survey export and render statistics and graphs.

Run as a script, this module reads ``input/export.csv`` next to this file,
computes per-group statistics, writes them as ``stats.json`` files and renders
plotly graphs as HTML files below ``outputDir``.
"""
import json
import os
import shutil
from argparse import ArgumentParser
from datetime import datetime
from pathlib import Path
from typing import Union, List, Dict, Optional

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

year = datetime.now().year

projectDir = Path(__file__).parent
inputDir = projectDir / 'input'
outputDir = projectDir.parent / 'locale' / 'pl' / 'docs' / f'spis-{year}'

# When True every generated figure is opened after being written
# (switched on by the --show command line flag).
openFigs = False

colours = ['#c71585']
colours_multi = ['#c71585', '#8b0f7a', '#15c79c', '#20a0d7']

# analyse() assigns to columns of filtered frames; silence pandas' warning.
pd.options.mode.chained_assignment = None

fontFamily = 'Nunito, "Open Sans", sans-serif'

# Markup injected into the <head> of every generated graph page.
# NOTE(review): the literal is effectively blank here; it looks as if its
# original content was lost - confirm against version control.
graphHead = ''' '''

# Maps an answer of the transition questions to a sentiment bucket:
# 1 = positive, 0 = neutral, -1 = negative.
# The key order is also used by generate_graph() to order the bars.
transition_sentiment = {
    'już przeszł_m proces': 1,
    'jestem w trakcie': 1,
    'stosuję microdosing': 1,
    'w części miejsc tak, w części nie': 1,
    'zamierzam zacząć w przyszłości': 1,
    'zamierzam przejść w przyszłości': 1,
    'już przeszł_m proces zmiany na opcję neutraną': 1,
    'tak': 1,
    'jestem w trakcie procesu zmiany na opcję neutraną': 1,
    'jestem w trakcie procesu zmiany na drugi znacznik binarny': 1,
    'zamierzam przejść w przyszłości, tylko jeśli w Polsce będzie możliwe usunięcie znacznika': 1,
    'zamierzam przejść w przyszłości, tylko jeśli w Polsce będzie dostępna opcja neutralna': 1,
    'zamierzam przejść w przyszłości, na znacznik binarny': 1,
    'już przeszł_m proces zmiany na drugi znacznik binarny': 1,
    'przeszł_m część z zabiegów, które chcę przejść': 1,
    'przyjmuję hormony': 1,
    # NOTE(review): repeated key (same value, so harmless) - kept as found.
    'stosuję microdosing': 1,
    'przechodzę, we wszystkich obszarach życia': 1,
    'przeszł_m już wszystkie zabiegi, które chcę przejść': 1,

    'jeszcze nie wiem': 0,
    'nie chcę odpowiadać': 0,

    'nie zamierzam': -1,
    'przechodzę/przeszł_m detranzycję': -1,
    # NOTE(review): repeated key (same value, so harmless) - kept as found.
    'przechodzę/przeszł_m detranzycję': -1,
}


def calculate_percentages(df: pd.DataFrame, counts: pd.Series) -> pd.Series:
    """Turn absolute counts into percentages of ``len(df)``.

    Returns a Series holding, for every label ``k`` of ``counts``, the
    percentage rounded to one decimal under ``k`` and the raw count under
    ``f"{k}__count"``.
    """
    total = len(df)
    percentages = counts.apply(lambda x: round(100 * x / total, 1))

    return pd.Series({
        **{k: percentages[k] for k in percentages.index},
        **{f"{k}__count": counts[k] for k in counts.index},
    })


def _select_question_columns(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool,
    include_aggregates: bool,
) -> pd.DataFrame:
    """Return the columns of ``df`` belonging to one question (no write-ins)."""
    if include_aggregates:
        # Either every column of the question, or only the ``aggr_`` ones.
        suffix = '' if include_answers else 'aggr_'
    else:
        suffix = '(?!aggr)'

    return df.filter(regex=f'^{question_number}_{suffix}(?!_writein)')


def extract_question(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False,
    remove_underscores: bool = True
) -> pd.Series:
    """Summarise a multiple-choice question stored as one column per answer.

    The question prefix is stripped from the column names, ``aggr_`` becomes
    ``łącznie: `` and underscores become spaces unless ``remove_underscores``
    is False. The column sums are then passed to ``calculate_percentages``.
    """
    print('Extracting question', question_number)

    question_series = _select_question_columns(df, question_number, include_answers, include_aggregates)

    prefix_length = len(str(question_number)) + 1  # "<number>_"
    underscore_replacement = ' ' if remove_underscores else '_'
    question_series.columns = [
        c[prefix_length:]
        .replace('aggr_', 'łącznie: ')
        .replace('_', underscore_replacement)
        # Only reachable when underscores were kept by the previous step.
        .replace('łącznie: trans_', 'łącznie: trans*')
        for c in question_series.columns
    ]

    return calculate_percentages(df, question_series.sum())


def extract_question_single(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
    """Summarise a single-choice question stored in the ``<number>_`` column.

    Counts how often every distinct value of that column occurs and passes
    the counts to ``calculate_percentages``.
    """
    print('Extracting question', question_number)

    question_series = _select_question_columns(df, question_number, include_answers, include_aggregates)
    answer_column = f'{question_number}_'

    return calculate_percentages(df, question_series.groupby(answer_column)[answer_column].count())


def extract_question_single_sentiment(
    df: pd.DataFrame,
    question_number: int,
    sentiment_map: Dict[str, int],
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
    """Aggregate a single-choice question into sentiment buckets.

    Every answer is mapped through ``sentiment_map`` (1, 0 or -1) and the
    percentages of answers sharing a bucket are summed. Index labels that are
    not in the map (including the ``__count`` entries) become NaN and are
    dropped by the group-by.

    Returns a Series indexed by ``positive``, ``neutral``, ``negative`` and
    ``non-positive`` (neutral + negative), rounded to one decimal. A bucket
    without any answers counts as 0 instead of raising ``KeyError``.
    """
    question_series = extract_question_single(df, question_number, include_answers, include_aggregates)
    question_series.index = question_series.index.map(sentiment_map)
    question_series = question_series.groupby(question_series.index).sum()

    positive = question_series.get(1, 0)
    neutral = question_series.get(0, 0)
    negative = question_series.get(-1, 0)

    return pd.Series(
        data=[
            round(positive, 1),
            round(neutral, 1),
            round(negative, 1),
            round(neutral + negative, 1),
        ],
        index=['positive', 'neutral', 'negative', 'non-positive'],
    )


def rename_index(data: Union[pd.DataFrame, pd.Series], new_index: List[str]) -> Union[pd.DataFrame, pd.Series]:
    """Return a copy of ``data`` whose row labels are replaced by ``new_index``."""
    return data.set_axis(labels=new_index, axis=0)


def generate_graph(
    data: Union[pd.DataFrame, pd.Series],
    group: str,
    name: str,
    title: str,
    show: bool = False
):
    """Render ``data`` as a bar chart and write it to ``outputDir/group/name.html``.

    Rows whose label ends with ``__count`` are left out of the chart. A
    DataFrame with more than one column is drawn as grouped bars with a
    legend. The figure is opened after writing when ``show`` or the
    module-level ``openFigs`` flag is set.
    """
    print('Generating graph', group, name)

    if isinstance(data, pd.DataFrame):
        data = data.loc[[idx for idx in data.index if not str(idx).endswith('__count')]]
    elif isinstance(data, pd.Series):
        data = data[[not str(k).endswith('__count') for k in data.index]]

    is_multi = isinstance(data, pd.DataFrame) and len(data.columns) > 1

    # When every label carries the aggregate prefix, the prefix is redundant.
    if all(isinstance(c, str) and c.startswith('łącznie: ') for c in data.index):
        data = rename_index(data, [c.replace('łącznie: ', '') for c in data.index])

    # Shorter labels for the "names" question; relies on exactly six rows.
    if 'nic, używam imienia nadanego mi przez rodziców' in data.index:
        data = rename_index(data, [
            'nadane – ale przeciwna płeć',
            'nadane – ale wersja unisex',
            'nadane – bez zmian',
            'wybrane – rzeczownik',
            'wybrane – binarne',
            'wybrane – unisex',
        ])

    # Transition questions: order the bars like transition_sentiment and drop
    # the answers that do not occur.
    if 'zamierzam zacząć w przyszłości' in data.index:
        data = data.reindex(transition_sentiment.keys()).dropna()

    fig = px.bar(
        data,
        color_discrete_sequence=colours_multi if is_multi else colours,
        barmode='group',
        template='plotly_white',
    )

    fig.update_layout(
        showlegend=is_multi,
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title=''),
        title=title,
        xaxis=None,
        yaxis=None,
        font=dict(family=fontFamily, size=14),
    )

    # Explicit tick values make the age axis easier to read.
    if name == 'age':
        fig.update_layout(
            xaxis=dict(tickvals=[13, 20, 30, 40, 50, 60, 70]),
        )

    # Plotly renders hover text as HTML, so line breaks have to be <br>; the
    # raw newline previously inside this literal did not even parse.
    # NOTE(review): a trailing '<extra></extra>' may have been lost as well -
    # confirm against version control.
    hover_template = '%{x}<br>%{y:.2f}%'
    if is_multi:
        hover_template += '<br>%{meta}'

    for trace in fig.select_traces():
        trace.update(
            hovertemplate=hover_template,
            meta=trace.offsetgroup,
            hoverlabel_font=dict(family=fontFamily, size=12),
            # The overall series starts hidden and can be enabled in the legend.
            visible='legendonly' if trace.name == 'Ogół' else True,
        )

    file_path = outputDir / group / (name + '.html')
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')

    # Inject graphHead right before the closing head tag. The previous empty
    # search string inserted graphHead between every character of the page.
    # The page declares charset utf-8, so read and write it as UTF-8.
    content = file_path.read_text(encoding='utf-8')
    content = content.replace('</head>', graphHead + '</head>', 1)
    file_path.write_text(content, encoding='utf-8')


def percent(value: int, size: int, precision: int = 2) -> float:
    """Return ``value`` as a percentage of ``size`` rounded to ``precision``."""
    return round(100 * value / size, precision)


def ensure_empty_dir(dir: Path) -> Path:
    """Delete ``dir`` if it exists, recreate it empty and return it."""
    if os.path.exists(dir):
        shutil.rmtree(dir)
    os.makedirs(dir, exist_ok=True)
    return dir


def analyse(group: str, df: pd.DataFrame, full_df: pd.DataFrame, echo: bool = False, diffs: Optional[List[int]] = None):
    """Compute the statistics of one respondent group and write its stats.json.

    Args:
        group: Name of the output sub-directory; it is emptied first.
        df: Rows belonging to the group. The caller's frame is not modified.
        full_df: All rows; only its length is used, for the subset share.
        echo: Print the resulting JSON when True.
        diffs: Previous years to diff against. A year is skipped when its
            ``spis-<year>/<group>/stats.json`` does not exist.

    Returns:
        The statistics dictionary (pandas Series are not converted).
    """
    ensure_empty_dir(outputDir / group)

    # Work on a private copy so the clean-ups below never touch the caller's data.
    df = df.copy()

    df['18_'] = df['18_'].str.rstrip('.')
    # NOTE(review): both search patterns are empty, so these calls change
    # nothing; they look like they once stripped markup - confirm and restore.
    for column in ('25_', '28_', '29_', '30_'):
        df[column] = df[column].str.replace('', '').str.replace('', '')

    df_plural = df[df['7_aggr_mnogie'] == 1]
    df_neuter = df[df['7_rodzaj neutralny'] == 1]
    df_transition = df[df['24_'] == 'tak']
    df_english = df[df['19_nie znam / nie używam angielskiego'] != 1].drop('19_nie znam / nie używam angielskiego', axis=1)
    df_attraction_split = df[df['21_'] == 'tak']

    size = len(df)
    adults_count = len(df[df['age'] >= 18])
    under_30_count = len(df[df['age'] < 30])
    over_30_count = len(df[df['age'] >= 30])
    under_25_count = len(df[df['age'] < 25])
    over_25_count = len(df[df['age'] >= 25])

    stats = {
        'size': size,
        'size_subset_percent': percent(size, len(full_df)),
        'size_plural': len(df_plural),
        'size_plural_percent': percent(len(df_plural), size),
        'size_neuter': len(df_neuter),
        'size_neuter_percent': percent(len(df_neuter), size),
        'size_transition': len(df_transition),
        'size_transition_subset_percent': percent(len(df_transition), size),
        'size_english': len(df_english),
        'age': pd.Series(build_ages_histogram(df)),
        'ageStats': {
            'avg': round(df['age'].mean(), 1),
            'median': round(df['age'].median(), 1),
            'std': round(df['age'].std(), 1),
            'adults': percent(adults_count, size),
            'adults_count': adults_count,
            'under_30': percent(under_30_count, size),
            'under_30_count': under_30_count,
            'over_30': percent(over_30_count, size),
            'over_30_count': over_30_count,
            'under_25': percent(under_25_count, size),
            'under_25_count': under_25_count,
            'over_25': percent(over_25_count, size),
            'over_25_count': over_25_count,
        },
        'neuter': extract_question(df, 6),
        'neuterByUsers': extract_question(df_neuter, 6),
        'pronounGroups': extract_question(df, 7),
        'pronounGroupsAggr': extract_question(df, 7, include_answers=False, include_aggregates=True),
        'pluralNouns': extract_question_single(df_plural, 8),
        'pluralNonGendered': extract_question_single(df_plural, 9),
        'pronouns': extract_question(df, 10),
        'pronounsAggr': extract_question(df, 10, include_answers=False, include_aggregates=True),
        'nouns': extract_question(df, 11),
        'honorifics': extract_question(df, 12, include_aggregates=True),
        'obstacles': extract_question(df, 13),
        'groups': extract_question(df, 14),
        'reasons': extract_question(df, 15),
        'namesGender': extract_question_single(df, 16),  # TODO missing aggregate
        'names': extract_question_single(df, 17),
        'namesAggr': extract_question(df, 17, include_answers=False, include_aggregates=True),
        'namesDeclension': extract_question_single(df, 18),
        'english': extract_question(df, 19, include_aggregates=True),  # TODO use df_english
        'labelsGender': extract_question(df, 20, include_aggregates=True, remove_underscores=False),
        'labelsAttractionSplit': extract_question_single(df, 21),
        'labelsSexuality': extract_question(df, 22, include_aggregates=True, remove_underscores=False),
        'labelsRomantic': extract_question(df_attraction_split, 23, include_aggregates=True, remove_underscores=False),
        # (sic) the misspelled key is part of the emitted JSON.
        'transtionAnswered': extract_question_single(df, 24),
        'transitionSocial': extract_question_single(df_transition, 25),
        'transitionSocialSentiment': extract_question_single_sentiment(df_transition, 25, sentiment_map=transition_sentiment),
        'transitionName': extract_question_single(df_transition, 26),
        'transitionNameSentiment': extract_question_single_sentiment(df_transition, 26, sentiment_map=transition_sentiment),
        'transitionMarker': extract_question_single(df_transition, 27),
        'transitionMarkerSentiment': extract_question_single_sentiment(df_transition, 27, sentiment_map=transition_sentiment),
        'transitionPhysical': extract_question_single(df_transition, 28),
        'transitionPhysicalSentiment': extract_question_single_sentiment(df_transition, 28, sentiment_map=transition_sentiment),
        'transitionHormonal': extract_question_single(df_transition, 29),
        'transitionHormonalSentiment': extract_question_single_sentiment(df_transition, 29, sentiment_map=transition_sentiment),
        'transitionSurgical': extract_question_single(df_transition, 30),
        'transitionSurgicalSentiment': extract_question_single_sentiment(df_transition, 30, sentiment_map=transition_sentiment),
    }

    stats_json = {
        k: v.to_dict() if isinstance(v, pd.Series) else v
        for k, v in stats.items()
    }

    # Differences against the stats of previous years, per shared key.
    # NOTE(review): 'age' has integer keys here but string keys once loaded
    # from JSON, so no age differences are ever produced - confirm intent.
    stats_json['diff'] = {}
    for prev_year in (diffs or []):
        prev_year_key = f'spis-{prev_year}'
        file_path = outputDir.parent / prev_year_key / group / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            prev_stats = json.load(f)

        year_diff = {}
        stats_json['diff'][prev_year_key] = year_diff
        for k, v in stats_json.items():
            if not isinstance(v, dict) or k == 'diff' or k not in prev_stats:
                continue
            year_diff[k] = {}
            for kk, vv in v.items():
                if kk not in prev_stats[k]:
                    continue
                year_diff[k][kk] = round(vv - prev_stats[k][kk], 1)

    stats_text = json.dumps(stats_json, indent=4)

    if echo:
        print('--- Group: %s ---' % group)
        print(stats_text)

    with open(outputDir / group / 'stats.json', 'w', encoding='utf-8') as f:
        f.write(stats_text + '\n')

    return stats


def build_ages_histogram(df: pd.DataFrame) -> pd.Series:
    """Return the share (in percent) of every age between the minimum and maximum.

    Only positive ages are counted (missing values fail the ``> 0`` test).
    Ages inside the range that nobody has are reported as 0. An empty Series
    is returned when no usable age exists, instead of failing on ``min()``.
    """
    ages = [int(a) for a in df['age'].to_list() if a > 0]
    if not ages:
        return pd.Series(dtype=float)

    ages_hist = {i: 0 for i in range(min(ages), max(ages) + 1)}
    for age in ages:
        ages_hist[age] += 1

    s = len(ages)

    return pd.Series({
        age: percent(count, s, 3)
        for age, count in ages_hist.items()
    })


def generate_yearly_comparison(data: dict, show: bool = False):
    """Plot the year-over-year trend lines and write ``year_by_year.html``.

    ``data`` needs a ``"Form"`` entry with the series names and one entry per
    year holding the matching percentages.
    """
    df = pd.DataFrame(data)

    # Reshape to long format
    df_long = df.melt(id_vars="Form", var_name="Year", value_name="Percentage")

    # Plot
    fig = px.line(
        df_long,
        x="Year",
        y="Percentage",
        color="Form",
        title="Trendy na przestrzeni lat",
        template="plotly_white",
        markers=True,
        #color_discrete_sequence=colours_multi,
    )
    fig.update_traces(line=dict(width=4), marker=dict(size=8))

    for trace in fig.select_traces():
        trace.update(
            # Hover text is HTML: the line break has to be <br>.
            hovertemplate='%{x}<br>%{y:.2f}%',
        )

    # Label every line at its last data point instead of using a legend.
    df_last_year = df_long[df_long["Year"] == df_long["Year"].max()]
    for _, row in df_last_year.iterrows():
        if row["Form"] == "Neutratywy":
            position = "bottom left"
        else:
            position = "top left"

        fig.add_trace(go.Scatter(
            x=[row["Year"]],
            y=[row["Percentage"]],
            # NOTE(review): the empty strings look like lost markup - confirm.
            text=["" + row["Form"] + ""],
            mode="text",
            textposition=position,
            showlegend=False,
            hoverinfo="skip",
            textfont=dict(family=fontFamily, size=16),
        ))

    fig.update_layout(
        xaxis=None,
        yaxis_title="Procent (%)",
        showlegend=False,
        #legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title=''),
        font=dict(family=fontFamily, size=14),
    )

    file_path = outputDir / 'year_by_year.html'
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('-s', '--show', dest='show', default=False, nargs='?', const=True)
    parser.add_argument('-e', '--echo', dest='echo', default=False, nargs='?', const=True)
    args = parser.parse_args()

    if args.show:
        openFigs = True

    df = pd.read_csv(inputDir / 'export.csv')
    df = df[df['0_'].isin(['osobą niebinarną', 'nie wiem'])]
    # Column 3_ is subtracted from the current year to obtain the age;
    # results above 100 are blanked out.
    df.loc[:, 'age'] = year - df['3_']
    df.loc[df['age'] > 100, 'age'] = None

    diffs = [2021, 2022, 2023, 2024]

    generate_yearly_comparison({
        "Form": [
            "Rodzaj neutralny",
            "Rodzaj postpłciowy",
            "Wyłącznie formy binarne",
            "Wyłącznie formy niebinarne",
            "Neutratywy",
            "Osobatywy"
        ],
        "2021": [25.4, 8, 53.6, 8.4, 12, 66.7],
        "2022": [43.1, 11.8, 15.7, 15.5, 14.3, 65.4],
        "2023": [48, 9.3, 15.6, 18.2, 11.7, 67.5],
        "2024": [45.9, 6.2, 18, 19.4, 12.3, 68],
        "2025": [52.3, 8.6, 15.5, 18.9, 15.4, 74.4]
    })

    stats = {
        'general': analyse('general', df, df, args.echo, diffs),
        'location_poland': analyse('location_poland', df[df['4_'] == 'w Polsce'], df, args.echo, diffs),
        'location_abroad': analyse('location_abroad', df[df['4_'] == 'za granicą'], df, args.echo, diffs),
        'agab_f': analyse('agab_f', df[df['1_'] == 'żeńską'], df, args.echo, diffs),
        'agab_m': analyse('agab_m', df[df['1_'] == 'męską'], df, args.echo, diffs),
        # 'agab_x': analyse('agab_x', df[df['1_'] == 'inną (w jurysdykcjach, gdzie to możliwe)'], df, args.echo, diffs),
        'younger': analyse('younger', df[df['age'] < 25], df, args.echo, diffs),
        'older': analyse('older', df[df['age'] >= 25], df, args.echo, diffs),
        'name_f': analyse('name_f', df[df['16_'] == 'żeńskie'], df, args.echo, diffs),
        'name_m': analyse('name_m', df[df['16_'] == 'męskie'], df, args.echo, diffs),
        'name_n': analyse('name_n', df[df['16_'] == 'neutralne płciowo'], df, args.echo, diffs),
    }

    # Output directory -> {group key: column label} for side-by-side graphs.
    comparisons = {
        'by_location': {
            'general': 'Ogół',
            'location_poland': 'Polska',
            'location_abroad': 'Zagranica',
        },
        'by_agab': {
            'general': 'Ogół',
            'agab_f': 'AFAB',
            'agab_m': 'AMAB',
        },
        'by_age': {
            'general': 'Ogół',
            'younger': 'Grupa młodsza',
            'older': 'Grupa starsza',
        },
        'by_name': {
            'name_f': 'Imię żeńskie',
            'name_m': 'Imię męskie',
            'name_n': 'Imię neutralne płciowo',
        },
    }

    # Stats key -> graph title.
    graphs = {
        'age': 'Wiek osób respondenckich',
        'neuter': 'Preferowana nazwa rodzaju gramatycznego',
        'neuterByUsers': 'Preferowana nazwa rodzaju gramatycznego wśród osób go używających',
        'pronounGroups': 'Rodzaj gramatyczny używany w mowie',
        'pronouns': 'Zaimki używane w piśmie',
        'pluralNouns': 'Czy rzeczowniki również w liczbie mnogiej?',
        'pluralNonGendered': 'Czy nieupłciowione formy również w liczbie mnogiej?',
        'pronounsAggr': 'Zaimki używane w piśmie (zgrupowane)',
        'nouns': 'Rzeczowniki',
        'honorifics': 'Formy grzecznościowe',
        'obstacles': 'Dlaczego nie formy niebinarne?',
        'reasons': 'Co wpływa na wybór form?',
        'groups': 'Formy do opisu grup mieszanych',
        'namesGender': 'Upłciowienie używanego imienia',
        'names': 'Wybór używanego imienia',
        'namesDeclension': 'Odmiana używanego imienia',
        'english': 'Zaimki w języku angielskim',
        'labelsGender': 'Etykietki opisujące płeć',
        'labelsSexuality': 'Etykietki opisujące orientację seksualną',
        'labelsRomantic': 'Etykietki opisujące orientację romantyczną',
        'transitionSocial': 'Tranzycja społeczna',
        'transitionName': 'Tranzycja prawna – imię',
        'transitionMarker': 'Tranzycja prawna – znacznik płci',
        'transitionPhysical': 'Tranzycja medyczna – bindery, packery, itp.',
        'transitionHormonal': 'Tranzycja medyczna – hormony',
        'transitionSurgical': 'Tranzycja medyczna – zmiany chirurgiczne',
    }

    # One graph per group and statistic.
    for group, group_stats in stats.items():
        for graph, graph_label in graphs.items():
            generate_graph(group_stats[graph], group, graph, graph_label)

    # Side-by-side comparisons of several groups.
    for comparison_key, comparison_groups in comparisons.items():
        ensure_empty_dir(outputDir / comparison_key)
        for graph, graph_label in graphs.items():
            # The by-name comparison is only rendered for the declension graph.
            if comparison_key == 'by_name' and graph != 'namesDeclension':
                continue
            data = pd.DataFrame({
                groupLabel: stats[group][graph]
                for group, groupLabel in comparison_groups.items()
            })
            generate_graph(data, comparison_key, graph, graph_label)

    # Year-by-year comparison built from the stored 'general' stats files.
    by_year = {}
    for prev_year in [*diffs, year]:
        file_path = outputDir.parent / f'spis-{prev_year}' / 'general' / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            by_year[prev_year] = json.load(f)

    ensure_empty_dir(outputDir / 'by_year')
    for graph, graph_label in graphs.items():
        data = pd.DataFrame({
            column_year: year_data[graph]
            for column_year, year_data in by_year.items()
            if graph in year_data
        })
        generate_graph(data, 'by_year', graph, graph_label)

    # Free-text answers: one TSV per write-in column, most frequent first.
    write_ins_dir = outputDir / 'write_ins'
    write_ins_dir.mkdir(parents=True, exist_ok=True)
    for column in df.columns:
        if not column.endswith('__writein') and column != '31_':
            continue
        print(f'Extracting write-ins for question {column}')
        writeins = df[column].dropna().value_counts().reset_index()
        writeins.columns = ['write-in', 'count']
        writeins = writeins[['count', 'write-in']]
        writeins = writeins.sort_values(by=['count'], ascending=False)
        writeins.to_csv(write_ins_dir / f'{column}.tsv', index=False, sep='\t')