"""Compute per-group statistics from ``input/export.csv`` and render them.

For every respondent group the script writes a ``stats.json`` file and one
Plotly bar chart (HTML) per graph, then comparison charts between groups and
between years, and finally one TSV file per write-in column.
"""
import json
import os
import shutil
from argparse import ArgumentParser
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Union

import pandas as pd
import plotly.express as px
import plotly.io as pio

year = datetime.now().year

projectDir = Path(__file__).parent
inputDir = projectDir / 'input'
outputDir = projectDir.parent / 'locale' / 'pl' / 'docs' / f'spis-{year}'

# When True, every generated figure is also opened after being written.
openFigs = False

colours = ['#c71585']
colours_multi = ['#c71585', '#8b0f7a', '#15c79c', '#20a0d7']

pd.options.mode.chained_assignment = None

fontFamily = 'Nunito, "Open Sans", sans-serif'

# Markup injected right before ``</head>`` of every generated page.
# NOTE(review): this literal is effectively empty in the reviewed copy; any
# tags it used to hold (e.g. font stylesheet links) appear to have been
# stripped -- restore from version control.
graphHead = '''
'''

# Maps each transition answer to a sentiment: 1 positive, 0 neutral,
# -1 negative.  The key order is also used to order bars in the graphs.
transition_sentiment = {
    'już przeszł_m proces': 1,
    'jestem w trakcie': 1,
    'stosuję microdosing': 1,
    'w części miejsc tak, w części nie': 1,
    'zamierzam zacząć w przyszłości': 1,
    'jeszcze nie wiem': 0,
    'nie chcę odpowiadać': 0,
    'nie zamierzam': -1,
    'przechodzę/przeszł_m detranzycję': -1,
}


def _question_columns(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool,
    include_aggregates: bool,
) -> pd.DataFrame:
    """Return the columns of ``df`` that belong to ``question_number``.

    Columns are named ``<question>_<answer>``; aggregate columns use the
    ``<question>_aggr_`` prefix and ``_writein`` columns are never matched
    directly after the prefix.
    """
    if not include_aggregates:
        suffix = '(?!aggr)'
    elif include_answers:
        suffix = ''
    else:
        suffix = 'aggr_'
    return df.filter(regex='^%s_%s(?!_writein)' % (question_number, suffix))


def extract_question(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False,
    remove_underscores: bool = True
) -> pd.Series:
    """Return, per answer of a multiple-choice question, the percentage of rows.

    Each matching column is summed and expressed as a percentage of
    ``len(df)``, rounded to one decimal place.  The index holds the column
    names with the question prefix removed and ``aggr_`` replaced by
    ``'łącznie: '``.
    """
    print('Extracting question', question_number)
    question_df = _question_columns(df, question_number, include_answers, include_aggregates)

    prefix_length = len(str(question_number)) + 1
    underscore_replacement = ' ' if remove_underscores else '_'
    question_df.columns = [
        c[prefix_length:]
        .replace('aggr_', 'łącznie: ')
        .replace('_', underscore_replacement)
        .replace('łącznie: trans_', 'łącznie: trans*')
        for c in question_df.columns
    ]

    totals = question_df.sum()
    return totals.apply(lambda x: round(100 * x / len(df), 1))


def extract_question_single(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
    """Return, per answer of a single-choice question, the percentage of rows.

    The answers are read from the ``<question_number>_`` column; counts are
    expressed as a percentage of ``len(df)``, rounded to one decimal place.
    """
    print('Extracting question', question_number)
    question_df = _question_columns(df, question_number, include_answers, include_aggregates)

    answer_column = f'{question_number}_'
    counts = question_df.groupby(answer_column)[answer_column].count()
    return counts.apply(lambda x: round(100 * x / len(df), 1))


def extract_question_single_sentiment(
    df: pd.DataFrame,
    question_number: int,
    sentiment_map: Dict[str, int],
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
    """Aggregate a single-choice question into sentiment percentages.

    Answers are mapped through ``sentiment_map`` (1, 0 or -1) and summed per
    sentiment.  The result is indexed by ``'positive'``, ``'neutral'``,
    ``'negative'`` and ``'non-positive'`` (neutral + negative).
    """
    question_series = extract_question_single(df, question_number, include_answers, include_aggregates)
    question_series.index = question_series.index.map(sentiment_map)
    question_series = question_series.groupby(question_series.index).sum()

    # A sentiment nobody picked is absent from the grouped index; reindexing
    # turns it into 0 instead of raising KeyError on lookup.
    by_sentiment = question_series.reindex([1, 0, -1], fill_value=0)
    positive = by_sentiment.loc[1]
    neutral = by_sentiment.loc[0]
    negative = by_sentiment.loc[-1]

    return pd.Series(
        data=[
            round(positive, 1),
            round(neutral, 1),
            round(negative, 1),
            round(neutral + negative, 1),
        ],
        index=['positive', 'neutral', 'negative', 'non-positive'],
    )


def rename_index(data: Union[pd.DataFrame, pd.Series], new_index: List[str]) -> Union[pd.DataFrame, pd.Series]:
    """Return a copy of ``data`` whose row labels are replaced by ``new_index``."""
    # set_axis works for both Series and DataFrame and leaves the input
    # untouched; a Series has no ``columns`` to rename.
    return data.set_axis(labels=new_index, axis=0)


def generate_graph(
    data: Union[pd.DataFrame, pd.Series],
    group: str,
    name: str,
    title: str,
    show: bool = False
):
    """Render ``data`` as a bar chart and write it to ``<outputDir>/<group>/<name>.html``.

    A DataFrame with more than one column is drawn as grouped bars with a
    legend; anything else as a single-colour chart.  The figure is opened
    after writing when ``show`` or the module-level ``openFigs`` is set.
    """
    print('Generating graph', group, name)

    is_multi = isinstance(data, pd.DataFrame) and len(data.columns) > 1

    # When every label is an aggregate, the shared prefix is redundant.
    if all(isinstance(c, str) and c.startswith('łącznie: ') for c in data.index):
        data = rename_index(data, [c.replace('łącznie: ', '') for c in data.index])

    # Shorter labels for the "names" question (assumes exactly these six
    # answers in this order -- TODO confirm).
    if 'nic, używam imienia nadanego mi przez rodziców' in data.index:
        data = rename_index(data, [
            'nadane – ale przeciwna płeć',
            'nadane – ale wersja unisex',
            'nadane – bez zmian',
            'wybrane – rzeczownik',
            'wybrane – binarne',
            'wybrane – unisex',
        ])

    # Transition questions: order the bars like transition_sentiment and drop
    # the answers that have missing values.
    if 'zamierzam zacząć w przyszłości' in data.index:
        data = data.reindex(list(transition_sentiment)).dropna()

    fig = px.bar(
        data,
        color_discrete_sequence=colours_multi if is_multi else colours,
        barmode='group',
        template='plotly_white',
    )

    fig.update_layout(
        showlegend=is_multi,
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title=''),
        title=title,
        xaxis=None,
        yaxis=None,
        font=dict(family=fontFamily, size=14),
    )

    # Explicit tick values for the age axis.
    if name == 'age':
        fig.update_layout(
            xaxis=dict(tickvals=[13, 20, 30, 40, 50, 60, 70]),
        )

    # NOTE(review): the hover template's line breaks were raw newlines inside
    # the string literals in the reviewed copy (a syntax error); they are
    # written as <br> here.  The original also ended with a no-op `+ ''`,
    # which may have been a stripped '<extra></extra>' -- confirm.
    hover_template = '%{x}<br>%{y:.2f}%' + ('<br>%{meta}' if is_multi else '')
    for trace in fig.select_traces():
        trace.update(
            hovertemplate=hover_template,
            meta=trace.offsetgroup,
            hoverlabel_font=dict(family=fontFamily, size=12),
            visible='legendonly' if trace.name == 'Ogół' else True,
        )

    file_path = outputDir / group / (name + '.html')
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')

    # Inject graphHead into the page's <head>.  The reviewed copy replaced the
    # empty string, which inserts graphHead between every character.
    # Explicit UTF-8 because the page contains Polish text.
    content = file_path.read_text(encoding='utf-8')
    file_path.write_text(content.replace('</head>', graphHead + '</head>'), encoding='utf-8')


def percent(value: int, size: int, precision: int = 2) -> float:
    """Return ``value`` as a percentage of ``size``, rounded to ``precision`` places."""
    return round(100 * value / size, precision)


def ensure_empty_dir(dir: Path) -> Path:
    """Delete ``dir`` if it exists, recreate it empty and return it."""
    if os.path.exists(dir):
        shutil.rmtree(dir)
    os.makedirs(dir, exist_ok=True)
    return dir


def _diff_with_previous(current: dict, previous: dict) -> dict:
    """Return ``current - previous`` for every nested value present in both."""
    diff = {}
    for key, values in current.items():
        if key == 'diff' or not isinstance(values, dict):
            continue
        previous_values = previous.get(key)
        if not isinstance(previous_values, dict):
            continue
        # NOTE(review): keys loaded from JSON are strings, so sections with
        # non-string keys (the 'age' histogram) never match here.
        diff[key] = {
            answer: round(value - previous_values[answer], 1)
            for answer, value in values.items()
            if answer in previous_values
        }
    return diff


def analyse(group: str, df: pd.DataFrame, full_df: pd.DataFrame, echo: bool = False, diffs: Optional[List[int]] = None):
    """Compute the statistics of one respondent group and write its ``stats.json``.

    Args:
        group: Name of the group; used as the output sub-directory, which is
            emptied first.
        df: Rows belonging to the group.
        full_df: All rows; used for the group's share of the whole.
        echo: When True, also print the JSON to stdout.
        diffs: Previous years whose ``stats.json`` (if present) is compared
            with the current values under the ``diff`` key.

    Returns:
        The statistics as a dict of scalars, nested dicts and pandas Series
        (without the ``diff`` section).
    """
    ensure_empty_dir(outputDir / group)

    df_plural = df[df['7_aggr_mnogie'] == 1]
    df_neuter = df[df['7_rodzaj neutralny'] == 1]
    df_transition = df[df['22_'] == 'tak']

    age = df['age']
    adults_count = len(df[age >= 18])
    under_30_count = len(df[age < 30])
    over_30_count = len(df[age >= 30])
    under_25_count = len(df[age < 25])
    over_25_count = len(df[age >= 25])

    stats = {
        'size': len(df),
        'size_subset_percent': percent(len(df), len(full_df)),
        'size_plural': len(df_plural),
        'size_plural_percent': percent(len(df_plural), len(df)),
        'size_neuter': len(df_neuter),
        'size_neuter_percent': percent(len(df_neuter), len(df)),
        'size_transition': len(df_transition),
        'size_transition_subset_percent': percent(len(df_transition), len(df)),
        'age': build_ages_histogram(df),
        'ageStats': {
            'avg': round(age.mean(), 1),
            'median': round(age.median(), 1),
            'std': round(age.std(), 1),
            'adults': percent(adults_count, len(df)),
            'adults_count': adults_count,
            'under_30': percent(under_30_count, len(df)),
            'under_30_count': under_30_count,
            'over_30': percent(over_30_count, len(df)),
            'over_30_count': over_30_count,
            'under_25': percent(under_25_count, len(df)),
            'under_25_count': under_25_count,
            'over_25': percent(over_25_count, len(df)),
            'over_25_count': over_25_count,
        },
        'neuter': extract_question_single(df, 6),
        'neuterByUsers': extract_question_single(df_neuter, 6),
        'pronounGroups': extract_question(df, 7),
        'pronounGroupsAggr': extract_question(df, 7, include_answers=False, include_aggregates=True),
        'pluralNouns': extract_question_single(df_plural, 8),
        'pluralNonGendered': extract_question_single(df_plural, 9),
        'pronouns': extract_question(df, 10),
        'pronounsAggr': extract_question(df, 10, include_answers=False, include_aggregates=True),
        'nouns': extract_question(df, 11),
        'honorifics': extract_question(df, 12, include_aggregates=True),
        'obstacles': extract_question(df, 13),
        'groups': extract_question(df, 14),
        'reasons': extract_question(df, 15),
        'names': extract_question_single(df, 16),
        'namesAggr': extract_question(df, 16, include_answers=False, include_aggregates=True),
        'english': extract_question(df, 17, include_aggregates=True),
        'labelsGender': extract_question(df, 18, include_aggregates=True, remove_underscores=False),
        'labelsAttractionSplit': extract_question_single(df, 19),
        'labelsSexuality': extract_question(df, 20, include_aggregates=True, remove_underscores=False),
        'labelsRomantic': extract_question(df, 21, include_aggregates=True, remove_underscores=False),
        # NOTE(review): misspelt key ("transtion"); kept because it is part of
        # the stats.json schema that previous years are diffed against.
        'transtionAnswered': extract_question_single(df, 22),
    }

    # Each transition question yields the raw answers plus their sentiment.
    transition_questions = {
        'transitionSocial': 23,
        'transitionName': 24,
        'transitionMarker': 25,
        'transitionPhysical': 26,
        'transitionHormonal': 27,
        'transitionSurgical': 28,
    }
    for key, question_number in transition_questions.items():
        stats[key] = extract_question_single(df_transition, question_number)
        stats[f'{key}Sentiment'] = extract_question_single_sentiment(
            df_transition, question_number, sentiment_map=transition_sentiment)

    stats_json = {
        k: v.to_dict() if isinstance(v, pd.Series) else v
        for k, v in stats.items()
    }

    stats_json['diff'] = {}
    for prev_year in (diffs or []):
        prev_year_key = f'spis-{prev_year}'
        file_path = outputDir.parent / prev_year_key / group / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            prev_stats = json.load(f)
        stats_json['diff'][prev_year_key] = _diff_with_previous(stats_json, prev_stats)

    stats_text = json.dumps(stats_json, indent=4)

    if echo:
        print('--- Group: %s ---' % group)
        print(stats_text)

    with open(outputDir / group / 'stats.json', 'w', encoding='utf-8') as f:
        f.write(stats_text + '\n')

    return stats


def build_ages_histogram(df: pd.DataFrame) -> pd.Series:
    """Return the percentage of respondents per age, for every age in range.

    Only positive ages are counted (missing values are skipped).  The index
    covers every integer from the lowest to the highest age, including ages
    nobody reported; values are rounded to three decimal places.  An empty
    Series is returned when there is no valid age.
    """
    ages = [int(a) for a in df['age'].to_list() if a > 0]
    if not ages:
        # min()/max() below would raise on an empty list.
        return pd.Series(dtype=float)

    counts = Counter(ages)
    total = len(ages)
    return pd.Series({
        age: percent(counts[age], total, 3)
        for age in range(min(ages), max(ages) + 1)
    })


def main() -> None:
    """Run the whole analysis: stats, graphs, comparisons and write-ins."""
    global openFigs

    parser = ArgumentParser()
    parser.add_argument('-s', '--show', dest='show', default=False, nargs='?', const=True)
    parser.add_argument('-e', '--echo', dest='echo', default=False, nargs='?', const=True)
    args = parser.parse_args()

    if args.show:
        openFigs = True

    df = pd.read_csv(inputDir / 'export.csv')
    df = df[df['0_'].isin(['osobą niebinarną', 'nie wiem'])].copy()
    df.loc[:, 'age'] = year - df['3_']
    # Ages above 100 are treated as missing.
    df.loc[df['age'] > 100, 'age'] = None

    diffs = [2021, 2022, 2023]

    # The 'agab_x' group (`1_` == 'inną (w jurysdykcjach, gdzie to możliwe)')
    # is currently disabled.
    subsets = {
        'general': df,
        'location_poland': df[df['4_'] == 'w Polsce'],
        'location_abroad': df[df['4_'] == 'za granicą'],
        'agab_f': df[df['1_'] == 'żeńską'],
        'agab_m': df[df['1_'] == 'męską'],
        'younger': df[df['age'] < 25],
        'older': df[df['age'] >= 25],
    }
    stats = {
        group: analyse(group, subset, df, args.echo, diffs)
        for group, subset in subsets.items()
    }

    comparisons = {
        'by_location': {
            'general': 'Ogół',
            'location_poland': 'Polska',
            'location_abroad': 'Zagranica',
        },
        'by_agab': {
            'general': 'Ogół',
            'agab_f': 'AFAB',
            'agab_m': 'AMAB',
        },
        'by_age': {
            'general': 'Ogół',
            'younger': 'Grupa młodsza',
            'older': 'Grupa starsza',
        },
    }

    graphs = {
        'age': 'Wiek osób respondenckich',
        'neuter': 'Preferowana nazwa rodzaju gramatycznego',
        'neuterByUsers': 'Preferowana nazwa rodzaju gramatycznego wśród osób go używających',
        'pronounGroups': 'Rodzaj gramatyczny używany w mowie',
        'pronouns': 'Zaimki używane w piśmie',
        'pluralNouns': 'Czy rzeczowniki również w liczbie mnogiej?',
        'pluralNonGendered': 'Czy nieupłciowione formy również w liczbie mnogiej?',
        'pronounsAggr': 'Zaimki używane w piśmie (zgrupowane)',
        'nouns': 'Rzeczowniki',
        'honorifics': 'Formy grzecznościowe',
        'obstacles': 'Dlaczego nie formy niebinarne?',
        'reasons': 'Co wpływa na wybór form?',
        'groups': 'Formy do opisu grup mieszanych',
        'names': 'Używane imię',
        'english': 'Zaimki w języku angielskim',
        'labelsGender': 'Etykietki opisujące płeć',
        'labelsSexuality': 'Etykietki opisujące orientację seksualną',
        'labelsRomantic': 'Etykietki opisujące orientację romantyczną',
        'transitionSocial': 'Tranzycja społeczna',
        'transitionName': 'Tranzycja prawna – imię',
        'transitionMarker': 'Tranzycja prawna – znacznik płci',
        'transitionPhysical': 'Tranzycja medyczna – bindery, packery, itp.',
        'transitionHormonal': 'Tranzycja medyczna – hormony',
        'transitionSurgical': 'Tranzycja medyczna – zmiany chirurgiczne',
    }

    # One graph per (group, statistic).
    for group, group_stats in stats.items():
        for graph, graph_label in graphs.items():
            generate_graph(group_stats[graph], group, graph, graph_label)

    # Side-by-side comparisons between groups.
    for comparison_key, comparison_groups in comparisons.items():
        ensure_empty_dir(outputDir / comparison_key)
        for graph, graph_label in graphs.items():
            data = pd.DataFrame({
                group_label: stats[group][graph]
                for group, group_label in comparison_groups.items()
            })
            generate_graph(data, comparison_key, graph, graph_label)

    # Year-over-year comparison, read back from the stats.json files
    # (including the one just written for the current year).
    by_year = {}
    for prev_year in [*diffs, year]:
        file_path = outputDir.parent / f'spis-{prev_year}' / 'general' / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            by_year[prev_year] = json.load(f)

    ensure_empty_dir(outputDir / 'by_year')
    for graph, graph_label in graphs.items():
        data = pd.DataFrame({
            column_year: year_data[graph]
            for column_year, year_data in by_year.items()
            if graph in year_data
        })
        generate_graph(data, 'by_year', graph, graph_label)

    # Write-in answers: one TSV per column, most frequent first.
    write_ins_dir = outputDir / 'write_ins'
    write_ins_dir.mkdir(parents=True, exist_ok=True)
    for column in df.columns:
        if not column.endswith('__writein') and column != '29_':
            continue
        print(f'Extracting write-ins for question {column}')
        writeins = df[column].dropna().value_counts().reset_index()
        writeins.columns = ['write-in', 'count']
        writeins = writeins[['count', 'write-in']]
        writeins = writeins.sort_values(by=['count'], ascending=False)
        writeins.to_csv(write_ins_dir / f'{column}.tsv', index=False, sep='\t')


if __name__ == '__main__':
    main()