mirror of https://gitlab.com/PronounsPage/PronounsPage.git, synced 2025-09-07 22:40:27 -04:00
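"""Analysis script for the Polish nonbinary census ("spis") survey.

Reads the survey export from input/export.csv, computes per-group statistics
(written as stats.json) and renders interactive Plotly charts into
locale/pl/docs/spis-<year>/.
"""
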
from typing import Union, List, Dict
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import os
from datetime import datetime
import json
from argparse import ArgumentParser
import shutil

year = datetime.now().year
projectDir = Path(__file__).parent
inputDir = projectDir / 'input'
outputDir = projectDir.parent / 'locale' / 'pl' / 'docs' / f'spis-{year}'
openFigs = False

colours = ['#c71585']
colours_multi = ['#c71585', '#8b0f7a', '#15c79c', '#20a0d7']
pd.options.mode.chained_assignment = None
fontFamily = 'Nunito, "Open Sans", sans-serif'
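# <head> markup injected into every exported figure (see generate_graph)
# so the charts load the Nunito font from Google Fonts.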
graphHead = '''
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Nunito:wght@500&display=swap" rel="stylesheet">
'''

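# Maps raw transition-question answers to a sentiment score
# (+1 = transitioning / plans to, 0 = undecided / no answer, -1 = no / detransition);
# used by extract_question_single_sentiment() for questions 25-30.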
transition_sentiment = {
    'już przeszł_m proces': 1,
    'jestem w trakcie': 1,
    'stosuję microdosing': 1,
    'w części miejsc tak, w części nie': 1,
    'zamierzam zacząć w przyszłości': 1,
    'zamierzam przejść w przyszłości': 1,
    'już przeszł_m proces zmiany na opcję neutraną': 1,
    'tak': 1,
    'jestem w trakcie procesu zmiany na opcję neutraną': 1,
    'jestem w trakcie procesu zmiany na drugi znacznik binarny': 1,
    'zamierzam przejść w przyszłości, tylko jeśli w Polsce będzie możliwe usunięcie znacznika': 1,
    'zamierzam przejść w przyszłości, tylko jeśli w Polsce będzie dostępna opcja neutralna': 1,
    'zamierzam przejść w przyszłości, na znacznik binarny': 1,
    'już przeszł_m proces zmiany na drugi znacznik binarny': 1,
    'przeszł_m część z zabiegów, które chcę przejść': 1,
    'przyjmuję hormony': 1,
    'przechodzę, we wszystkich obszarach życia': 1,
    'przeszł_m już wszystkie zabiegi, które chcę przejść': 1,
    'jeszcze nie wiem': 0,
    'nie chcę odpowiadać': 0,
    'nie zamierzam': -1,
    'przechodzę/przeszł_m detranzycję': -1,
    'przechodzę/przeszł_m <u>de</u>tranzycję': -1,
}


def calculate_percentages(df: pd.DataFrame, counts: pd.Series) -> pd.Series:
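    """Turn raw answer counts into percentages of all rows in `df`.

    Returns a Series containing, for every key in `counts`, the share of
    respondents (rounded to one decimal place) plus a parallel
    `<key>__count` entry with the raw count.
    """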
    percentages = counts.apply(lambda x: round(100 * x / len(df), 1))

    return pd.Series({
        **{k: percentages[k] for k in percentages.index},
        **{f"{k}__count": counts[k] for k in counts.index}
    })


def extract_question(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False,
    remove_underscores: bool = True
) -> pd.Series:
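    """Collect the answer columns of a multiple-choice question.

    Picks every column named `<question_number>_<answer>` (skipping write-ins,
    and either skipping or keeping the `aggr_` aggregate columns), cleans up
    the column labels and returns per-answer percentages and counts via
    calculate_percentages(). For example, extract_question(df, 10) sums the
    `10_*` answer columns.
    """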
    print('Extracting question', question_number)
    question_series = df.filter(regex='^%s_%s(?!_writein)' % (
        question_number,
        ('' if include_answers else 'aggr_') if include_aggregates else '(?!aggr)'
    ))
    question_series.columns = [
        c[len(str(question_number)) + 1:]
            .replace('aggr_', 'łącznie: ')
            .replace('_', ' ' if remove_underscores else '_')
            .replace('łącznie: trans_', 'łącznie: trans*')
        for c in question_series.columns
    ]

    return calculate_percentages(df, question_series.sum())


def extract_question_single(
    df: pd.DataFrame,
    question_number: int,
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
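    """Tally a single-choice question.

    Groups the rows by the value of the `<question_number>_` column and
    returns per-answer percentages and counts via calculate_percentages().
    """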
    print('Extracting question', question_number)
    question_series = df.filter(regex='^%s_%s(?!_writein)' % (
        question_number,
        ('' if include_answers else 'aggr_') if include_aggregates else '(?!aggr)'
    ))

    return calculate_percentages(df, question_series.groupby(f'{question_number}_')[f'{question_number}_'].count())


def extract_question_single_sentiment(
    df: pd.DataFrame,
    question_number: int,
    sentiment_map: Dict[str, int],
    include_answers: bool = True,
    include_aggregates: bool = False
) -> pd.Series:
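    """Collapse a single-choice question into a sentiment summary.

    Maps each answer to -1/0/+1 using `sentiment_map` (e.g. transition_sentiment)
    and returns the summed percentages as 'positive', 'neutral', 'negative'
    and 'non-positive' (neutral + negative). Assumes all three sentiment
    values occur among the answers.
    """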
    question_series = extract_question_single(df, question_number, include_answers, include_aggregates)
    question_series.index = question_series.index.map(sentiment_map)
    question_series = question_series.groupby(question_series.index).sum()

    sentiment_series = pd.Series(
        data=[
            round(question_series[1], 1),
            round(question_series[0], 1),
            round(question_series[-1], 1),
            round(question_series[0] + question_series[-1], 1)
        ],
        index=['positive', 'neutral', 'negative', 'non-positive'],
    )

    return sentiment_series


def rename_index(data: Union[pd.DataFrame, pd.Series], new_index: List[str]) -> Union[pd.DataFrame, pd.Series]:
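    """Return a copy of `data` with its index relabelled to `new_index`."""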
    if type(data) is pd.Series:
        data = data.copy()
        data.index = new_index
        return data

    return data.set_axis(labels=new_index, axis=0)


def generate_graph(
    data: Union[pd.DataFrame, pd.Series],
    group: str,
    name: str,
    title: str,
    show: bool = False
):
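    """Render one bar chart and write it to `outputDir/<group>/<name>.html`.

    Drops the `*__count` entries, applies a few survey-specific relabellings,
    plots the data with Plotly Express and injects `graphHead` into the
    exported HTML so the figure uses the Nunito font.
    """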
    print('Generating graph', group, name)

    if isinstance(data, pd.DataFrame):
        data = data.loc[[idx for idx in data.index if not str(idx).endswith('__count')]]
    elif isinstance(data, pd.Series):
        data = data[[not str(k).endswith('__count') for k in data.index]].reindex()

    is_multi = type(data) is pd.DataFrame and len(data.columns) > 1

    if all([type(c) is str and c.startswith('łącznie: ') for c in data.index]):
        data = rename_index(data, [c.replace('łącznie: ', '') for c in data.index])

    if 'nic, używam imienia nadanego mi przez rodziców' in data.index:
        data = rename_index(data, [
            'nadane – ale przeciwna płeć',
            'nadane – ale wersja unisex',
            'nadane – bez zmian',
            'wybrane – rzeczownik',
            'wybrane – binarne',
            'wybrane – unisex',
        ])

    if 'zamierzam zacząć w przyszłości' in data.index:
        data = data.reindex(transition_sentiment.keys()).dropna()

    fig = px.bar(
        data,
        color_discrete_sequence=colours_multi if is_multi else colours,
        barmode='group',
        template='plotly_white',
    )

    fig.update_layout(
        showlegend=is_multi,
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title=''),
        title=title,
        xaxis=None,
        yaxis=None,
        font=dict(family=fontFamily, size=14),
    )
    # i can't believe i have to do that because a professor can't read a graph 🤦
    if name == 'age':
        fig.update_layout(
            xaxis=dict(tickvals=[13, 20, 30, 40, 50, 60, 70]),
        )
    for trace in fig.select_traces():
        trace.update(
            hovertemplate='%{x}<br>%{y:.2f}%' + ('<br>%{meta}' if is_multi else '') + '<extra></extra>',
            meta=trace.offsetgroup,
            hoverlabel_font=dict(family=fontFamily, size=12),
            visible='legendonly' if trace.name == 'Ogół' else True,
        )

    file_path = outputDir / group / (name + '.html')
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')

    with open(file_path, 'r') as fr:
        content = fr.read().replace('</head>', graphHead + '</head>')
    with open(file_path, 'w') as fw:
        fw.write(content)


def percent(value: int, size: int, precision: int = 2) -> float:
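    """Return `value` as a percentage of `size`, rounded to `precision` digits."""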
    return round(100 * value / size, precision)


def ensure_empty_dir(dir: Path) -> Path:
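    """Delete `dir` if it exists, then recreate it empty and return it."""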
    if os.path.exists(dir):
        shutil.rmtree(dir)
    os.makedirs(dir, exist_ok=True)

    return dir


def analyse(group: str, df: pd.DataFrame, full_df: pd.DataFrame, echo: bool = False, diffs: List[int] = None):
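    """Compute all statistics for one respondent subset and dump them to JSON.

    `df` is the subset being analysed, `full_df` the whole filtered sample
    (used for the subset-size percentage). Writes `outputDir/<group>/stats.json`,
    including year-over-year diffs against the `spis-<year>` directories listed
    in `diffs`, and returns the stats dict (with pd.Series values) for graphing.
    """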
    ensure_empty_dir(outputDir / group)

    df['18_'] = df['18_'].str.rstrip('.')
    df['25_'] = df['25_'].str.replace('<u>', '').str.replace('</u>', '')
    df['28_'] = df['28_'].str.replace('<u>', '').str.replace('</u>', '')
    df['29_'] = df['29_'].str.replace('<u>', '').str.replace('</u>', '')
    df['30_'] = df['30_'].str.replace('<u>', '').str.replace('</u>', '')

    df_plural = df[df['7_aggr_mnogie'] == 1]
    df_neuter = df[df['7_rodzaj neutralny'] == 1]
    df_transition = df[df['24_'] == 'tak']
    df_english = df[df['19_nie znam / nie używam angielskiego'] != 1].drop('19_nie znam / nie używam angielskiego', axis=1)
    df_attraction_split = df[df['21_'] == 'tak']

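    # Per-question statistics: pd.Series values hold percentages plus parallel
    # `*__count` entries; the keys are referenced by the `graphs` dict in __main__.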
    stats = {
        'size': len(df),
        'size_subset_percent': percent(len(df), len(full_df)),
        'size_plural': len(df_plural),
        'size_plural_percent': percent(len(df_plural), len(df)),
        'size_neuter': len(df_neuter),
        'size_neuter_percent': percent(len(df_neuter), len(df)),
        'size_transition': len(df_transition),
        'size_transition_subset_percent': percent(len(df_transition), len(df)),
        'size_english': len(df_english),
        'age': pd.Series(build_ages_histogram(df)),
        'ageStats': {
            'avg': round(df['age'].mean(), 1),
            'median': round(df['age'].median(), 1),
            'std': round(df['age'].std(), 1),
            'adults': percent(len(df[df['age'] >= 18]), len(df)),
            'adults_count': len(df[df['age'] >= 18]),
            'under_30': percent(len(df[df['age'] < 30]), len(df)),
            'under_30_count': len(df[df['age'] < 30]),
            'over_30': percent(len(df[df['age'] >= 30]), len(df)),
            'over_30_count': len(df[df['age'] >= 30]),
            'under_25': percent(len(df[df['age'] < 25]), len(df)),
            'under_25_count': len(df[df['age'] < 25]),
            'over_25': percent(len(df[df['age'] >= 25]), len(df)),
            'over_25_count': len(df[df['age'] >= 25]),
        },
        'neuter': extract_question(df, 6),
        'neuterByUsers': extract_question(df_neuter, 6),
        'pronounGroups': extract_question(df, 7),
        'pronounGroupsAggr': extract_question(df, 7, include_answers=False, include_aggregates=True),
        'pluralNouns': extract_question_single(df_plural, 8),
        'pluralNonGendered': extract_question_single(df_plural, 9),
        'pronouns': extract_question(df, 10),
        'pronounsAggr': extract_question(df, 10, include_answers=False, include_aggregates=True),
        'nouns': extract_question(df, 11),
        'honorifics': extract_question(df, 12, include_aggregates=True),
        'obstacles': extract_question(df, 13),
        'groups': extract_question(df, 14),
        'reasons': extract_question(df, 15),
        'namesGender': extract_question_single(df, 16),  # TODO missing aggregate
        'names': extract_question_single(df, 17),
        'namesAggr': extract_question(df, 17, include_answers=False, include_aggregates=True),
        'namesDeclension': extract_question_single(df, 18),
        'english': extract_question(df, 19, include_aggregates=True),  # TODO use df_english
        'labelsGender': extract_question(df, 20, include_aggregates=True, remove_underscores=False),
        'labelsAttractionSplit': extract_question_single(df, 21),
        'labelsSexuality': extract_question(df, 22, include_aggregates=True, remove_underscores=False),
        'labelsRomantic': extract_question(df_attraction_split, 23, include_aggregates=True, remove_underscores=False),
        'transtionAnswered': extract_question_single(df, 24),
        'transitionSocial': extract_question_single(df_transition, 25),
        'transitionSocialSentiment': extract_question_single_sentiment(df_transition, 25, sentiment_map=transition_sentiment),
        'transitionName': extract_question_single(df_transition, 26),
        'transitionNameSentiment': extract_question_single_sentiment(df_transition, 26, sentiment_map=transition_sentiment),
        'transitionMarker': extract_question_single(df_transition, 27),
        'transitionMarkerSentiment': extract_question_single_sentiment(df_transition, 27, sentiment_map=transition_sentiment),
        'transitionPhysical': extract_question_single(df_transition, 28),
        'transitionPhysicalSentiment': extract_question_single_sentiment(df_transition, 28, sentiment_map=transition_sentiment),
        'transitionHormonal': extract_question_single(df_transition, 29),
        'transitionHormonalSentiment': extract_question_single_sentiment(df_transition, 29, sentiment_map=transition_sentiment),
        'transitionSurgical': extract_question_single(df_transition, 30),
        'transitionSurgicalSentiment': extract_question_single_sentiment(df_transition, 30, sentiment_map=transition_sentiment),
    }

    stats_json = {
        k: v.to_dict() if type(v) is pd.Series else v
        for k, v
        in stats.items()
    }

    stats_json['diff'] = {}
    for prev_year in (diffs or []):
        prev_year_key = f'spis-{prev_year}'
        file_path = outputDir.parent / prev_year_key / group / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r') as f:
            prev_stats = json.load(f)
        stats_json['diff'][prev_year_key] = {}
        for k, v in stats_json.items():
            if type(v) != dict or k == 'diff' or k not in prev_stats:
                continue
            stats_json['diff'][prev_year_key][k] = {}
            for kk, vv in v.items():
                if kk not in prev_stats[k]:
                    continue
                stats_json['diff'][prev_year_key][k][kk] = round(vv - prev_stats[k][kk], 1)

    stats_json = json.dumps(stats_json, indent=4)

    if echo:
        print('--- Group: %s ---' % group)
        print(stats_json)

    with open(outputDir / group / 'stats.json', 'w') as f:
        f.write(stats_json + '\n')

    return stats


def build_ages_histogram(df: pd.DataFrame) -> pd.Series:
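    """Return a Series mapping each age to its share of respondents (in %)."""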
    ages = [int(a) for a in df['age'].to_list() if a > 0]
    ages_hist = {i: 0 for i in range(min(ages), max(ages) + 1)}
    for age in ages:
        ages_hist[age] += 1
    s = len(ages)

    return pd.Series({
        age: percent(count, s, 3)
        for age, count
        in ages_hist.items()
    })


def generate_yearly_comparison(data: dict, show: bool = False):
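    """Plot the year-over-year trend lines and save them as year_by_year.html.

    `data` is a dict with a "Form" column and one column of percentages per
    survey year; the last year's values are labelled directly on the chart.
    """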
    df = pd.DataFrame(data)

    # Reshape to long format
    df_long = df.melt(id_vars="Form", var_name="Year", value_name="Percentage")

    # Plot
    fig = px.line(df_long,
                  x="Year", y="Percentage", color="Form",
                  title="Trendy na przestrzeni lat",
                  template="plotly_white",
                  markers=True,
                  #color_discrete_sequence=colours_multi,
                  )

    fig.update_traces(line=dict(width=4), marker=dict(size=8))

    for trace in fig.select_traces():
        trace.update(
            hovertemplate='%{x}<br>%{y:.2f}%', # + '<extra></extra>',
        )

    df_last_year = df_long[df_long["Year"] == df_long["Year"].max()]
    for _, row in df_last_year.iterrows():
        if row["Form"] == "Neutratywy":
            position = "bottom left"
        else:
            position = "top left"

        fig.add_trace(go.Scatter(
            x=[row["Year"]],
            y=[row["Percentage"]],
            text=["<b>" + row["Form"] + "</b>"],
            mode="text",
            textposition=position,
            showlegend=False,
            hoverinfo="skip",
            textfont=dict(family=fontFamily, size=16),
        ))

    fig.update_layout(
        xaxis=None,
        yaxis_title="Procent (%)",
        showlegend=False,
        #legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title=''),
        font=dict(family=fontFamily, size=14),
    )

    file_path = outputDir / 'year_by_year.html'
    pio.write_html(fig, file=file_path, auto_open=show or openFigs, include_plotlyjs='cdn')


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('-s', '--show', dest='show', default=False, nargs='?', const=True)
    parser.add_argument('-e', '--echo', dest='echo', default=False, nargs='?', const=True)
    args = parser.parse_args()

    if args.show:
        openFigs = True

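    # Columns in export.csv are keyed by question number: '0_' holds the
    # identity answer used to filter the sample, '3_' the birth year (turned
    # into age below, with implausible values dropped), '4_' the place of
    # residence, and so on.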
    df = pd.read_csv(inputDir / 'export.csv')
    df = df[df['0_'].isin(['osobą niebinarną', 'nie wiem'])]
    df.loc[:, 'age'] = year - df['3_']
    df.loc[df['age'] > 100, 'age'] = None

    diffs = [2021, 2022, 2023, 2024]

    generate_yearly_comparison({
        "Form": [
            "Rodzaj neutralny",
            "Rodzaj postpłciowy",
            "Wyłącznie formy binarne",
            "Wyłącznie formy niebinarne",
            "Neutratywy",
            "Osobatywy"
        ],
        "2021": [25.4, 8, 53.6, 8.4, 12, 66.7],
        "2022": [43.1, 11.8, 15.7, 15.5, 14.3, 65.4],
        "2023": [48, 9.3, 15.6, 18.2, 11.7, 67.5],
        "2024": [45.9, 6.2, 18, 19.4, 12.3, 68],
        "2025": [52.3, 8.6, 15.5, 18.9, 15.4, 74.4]
    })

    stats = {
        'general': analyse('general', df, df, args.echo, diffs),
        'location_poland': analyse('location_poland', df[df['4_'] == 'w Polsce'], df, args.echo, diffs),
        'location_abroad': analyse('location_abroad', df[df['4_'] == 'za granicą'], df, args.echo, diffs),
        'agab_f': analyse('agab_f', df[df['1_'] == 'żeńską'], df, args.echo, diffs),
        'agab_m': analyse('agab_m', df[df['1_'] == 'męską'], df, args.echo, diffs),
        # 'agab_x': analyse('agab_x', df[df['1_'] == 'inną (w jurysdykcjach, gdzie to możliwe)'], df, args.echo, diffs),
        'younger': analyse('younger', df[df['age'] < 25], df, args.echo, diffs),
        'older': analyse('older', df[df['age'] >= 25], df, args.echo, diffs),
        'name_f': analyse('name_f', df[df['16_'] == 'żeńskie'], df, args.echo, diffs),
        'name_m': analyse('name_m', df[df['16_'] == 'męskie'], df, args.echo, diffs),
        'name_n': analyse('name_n', df[df['16_'] == 'neutralne płciowo'], df, args.echo, diffs),
    }

    comparisons = {
        'by_location': {
            'general': 'Ogół',
            'location_poland': 'Polska',
            'location_abroad': 'Zagranica',
        },
        'by_agab': {
            'general': 'Ogół',
            'agab_f': 'AFAB',
            'agab_m': 'AMAB',
        },
        'by_age': {
            'general': 'Ogół',
            'younger': 'Grupa młodsza',
            'older': 'Grupa starsza',
        },
        'by_name': {
            'name_f': 'Imię żeńskie',
            'name_m': 'Imię męskie',
            'name_n': 'Imię neutralne płciowo',
        },
    }

    graphs = {
        'age': 'Wiek osób respondenckich',
        'neuter': 'Preferowana nazwa rodzaju gramatycznego',
        'neuterByUsers': 'Preferowana nazwa rodzaju gramatycznego wśród osób go używających',
        'pronounGroups': 'Rodzaj gramatyczny używany w mowie',
        'pronouns': 'Zaimki używane w piśmie',
        'pluralNouns': 'Czy rzeczowniki również w liczbie mnogiej?',
        'pluralNonGendered': 'Czy nieupłciowione formy również w liczbie mnogiej?',
        'pronounsAggr': 'Zaimki używane w piśmie (zgrupowane)',
        'nouns': 'Rzeczowniki',
        'honorifics': 'Formy grzecznościowe',
        'obstacles': 'Dlaczego nie formy niebinarne?',
        'reasons': 'Co wpływa na wybór form?',
        'groups': 'Formy do opisu grup mieszanych',
        'namesGender': 'Upłciowienie używanego imienia',
        'names': 'Wybór używanego imienia',
        'namesDeclension': 'Odmiana używanego imienia',
        'english': 'Zaimki w języku angielskim',
        'labelsGender': 'Etykietki opisujące płeć',
        'labelsSexuality': 'Etykietki opisujące orientację seksualną',
        'labelsRomantic': 'Etykietki opisujące orientację romantyczną',
        'transitionSocial': 'Tranzycja społeczna',
        'transitionName': 'Tranzycja prawna – imię',
        'transitionMarker': 'Tranzycja prawna – znacznik płci',
        'transitionPhysical': 'Tranzycja medyczna – bindery, packery, itp.',
        'transitionHormonal': 'Tranzycja medyczna – hormony',
        'transitionSurgical': 'Tranzycja medyczna – zmiany chirurgiczne',
    }

    for group, group_stats in stats.items():
        for graph, graph_label in graphs.items():
            generate_graph(group_stats[graph], group, graph, graph_label)

    for comparison_key, comparison_groups in comparisons.items():
        ensure_empty_dir(outputDir / comparison_key)
        for graph, graph_label in graphs.items():
            if comparison_key == 'by_name' and graph != 'namesDeclension':
                continue

            data = pd.DataFrame({
                groupLabel: stats[group][graph]
                for group, groupLabel
                in comparison_groups.items()
            })
            generate_graph(data, comparison_key, graph, graph_label)

    by_year = {}
    for prev_year in [*diffs, year]:
        file_path = outputDir.parent / f'spis-{prev_year}' / 'general' / 'stats.json'
        if not file_path.exists():
            continue
        with open(file_path, 'r') as f:
            by_year[prev_year] = json.load(f)

    ensure_empty_dir(outputDir / 'by_year')
    for graph, graph_label in graphs.items():
        data = pd.DataFrame({
            column_year: year_data[graph]
            for column_year, year_data
            in by_year.items()
            if graph in year_data
        })
        generate_graph(data, 'by_year', graph, graph_label)

    write_ins_dir = outputDir / 'write_ins'
    write_ins_dir.mkdir(parents=True, exist_ok=True)
    for column in df.columns:
        if not column.endswith('__writein') and column != '31_':
            continue

        print(f'Extracting write-ins for question {column}')

        writeins = df[column].dropna().value_counts().reset_index()
        writeins.columns = ['write-in', 'count']
        writeins = writeins[['count', 'write-in']]
        writeins = writeins.sort_values(by=['count'], ascending=False)

        writeins.to_csv(write_ins_dir / f'{column}.tsv', index=False, sep='\t')