Source code for lost_years.ssa
#!/usr/bin/env python
import argparse
import logging
import sys
from importlib.resources import files
import pandas as pd
from .utils import closest, column_exists, fixup_columns
# Setup logger
logger = logging.getLogger(__name__)
SSA_DATA = files("lost_years") / "data" / "ssa" / "ssa.csv"
SSA_COLS = ["age", "male_life_expectancy", "female_life_expectancy", "year"]
[docs]
class LostYearsSSAData:
__df = None
@classmethod
def lost_years_ssa(cls, df: pd.DataFrame, cols: dict[str, str] | None = None) -> pd.DataFrame:
"""Appends Life expectancycolumn from SSA data to the input DataFrame
based on age, sex and year in the specific cols mapping
Args:
df: Pandas DataFrame containing the input data.
cols: Column mapping for age, sex, and year in DataFrame.
If None, uses default mapping: {'age': 'age', 'sex': 'sex', 'year': 'year'}
Returns:
Pandas DataFrame with life expectancy columns:
'ssa_age', 'ssa_year', 'ssa_life_expectancy'
"""
df_cols = {}
for col in ["age", "sex", "year"]:
tcol = col if cols is None else cols[col]
if tcol not in df.columns:
logger.warning(f"No column `{tcol!s}` in the DataFrame")
return df
df_cols[col] = tcol
if cls.__df is None:
cls.__df = pd.read_csv(str(SSA_DATA), usecols=SSA_COLS)
out_list = []
index_list = []
for i, r in df.iterrows():
if r[df_cols["sex"]].lower() in ["m", "male"]:
ecol = "male_life_expectancy"
else:
ecol = "female_life_expectancy"
sdf = cls.__df[["age", "year", ecol]]
for c in ["age", "year"]:
sdf = sdf[sdf[c] == closest(sdf[c].unique(), r[df_cols[c]])]
if not sdf.empty:
odf = sdf[["age", "year", ecol]].copy()
odf.columns = ["ssa_age", "ssa_year", "ssa_life_expectancy"]
out_list.append(odf)
index_list.append(i)
if out_list:
out_df = pd.concat(out_list, ignore_index=True)
out_df["original_index"] = index_list
out_df.set_index("original_index", drop=True, inplace=True)
else:
out_df = pd.DataFrame()
rdf = df.join(out_df)
return rdf
lost_years_ssa = LostYearsSSAData.lost_years_ssa
[docs]
def main(argv: list[str] = sys.argv[1:]) -> int:
title = "Appends Lost Years data column(s) by age, sex and year"
parser = argparse.ArgumentParser(description=title)
parser.add_argument("input", default=None, help="Input file")
parser.add_argument(
"-a",
"--age",
default="age",
help="Columns name of age in the input file(default=`age`)",
)
parser.add_argument(
"-s",
"--sex",
default="sex",
help="Columns name of sex in the input file(default=`sex`)",
)
parser.add_argument(
"-y",
"--year",
default="year",
help="Columns name of year in the input file(default=`year`)",
)
parser.add_argument(
"-o",
"--output",
default="lost-years-output.csv",
help="Output file with Lost Years data column(s)",
)
args = parser.parse_args(argv)
logger.debug(args)
df = pd.read_csv(args.input)
if not column_exists(df, args.age):
logger.error(f"Column: `{args.age!s}` not found in the input file")
return -1
if not column_exists(df, args.sex):
logger.error(f"Column: `{args.sex!s}` not found in the input file")
return -1
if not column_exists(df, args.year):
logger.error(f"Column: `{args.year!s}` not found in the input file")
return -1
rdf = lost_years_ssa(df, cols={"age": args.age, "sex": args.sex, "year": args.year})
logger.info(f"Saving output to file: `{args.output:s}`")
rdf.columns = fixup_columns(rdf.columns) # type: ignore[arg-type]
rdf.to_csv(args.output, index=False)
return 0
if __name__ == "__main__":
sys.exit(main())