Source code for lost_years.hld

#!/usr/bin/env python
"""
HLD (Human Life-Table Database) module for lost_years package.

Provides access to historical life table data from lifetable.de
covering 142 countries from 1751-2023.
"""

import argparse
import logging
import sys
from importlib.resources import files
from pathlib import Path

import pandas as pd

from .utils import closest, column_exists, fixup_columns

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# HLD Configuration
# Location of the gzip-compressed HLD CSV, resolved relative to the
# `lost_years` package resources (data/hld/hld.csv.gz).
HLD_DATA = files("lost_years") / "data" / "hld" / "hld.csv.gz"
# Subset of CSV columns that is read from the HLD file (passed as `usecols`
# when the table is loaded).
HLD_COLS = [
    "Country",
    "Year1",
    "Sex",
    "Age",
    "e(x)",
]  # Essential columns for life expectancy


[docs] class LostYearsHLDData: """HLD data handler for life table information.""" __df = None @classmethod def lost_years_hld(cls, df: pd.DataFrame, cols: dict[str, str] | None = None) -> pd.DataFrame: """Appends Life expectancy column from HLD data to the input DataFrame based on country, age, sex and year in the specific cols mapping. Args: df: Pandas DataFrame containing the input data. cols: Column mapping for country, age, sex, and year in DataFrame. None for default mapping: {'country': 'country', 'age': 'age', 'sex': 'sex', 'year': 'year'}. Returns: Pandas DataFrame with HLD data columns: 'hld_country', 'hld_age', 'hld_sex', 'hld_year', 'hld_life_expectancy'. """ df_cols = {} for col in ["country", "age", "sex", "year"]: tcol = col if cols is None else cols[col] if tcol not in df.columns: logger.warning(f"No column `{tcol!s}` in the DataFrame") return df df_cols[col] = tcol if cls.__df is None: # Check if HLD data file exists hld_path = Path(str(HLD_DATA)) if not hld_path.exists(): logger.error(f"HLD data file not found: {HLD_DATA}") logger.info("Run: python lost_years/data/hld/update_hld_data.py") logger.info("Or manually download from: https://www.lifetable.de/") return df try: # Load HLD data logger.info("Loading HLD data (this may take a moment for 2M+ records)...") cls.__df = pd.read_csv( str(HLD_DATA), compression="gzip", usecols=HLD_COLS, low_memory=False ) if cls.__df.empty: logger.error("HLD data file is empty") return df # Clean and standardize the data cls.__df = cls.__df.dropna(subset=["Country", "Year1", "Sex", "e(x)"]) # Standardize column names for lookup cls.__df = cls.__df.rename( columns={ "Country": "country", "Year1": "year", "Sex": "sex", "Age": "age", "e(x)": "life_expectancy", } ) # Convert sex codes: 1=Male, 2=Female -> M/F for consistency cls.__df["sex"] = cls.__df["sex"].map({1: "M", 2: "F"}) # Convert data types cls.__df["year"] = pd.to_numeric(cls.__df["year"], errors="coerce") cls.__df["age"] = pd.to_numeric(cls.__df["age"], 
errors="coerce") cls.__df["life_expectancy"] = pd.to_numeric( cls.__df["life_expectancy"], errors="coerce" ) # Remove invalid records cls.__df = cls.__df.dropna() logger.info(f"Loaded HLD data: {len(cls.__df):,} records") logger.info(f"Countries: {cls.__df['country'].nunique()}") year_min = cls.__df["year"].min() year_max = cls.__df["year"].max() logger.info(f"Year range: {year_min:.0f}-{year_max:.0f}") except Exception as e: logger.error(f"Error loading HLD data: {e}") logger.error("The HLD data file may be corrupted or missing.") logger.info("Run: python lost_years/data/hld/update_hld_data.py") return df # Process input data # Convert sex to standard format df_temp = df.copy() df_temp["__temp_sex"] = df_temp[df_cols["sex"]].apply( lambda x: "M" if str(x).lower() in ["m", "male", "1"] else "F" ) out_df = pd.DataFrame() for i, r in df_temp.iterrows(): # Filter HLD data for this record sdf = cls.__df.copy() # Match country (try both exact and close matches) country_val = str(r[df_cols["country"]]).upper() country_matches = sdf[sdf["country"].str.upper() == country_val] if country_matches.empty: # Try partial matching for country codes country_matches = sdf[ sdf["country"].str.upper().str.contains(country_val, na=False) ] if country_matches.empty: # No country match found, skip this record empty_row = pd.DataFrame( { "hld_country": [None], "hld_age": [None], "hld_sex": [None], "hld_year": [None], "hld_life_expectancy": [None], "index": [i], } ) out_df = pd.concat([out_df, empty_row]) continue sdf = country_matches # Match other dimensions for c, col_name in [ ("sex", "__temp_sex"), ("age", df_cols["age"]), ("year", df_cols["year"]), ]: if c == "sex": target_val = r[col_name] else: target_val = r[col_name] if sdf[c].dtype in ["int32", "int64", "float64"]: # Numeric matching with closest value sdf = sdf[sdf[c] == closest(sdf[c].unique(), target_val)] else: # String matching sdf = sdf[sdf[c].astype(str).str.upper() == str(target_val).upper()] # Get the best match if not 
sdf.empty: # If multiple matches, take the first one best_match = sdf.iloc[0] odf = pd.DataFrame( { "hld_country": [best_match["country"]], "hld_age": [best_match["age"]], "hld_sex": [best_match["sex"]], "hld_year": [best_match["year"]], "hld_life_expectancy": [best_match["life_expectancy"]], "index": [i], } ) else: # No match found odf = pd.DataFrame( { "hld_country": [None], "hld_age": [None], "hld_sex": [None], "hld_year": [None], "hld_life_expectancy": [None], "index": [i], } ) out_df = pd.concat([out_df, odf]) # Clean up and join with original data out_df.set_index("index", drop=True, inplace=True) out_df = out_df.fillna("") # Replace NaN with empty string for cleaner output # Join with original DataFrame result_df = df.join(out_df) return result_df
# Export the function lost_years_hld = LostYearsHLDData.lost_years_hld
[docs] def main(argv: list[str] = sys.argv[1:]) -> int: """Main CLI function.""" title = "Appends Lost Years data from HLD (Human Life-Table Database)" parser = argparse.ArgumentParser(description=title) parser.add_argument("input", default=None, help="Input file") parser.add_argument( "-c", "--country", default="country", help="Column name of country in the input file (default=`country`)", ) parser.add_argument( "-a", "--age", default="age", help="Column name of age in the input file (default=`age`)", ) parser.add_argument( "-s", "--sex", default="sex", help="Column name of sex in the input file (default=`sex`)", ) parser.add_argument( "-y", "--year", default="year", help="Column name of year in the input file (default=`year`)", ) parser.add_argument( "-o", "--output", default="lost-years-hld-output.csv", help="Output file with Lost Years HLD data", ) args = parser.parse_args(argv) logger.debug(args) df = pd.read_csv(args.input) # Validate columns for _col_name, col_arg in [ ("country", args.country), ("age", args.age), ("sex", args.sex), ("year", args.year), ]: if not column_exists(df, col_arg): logger.error(f"Column: `{col_arg!s}` not found in the input file") return -1 # Apply HLD lookup result_df = lost_years_hld( df, cols={ "country": args.country, "age": args.age, "sex": args.sex, "year": args.year, }, ) # Save output logger.info(f"Saving output to file: `{args.output:s}`") result_df.columns = fixup_columns(result_df.columns.tolist()) result_df.to_csv(args.output, index=False) return 0
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())