# Source code for statqa.qa.templates

"""
Question templates for Q/A pair generation.

Defines templates for converting facts into question/answer pairs.
"""

from enum import Enum
from typing import Any



class QuestionType(str, Enum):
    """
    Categories of question that a template can produce.

    Members also subclass ``str``, so each one compares equal to its
    lowercase value (for example ``QuestionType.CAUSAL == "causal"``).
    The comment above each member shows the kind of question it stands for.
    """

    # "What is the average...?"
    DESCRIPTIVE = "descriptive"

    # "How does X compare to Y...?"
    COMPARATIVE = "comparative"

    # "Has X changed over time...?"
    TEMPORAL = "temporal"

    # "What is the effect of X on Y...?"
    CAUSAL = "causal"

    # "Are X and Y related...?"
    CORRELATIONAL = "correlational"

    # "What is the distribution of...?"
    DISTRIBUTIONAL = "distributional"




class QuestionTemplate:
    """
    Template for generating questions from statistical insights.

    A template is bound to a single question type. ``generate`` returns
    several differently worded questions of that type, each paired with the
    same answer text.

    Args:
        question_type: Type of question to generate
    """

    def __init__(self, question_type: QuestionType) -> None:
        self.question_type = question_type

    def generate(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """
        Generate question/answer pairs from an insight.

        Args:
            insight: Statistical insight dictionary
            answer: Formatted natural language answer

        Returns:
            List of Q/A pair dictionaries, each with ``question``, ``answer``
            and ``type`` keys. The list may be empty when the insight lacks
            the fields the question type relies on.

        Raises:
            ValueError: If question type is not supported
        """
        generators = {
            QuestionType.DESCRIPTIVE: self._generate_descriptive,
            QuestionType.COMPARATIVE: self._generate_comparative,
            QuestionType.TEMPORAL: self._generate_temporal,
            QuestionType.CAUSAL: self._generate_causal,
            QuestionType.CORRELATIONAL: self._generate_correlational,
            QuestionType.DISTRIBUTIONAL: self._generate_distributional,
        }
        try:
            generator = generators[self.question_type]
        except (KeyError, TypeError):
            # KeyError: value is not one of the known types.
            # TypeError: value is unhashable, so it cannot be a known type either.
            raise ValueError(f"Unknown question type: {self.question_type}") from None
        return generator(insight, answer)

    @staticmethod
    def _pairs(prompts: list[str], answer: str, kind: str) -> list[dict[str, str]]:
        """Pair every question in ``prompts`` with the shared answer and type tag."""
        return [{"question": prompt, "answer": answer, "type": kind} for prompt in prompts]

    @staticmethod
    def _field(insight: dict[str, Any], key: str, default: Any) -> Any:
        """Return ``insight[key]``, or ``default`` if it is missing, ``None`` or blank."""
        value = insight.get(key)
        # A key that is present but None/blank would otherwise be rendered
        # literally into the question text (e.g. "the average None").
        if value is None or (isinstance(value, str) and not value.strip()):
            return default
        return value

    @classmethod
    def _variable_label(cls, insight: dict[str, Any]) -> Any:
        """Return the display name: ``label``, else ``variable``, else "Variable"."""
        return cls._field(insight, "label", cls._field(insight, "variable", "Variable"))

    def _generate_descriptive(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate descriptive questions (univariate statistics)."""
        var_label = self._variable_label(insight)
        prompts: list[str] = []

        if "mean" in insight:
            prompts += [
                f"What is the average {var_label}?",
                f"What is the mean value of {var_label}?",
                f"Describe the central tendency of {var_label}.",
            ]

        if "mode" in insight:
            prompts += [
                f"What is the most common category for {var_label}?",
                f"Which {var_label} value appears most frequently?",
            ]

        return self._pairs(prompts, answer, "descriptive")

    def _generate_comparative(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate comparative questions (group comparisons)."""
        var_cat = insight.get("var_categorical")
        var_num = insight.get("var_numeric")

        # Both variable names are required; without them no question is produced.
        if not (var_cat and var_num):
            return []

        prompts = [
            f"How does {var_num} differ across {var_cat} groups?",
            f"What is the relationship between {var_cat} and {var_num}?",
            f"Does {var_num} vary by {var_cat}?",
        ]
        return self._pairs(prompts, answer, "comparative")

    def _generate_temporal(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate temporal questions (trends over time)."""
        value_var = self._field(insight, "value_variable", "Variable")
        time_var = self._field(insight, "time_variable", "time")

        prompts = [
            f"How has {value_var} changed over {time_var}?",
            f"Is there a trend in {value_var} over {time_var}?",
            f"Has {value_var} increased or decreased over {time_var}?",
        ]
        return self._pairs(prompts, answer, "temporal")

    def _generate_causal(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """
        Generate causal questions (treatment effects).

        When the insight lists control variables, an extra question that
        names them is appended after the two base questions.
        """
        treatment = self._field(insight, "treatment", "Treatment")
        outcome = self._field(insight, "outcome", "Outcome")

        prompts = [
            f"What is the effect of {treatment} on {outcome}?",
            f"How does {treatment} affect {outcome}?",
        ]

        controls = insight.get("controls") or []
        if isinstance(controls, str):
            # A bare string is one control name; joining it directly would
            # split it into single characters.
            controls = [controls]
        # str.join only accepts strings, so stringify entries and drop None.
        control_names = [str(control) for control in controls if control is not None]

        if control_names:
            controls_str = ", ".join(control_names)
            prompts.append(
                f"Controlling for {controls_str}, what is the effect of {treatment} on {outcome}?"
            )

        return self._pairs(prompts, answer, "causal")

    def _generate_correlational(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate correlational questions (associations)."""
        var1 = self._field(insight, "var1", "Variable 1")
        var2 = self._field(insight, "var2", "Variable 2")

        prompts = [
            f"Are {var1} and {var2} correlated?",
            f"What is the relationship between {var1} and {var2}?",
            f"How strongly are {var1} and {var2} associated?",
        ]
        return self._pairs(prompts, answer, "correlational")

    def _generate_distributional(
        self, insight: dict[str, Any], answer: str
    ) -> list[dict[str, str]]:
        """Generate distributional questions (shape, spread)."""
        var_label = self._variable_label(insight)
        prompts: list[str] = []

        if "std" in insight or "skewness" in insight:
            prompts += [
                f"What is the distribution of {var_label}?",
                f"How variable is {var_label}?",
            ]

        if "frequencies" in insight:
            prompts.append(f"What is the frequency distribution of {var_label}?")

        return self._pairs(prompts, answer, "distributional")




def infer_question_type(insight: dict[str, Any]) -> QuestionType:
    """
    Pick the question type that matches an insight.

    Rules are tried in a fixed priority order: temporal, causal,
    correlational, comparative, distributional. A rule matches when the
    insight's ``analysis_type`` is one of the rule's values or when the
    insight contains one of the rule's marker keys. The first match wins.

    Args:
        insight: Statistical insight dictionary

    Returns:
        The first matching question type, or ``QuestionType.DESCRIPTIVE``
        when no rule matches
    """
    kind = insight.get("analysis_type", "")

    # (question type, matching analysis_type values, marker keys), highest priority first
    rules = (
        (QuestionType.TEMPORAL, ("temporal_trend", "year_over_year"), ("mann_kendall",)),
        (QuestionType.CAUSAL, ("treatment_effect",), ("treatment_effect",)),
        (QuestionType.CORRELATIONAL, ("numeric_numeric",), ("pearson",)),
        (QuestionType.COMPARATIVE, ("categorical_numeric",), ("group_stats",)),
        # Distributional has no analysis_type of its own; only marker keys select it.
        (QuestionType.DISTRIBUTIONAL, (), ("skewness", "frequencies")),
    )

    for question_type, analysis_types, marker_keys in rules:
        if kind in analysis_types or any(key in insight for key in marker_keys):
            return question_type

    # Nothing matched: fall back to descriptive.
    return QuestionType.DESCRIPTIVE