Source code for statqa.qa.templates

"""
Question templates for Q/A pair generation.

Defines templates for converting facts into question/answer pairs.
"""

from enum import Enum
from typing import Any


class QuestionType(str, Enum):
    """
    Types of questions that can be generated.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (for example ``QuestionType.CAUSAL == "causal"``).
    """

    DESCRIPTIVE = "descriptive"  # What is the average...?
    COMPARATIVE = "comparative"  # How does X compare to Y...?
    TEMPORAL = "temporal"  # Has X changed over time...?
    CAUSAL = "causal"  # What is the effect of X on Y...?
    CORRELATIONAL = "correlational"  # Are X and Y related...?
    DISTRIBUTIONAL = "distributional"  # What is the distribution of...?
class QuestionTemplate:
    """
    Template for generating questions from statistical insights.

    Each question type has a fixed set of phrasings. Every phrasing produced
    for an insight is paired with the same pre-formatted answer.

    Args:
        question_type: Type of question to generate
    """

    def __init__(self, question_type: "QuestionType") -> None:
        self.question_type = question_type

    def generate(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """
        Generate question/answer pairs from an insight.

        Args:
            insight: Statistical insight dictionary
            answer: Formatted natural language answer

        Returns:
            List of Q/A pair dictionaries, each with the keys ``question``,
            ``answer`` and ``type``. The list may be empty when the insight
            lacks the keys the selected question type looks for.

        Raises:
            ValueError: If question type is not supported
        """
        handlers = (
            (QuestionType.DESCRIPTIVE, self._generate_descriptive),
            (QuestionType.COMPARATIVE, self._generate_comparative),
            (QuestionType.TEMPORAL, self._generate_temporal),
            (QuestionType.CAUSAL, self._generate_causal),
            (QuestionType.CORRELATIONAL, self._generate_correlational),
            (QuestionType.DISTRIBUTIONAL, self._generate_distributional),
        )
        # Match with == rather than a dict lookup so that any value comparing
        # equal to a member is accepted and unhashable values still end in
        # the ValueError below.
        for kind, handler in handlers:
            if self.question_type == kind:
                return handler(insight, answer)

        # This should never be reached due to enum constraint
        raise ValueError(f"Unknown question type: {self.question_type}")

    @staticmethod
    def _qa_pairs(questions: list[str], answer: str, kind: str) -> list[dict[str, str]]:
        """Pair every question with the shared answer and a type tag."""
        return [{"question": text, "answer": answer, "type": kind} for text in questions]

    @staticmethod
    def _variable_name(insight: dict[str, Any]) -> Any:
        """Return the insight's label, else its variable name, else "Variable"."""
        for key in ("label", "variable"):
            name = insight.get(key)
            # A key that is present but None/empty must not leak into the
            # question text (e.g. "What is the average None?").
            if name is not None and name != "":
                return name
        return "Variable"

    def _generate_descriptive(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate descriptive questions (univariate statistics)."""
        var_label = self._variable_name(insight)
        phrasings: list[str] = []

        if "mean" in insight:
            phrasings += [
                f"What is the average {var_label}?",
                f"What is the mean value of {var_label}?",
                f"Describe the central tendency of {var_label}.",
            ]

        if "mode" in insight:
            phrasings += [
                f"What is the most common category for {var_label}?",
                f"Which {var_label} value appears most frequently?",
            ]

        return self._qa_pairs(phrasings, answer, "descriptive")

    def _generate_comparative(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate comparative questions (group comparisons)."""
        var_cat = insight.get("var_categorical")
        var_num = insight.get("var_numeric")

        # Both variables are required; without them no question is produced.
        if not (var_cat and var_num):
            return []

        return self._qa_pairs(
            [
                f"How does {var_num} differ across {var_cat} groups?",
                f"What is the relationship between {var_cat} and {var_num}?",
                f"Does {var_num} vary by {var_cat}?",
            ],
            answer,
            "comparative",
        )

    def _generate_temporal(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate temporal questions (trends over time)."""
        value_var = insight.get("value_variable", "Variable")
        time_var = insight.get("time_variable", "time")

        return self._qa_pairs(
            [
                f"How has {value_var} changed over {time_var}?",
                f"Is there a trend in {value_var} over {time_var}?",
                f"Has {value_var} increased or decreased over {time_var}?",
            ],
            answer,
            "temporal",
        )

    def _generate_causal(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate causal questions (treatment effects)."""
        treatment = insight.get("treatment", "Treatment")
        outcome = insight.get("outcome", "Outcome")
        controls = insight.get("controls", [])

        phrasings = [
            f"What is the effect of {treatment} on {outcome}?",
            f"How does {treatment} affect {outcome}?",
        ]

        if controls:
            # A bare string is one control, not a sequence of characters, and
            # entries are stringified so non-string names do not break join().
            names = [controls] if isinstance(controls, str) else controls
            controls_str = ", ".join(str(name) for name in names)
            phrasings.append(
                f"Controlling for {controls_str}, what is the effect of {treatment} on {outcome}?"
            )

        return self._qa_pairs(phrasings, answer, "causal")

    def _generate_correlational(self, insight: dict[str, Any], answer: str) -> list[dict[str, str]]:
        """Generate correlational questions (associations)."""
        var1 = insight.get("var1", "Variable 1")
        var2 = insight.get("var2", "Variable 2")

        return self._qa_pairs(
            [
                f"Are {var1} and {var2} correlated?",
                f"What is the relationship between {var1} and {var2}?",
                f"How strongly are {var1} and {var2} associated?",
            ],
            answer,
            "correlational",
        )

    def _generate_distributional(
        self, insight: dict[str, Any], answer: str
    ) -> list[dict[str, str]]:
        """Generate distributional questions (shape, spread)."""
        var_label = self._variable_name(insight)
        phrasings: list[str] = []

        if "std" in insight or "skewness" in insight:
            phrasings += [
                f"What is the distribution of {var_label}?",
                f"How variable is {var_label}?",
            ]

        if "frequencies" in insight:
            phrasings.append(f"What is the frequency distribution of {var_label}?")

        return self._qa_pairs(phrasings, answer, "distributional")
def infer_question_type(insight: dict[str, Any]) -> QuestionType:
    """
    Infer the appropriate question type from an insight.

    Rules are checked in a fixed priority order (temporal, causal,
    correlational, comparative, distributional) and the first match wins.
    An insight matching none of them is treated as descriptive.

    Args:
        insight: Statistical insight dictionary

    Returns:
        Inferred question type
    """
    analysis_type = insight.get("analysis_type", "")

    # Temporal
    if analysis_type in ("temporal_trend", "year_over_year") or "mann_kendall" in insight:
        return QuestionType.TEMPORAL

    # Causal
    if analysis_type == "treatment_effect" or "treatment_effect" in insight:
        return QuestionType.CAUSAL

    # Correlational
    if analysis_type == "numeric_numeric" or "pearson" in insight:
        return QuestionType.CORRELATIONAL

    # Comparative
    if analysis_type == "categorical_numeric" or "group_stats" in insight:
        return QuestionType.COMPARATIVE

    # Distributional
    if "skewness" in insight or "frequencies" in insight:
        return QuestionType.DISTRIBUTIONAL

    # Default to descriptive
    return QuestionType.DESCRIPTIVE