# upbeatBytes/goodnews/taxonomy.py

"""Single source of truth for article topic/flavor categories.

Both the LLM response schema (enum constraints) and the post-hoc validation in
normalize_scores import from here, so the allowed values can never drift apart.
Adjusting a category here + re-running `classify` is all it takes to reshape the
browsable feeds.
"""

from __future__ import annotations


# Primary topic — exactly one per article. Used for ranking, brief balance, and
# source reports (the "machine organization" axis).
# Keys are the topic slugs that coerce_topic() accepts; values are short
# descriptions that topics_prompt_block() renders as "- slug: description"
# lines, in the insertion order below.
TOPICS: dict[str, str] = {
    "science": "research, discoveries, space, physics",
    "technology": "computing, AI, engineering, gadgets, digital tools",
    "environment": "conservation, climate solutions, ecosystems, clean energy",
    "health": "medicine, wellbeing, mental health, public health",
    "community": "local action, humanitarian work, social progress, kindness, fair work",
    "culture": "arts, history, heritage, sport, human-interest",
    "animals": "wildlife, nature discoveries, charming animal stories",
    "learning": "education, personal growth, practical knowledge, curiosity",
}

# Groupings — 1–4 per article, the "human wandering" axis. A controlled
# vocabulary (never free-form) organised into calm families for the Explore UI.
# Families live in code, not the DB. Tag slugs are lowercase, hyphenated.
# Shape: family display name -> {"description": str, "tags": list[str]}.
# tags_prompt_block() renders one "- family: tag, tag, ..." line per entry, and
# ALLOWED_TAGS is derived from the "tags" lists in the order written here.
FAMILIES: dict[str, dict] = {
    "Discovery & Wonder": {
        "description": "Awe, science, and the natural world.",
        "tags": ["science", "space", "animals", "nature", "archaeology", "technology", "curiosity"],
    },
    "People & Kindness": {
        "description": "Community, generosity, and human warmth.",
        "tags": ["community", "helping", "culture", "generosity", "resilience", "local-wins"],
    },
    "Solutions & Progress": {
        "description": "Problems being solved.",
        "tags": ["environment", "climate-solutions", "public-health", "cities", "clean-energy", "innovation"],
    },
    "Mind & Craft": {
        "description": "Ideas, learning, and making.",
        "tags": ["learning", "ideas", "arts", "books", "creativity", "perspective", "work-life", "food"],
    },
}

# Flat allowed-tag set (union of all families), for enum + validation.
# dict.fromkeys() drops any tag that appears in more than one family while
# keeping first-seen order, so the tuple is duplicate-free and its order
# follows FAMILIES.
ALLOWED_TAGS: tuple[str, ...] = tuple(dict.fromkeys(t for f in FAMILIES.values() for t in f["tags"]))
# Upper bound on tags kept per article; coerce_tags() uses it as the default
# for its max_tags parameter.
MAX_TAGS = 4

# Tonal axis: why the story is worth surfacing in a calm, uplifting digest.
# Keys are the flavor slugs that coerce_flavor() accepts; values are the
# descriptions that flavors_prompt_block() renders as "- slug: description"
# lines, in the insertion order below.
FLAVORS: dict[str, str] = {
    "breakthrough": "a significant advance or innovation with clear public benefit",
    "discovery": "newly found or learned; calm and fascinating, low on agency",
    "solution": "people actively repairing, restoring, or solving a problem",
    "feelgood": "a heartwarming human, community, or kindness story",
    "perspective": "useful advice, insight, or framing the reader can apply",
}

# Fallbacks returned by coerce_topic() / coerce_flavor() when the supplied value
# is empty or not a known slug. Each should stay a key of TOPICS / FLAVORS
# respectively so the fallback is itself an allowed value.
DEFAULT_TOPIC = "science"
DEFAULT_FLAVOR = "discovery"


def coerce_topic(value: object) -> str:
    """Normalise an arbitrary value to a known topic slug.

    Falsy input is treated as an empty string; anything else is stringified,
    stripped of surrounding whitespace and lowercased.

    Returns:
        The normalised text when it is a key of ``TOPICS``, otherwise
        ``DEFAULT_TOPIC``.
    """
    raw = str(value) if value else ""
    candidate = raw.strip().lower()
    if candidate in TOPICS:
        return candidate
    return DEFAULT_TOPIC


def coerce_flavor(value: object) -> str:
    """Normalise an arbitrary value to a known flavor slug.

    Falsy input is treated as an empty string; anything else is stringified,
    stripped of surrounding whitespace and lowercased.

    Returns:
        The normalised text when it is a key of ``FLAVORS``, otherwise
        ``DEFAULT_FLAVOR``.
    """
    raw = str(value) if value else ""
    candidate = raw.strip().lower()
    if candidate in FLAVORS:
        return candidate
    return DEFAULT_FLAVOR


def coerce_tags(value: object, max_tags: int = MAX_TAGS) -> list[str]:
    """Validate a model-supplied tag list against the controlled vocabulary.

    Each item is stringified, stripped and lowercased. Items that are not in
    ``ALLOWED_TAGS`` and repeats of an already accepted tag are dropped;
    first-seen order is kept and the result is capped at ``max_tags`` entries.

    Args:
        value: Candidate tag list. Anything that is not a ``list`` yields ``[]``.
        max_tags: Maximum number of tags to return. Zero or a negative value
            yields ``[]``.

    Returns:
        A new list of at most ``max_tags`` distinct allowed tag slugs.
    """
    # Reject a non-positive cap up front: testing the cap only after an append
    # would let one tag through even when the caller asked for none.
    if not isinstance(value, list) or max_tags <= 0:
        return []
    out: list[str] = []
    for item in value:
        tag = str(item).strip().lower()
        if tag in ALLOWED_TAGS and tag not in out:
            out.append(tag)
            # The length only changes on append, so this is the only place
            # the cap can be reached.
            if len(out) >= max_tags:
                break
    return out


def tags_prompt_block() -> str:
    """Render ``FAMILIES`` as one "- family: tag, tag, ..." line per family.

    Lines follow the insertion order of ``FAMILIES`` and are joined with
    newlines; there is no trailing newline.
    """
    lines: list[str] = []
    for family_name, details in FAMILIES.items():
        tag_list = ", ".join(details["tags"])
        lines.append(f"- {family_name}: {tag_list}")
    return "\n".join(lines)


def _bullet_list(mapping: dict[str, str]) -> str:
    return "\n".join(f"- {key}: {desc}" for key, desc in mapping.items())


def topics_prompt_block() -> str:
    """Return ``TOPICS`` rendered as "- slug: description" lines via ``_bullet_list``."""
    return _bullet_list(TOPICS)


def flavors_prompt_block() -> str:
    """Return ``FLAVORS`` rendered as "- slug: description" lines via ``_bullet_list``."""
    return _bullet_list(FLAVORS)