# upbeatBytes/tests/test_enrich.py

import pytest

from goodnews.db import connect, init_db
from goodnews.enrich import og_image_from_html, _host_is_public, enrich_brief_images


def test_og_image_parser_handles_attr_order_and_fallback():
    """Check og_image_from_html across attribute orders, the twitter:image tag, and no tag at all."""
    expectations = (
        # property attribute first, content second
        (b'<head><meta property="og:image" content="https://x.com/a.jpg"></head>', "https://x.com/a.jpg"),
        # content attribute first, property second
        (b'<meta content="https://x.com/b.jpg" property="og:image">', "https://x.com/b.jpg"),
        # only a twitter:image tag is present
        (b'<meta name="twitter:image" content="https://x.com/c.jpg">', "https://x.com/c.jpg"),
    )
    for markup, wanted in expectations:
        assert og_image_from_html(markup) == wanted

    # No image meta tag anywhere in the document yields None.
    no_image_markup = b"<html><body>nope</body></html>"
    assert og_image_from_html(no_image_markup) is None


def test_og_image_only_reads_head():
    """A meta tag that appears after the closing </head> must not be picked up."""
    markup = b'<head></head><meta property="og:image" content="https://x.com/d.jpg">'
    result = og_image_from_html(markup)
    assert result is None


def test_host_public_guard_blocks_internal_ranges():
    """_host_is_public accepts a public address and rejects internal, empty, and missing hosts."""
    assert _host_is_public("8.8.8.8"), "public address should be accepted"

    # Each entry pairs a host with the reason it is expected to be rejected.
    rejected_hosts = (
        ("127.0.0.1", "loopback"),
        ("10.0.0.1", "private"),
        ("192.168.1.1", "private"),
        ("169.254.0.1", "link-local"),
        ("0.0.0.0", "unspecified"),
        ("", "empty string"),
        (None, "missing host"),
    )
    for host, reason in rejected_hosts:
        assert not _host_is_public(host), reason


@pytest.fixture
def conn():
    """Yield a ":memory:" connection seeded with one source, one article, and one brief item.

    The brief is dated 2026-05-31 and links article 1 at rank 1. The
    connection is closed once the test using the fixture has finished.
    """
    db = connect(":memory:")
    init_db(db)

    seed_statements = (
        "INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)",
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'https://phys.org/x','t1','h1')",
        "INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')",
        "INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)",
    )
    for statement in seed_statements:
        db.execute(statement)
    db.commit()

    yield db

    db.close()


def test_enrich_sets_image_and_stamps(conn):
    """A successful fetch stores the image URL and records that the article was checked."""
    requested_urls = []

    def fake_fetch(url):
        # Record the URL that enrichment asked for, then report a found image.
        requested_urls.append(url)
        return "https://img.example/p.jpg"

    found = enrich_brief_images(conn, "2026-05-31", fetch=fake_fetch)
    assert found == 1
    assert requested_urls == ["https://phys.org/x"]

    row = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
    assert row["image_url"] == "https://img.example/p.jpg"
    assert row["image_checked_at"] is not None


def test_enrich_caches_failure_and_does_not_retry(conn):
    """A fetch that finds nothing is still stamped as checked, and a second run does not fetch again."""
    attempts = []

    def failing_fetch(url):
        # Record the attempt and report that no image was found.
        attempts.append(url)
        return None

    first_found = enrich_brief_images(conn, "2026-05-31", fetch=failing_fetch)
    assert first_found == 0

    row = conn.execute("SELECT image_url, image_checked_at FROM articles WHERE id=1").fetchone()
    assert row["image_url"] is None
    assert row["image_checked_at"] is not None  # checked, cached

    second_found = enrich_brief_images(conn, "2026-05-31", fetch=failing_fetch)
    assert second_found == 0
    assert len(attempts) == 1  # not retried once checked


def test_enrich_upgrades_existing_feed_image(tmp_path):
    """A brief item that already has a (small) feed image is upgraded to the fetched og:image.

    ``tmp_path`` is requested but not used by this test; it is kept so the
    test's signature is unchanged.
    """
    # Use the module-level connect/init_db rather than re-importing them
    # under local aliases.
    db = connect(":memory:")
    try:
        init_db(db)
        db.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
        db.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
                   "VALUES (1,1,'https://bbc.com/x','t1','h1','https://bbc.com/tiny-thumb.jpg')")
        db.execute("INSERT INTO daily_briefs (id,brief_date,title) VALUES (1,'2026-05-31','B')")
        db.execute("INSERT INTO daily_brief_items (brief_id,article_id,rank) VALUES (1,1,1)")
        db.commit()

        found = enrich_brief_images(db, "2026-05-31", fetch=lambda u: "https://bbc.com/big-og.jpg")
        assert found == 1

        row = db.execute("SELECT image_url FROM articles WHERE id=1").fetchone()
        assert row["image_url"] == "https://bbc.com/big-og.jpg"
    finally:
        # Close the connection even when setup or an assertion above fails.
        db.close()


def test_rejects_generic_share_images():
    """Placeholder share images are skipped, real ones kept, and BBC branded paths rewritten."""
    # Genuine placeholder/default share images are skipped.
    npr_default = b'<meta name="twitter:image" content="https://media.npr.org/include/images/facebook-default-wide-s.jpg">'
    og_default = b'<meta property="og:image" content="https://x.com/og-default.jpg">'
    assert og_image_from_html(npr_default) is None
    assert og_image_from_html(og_default) is None

    # A real article image comes through untouched.
    real_photo = b'<meta property="og:image" content="https://x.com/real-photo.jpg">'
    assert og_image_from_html(real_photo) == "https://x.com/real-photo.jpg"

    # BBC's branded_news path is a real photo (logo baked in): it is kept, but
    # swapped to the clean cpsprodpb variant so the hero isn't branded.
    bbc_branded = b'<meta property="og:image" content="https://ichef.bbci.co.uk/news/1024/branded_news/x.jpg">'
    bbc_clean = "https://ichef.bbci.co.uk/news/1024/cpsprodpb/x.jpg"
    assert og_image_from_html(bbc_branded) == bbc_clean

    # When the first og:image is a generic placeholder but a later one is
    # real, the real one is returned.
    placeholder_then_real = (
        b'<meta property="og:image" content="https://x.com/og-default.jpg">'
        b'<meta property="og:image" content="https://x.com/article.jpg">'
    )
    assert og_image_from_html(placeholder_then_real) == "https://x.com/article.jpg"