Migrate to normalized FK schema (domain_host, domain_tld)

Replace source_host column with proper FK to domains table using
composite key (domain_host, domain_tld). This enables JOIN queries
instead of string concatenation for domain lookups.

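For illustration, here is the shape of the change this describes: a lookup that previously had to rebuild the hostname from source_host and tld can join feeds to domains on the composite key instead. This is a sketch, not code from the commit; the package name, function name, and exact query shapes are assumptions, and pgx (as used elsewhere in the repo via pool.Exec) is assumed.

// Sketch only: query shapes assumed from the commit message, not taken from the diff.
package db

import (
    "context"

    "github.com/jackc/pgx/v5/pgxpool"
)

// FeedURLsForDomain looks up feed URLs for one domain via the composite FK.
// Old shape (string concatenation):  WHERE source_host || '.' || tld = $1
// New shape (composite-key join):    JOIN domains ON (domain_host, domain_tld)
func FeedURLsForDomain(ctx context.Context, pool *pgxpool.Pool, host, tld string) ([]string, error) {
    rows, err := pool.Query(ctx, `
        SELECT f.url
        FROM feeds f
        JOIN domains d ON d.host = f.domain_host AND d.tld = f.domain_tld
        WHERE d.host = $1 AND d.tld = $2`, host, tld)
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    var urls []string
    for rows.Next() {
        var u string
        if err := rows.Scan(&u); err != nil {
            return nil, err
        }
        urls = append(urls, u)
    }
    return urls, rows.Err()
}
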
Changes:
- Update Feed struct: SourceHost/TLD → DomainHost/DomainTLD
- Update all SQL queries to use domain_host/domain_tld columns
- Add column aliases (e.g. domain_host AS source_host) for API backwards compatibility
- Update trigram index from source_host to domain_host
- Add getDomainHost() helper for extracting host from domain (see the sketch below)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
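
A sketch of the Go-side changes listed above (the struct field rename, the backwards-compatible column aliases, and the getDomainHost() helper). Everything beyond the names that appear in the message itself is an assumption, and the real Feed struct has many more fields.

// Sketch only: field set, package name, and helper behaviour are assumptions.
package db

import "strings"

type Feed struct {
    URL        string
    DomainHost string // renamed from SourceHost
    DomainTLD  string // renamed from the old TLD field
    // remaining columns omitted
}

// Queries can keep the old API shape by aliasing the new columns, e.g.:
const selectFeedCompat = `
    SELECT url, domain_host AS source_host, domain_tld AS tld
    FROM feeds WHERE url = $1`

// getDomainHost returns the host portion of a full domain, e.g.
// "example.com" -> "example". A naive last-dot split; the real helper
// may handle multi-label TLDs or other edge cases differently.
func getDomainHost(domain string) string {
    if i := strings.LastIndex(domain, "."); i > 0 {
        return domain[:i]
    }
    return domain
}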
This commit is contained in: primal
2026-02-01 22:36:25 -05:00
parent e7f6be2203, commit 7ec4207173
12 changed files with 193 additions and 214 deletions
@@ -36,14 +36,17 @@ CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING GIN(host gin_t
CREATE TABLE IF NOT EXISTS feeds (
url TEXT PRIMARY KEY,
domain_host TEXT NOT NULL,
domain_tld tld_enum NOT NULL,
type TEXT,
category TEXT DEFAULT 'main',
title TEXT,
description TEXT,
language TEXT,
site_url TEXT,
source_url TEXT,
discovered_at TIMESTAMP NOT NULL,
discovered_at TIMESTAMP NOT NULL DEFAULT NOW(),
last_checked_at TIMESTAMP, -- feed_check: when last checked for new items
next_check_at TIMESTAMP, -- feed_check: when to next check
last_build_date TIMESTAMP,
@@ -51,134 +54,67 @@ CREATE TABLE IF NOT EXISTS feeds (
etag TEXT,
last_modified TEXT,
status TEXT DEFAULT 'pass' CHECK(status IN ('hold', 'pass', 'skip')),
status TEXT NOT NULL DEFAULT 'pass',
last_error TEXT,
last_error_at TIMESTAMP,
source_url TEXT,
source_host TEXT,
tld TEXT,
item_count INTEGER,
item_count INTEGER NOT NULL DEFAULT 0,
oldest_item_date TIMESTAMP,
newest_item_date TIMESTAMP,
no_update INTEGER DEFAULT 0,
no_update INTEGER NOT NULL DEFAULT 0,
-- Publishing to PDS
publish_status TEXT DEFAULT 'hold' CHECK(publish_status IN ('hold', 'pass', 'skip')),
publish_status TEXT NOT NULL DEFAULT 'hold',
publish_account TEXT,
-- Full-text search vector
search_vector tsvector GENERATED ALWAYS AS (
setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
setweight(to_tsvector('english', coalesce(url, '')), 'C')
) STORED
FOREIGN KEY (domain_host, domain_tld) REFERENCES domains(host, tld)
);
CREATE INDEX IF NOT EXISTS idx_feeds_source_host ON feeds(source_host);
CREATE INDEX IF NOT EXISTS idx_feeds_publish_status ON feeds(publish_status);
CREATE INDEX IF NOT EXISTS idx_feeds_source_host_url ON feeds(source_host, url);
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
CREATE INDEX IF NOT EXISTS idx_feeds_tld_source_host ON feeds(tld, source_host);
CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING GIN(source_host gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
CREATE INDEX IF NOT EXISTS idx_feeds_discovered_at ON feeds(discovered_at);
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
CREATE INDEX IF NOT EXISTS idx_feeds_search ON feeds USING GIN(search_vector);
-- idx_feeds_to_check created in migrations after column rename
-- Indexes will be added as needed based on query patterns
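
The generated search_vector column and its GIN index above give ranked full-text search over title, description, and url. An illustrative query (imports as in the earlier sketch; the function name and ranking choice are assumptions):

// Sketch: ranked full-text match against the generated search_vector column.
// websearch_to_tsquery requires PostgreSQL 11+.
func BestFeedMatch(ctx context.Context, pool *pgxpool.Pool, q string) (string, error) {
    var url string
    err := pool.QueryRow(ctx, `
        SELECT url
        FROM feeds
        WHERE search_vector @@ websearch_to_tsquery('english', $1)
        ORDER BY ts_rank(search_vector, websearch_to_tsquery('english', $1)) DESC
        LIMIT 1`, q).Scan(&url)
    return url, err
}
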
CREATE TABLE IF NOT EXISTS items (
id BIGSERIAL PRIMARY KEY,
feed_url TEXT NOT NULL,
guid TEXT,
guid TEXT NOT NULL,
feed_url TEXT NOT NULL REFERENCES feeds(url) ON DELETE CASCADE,
title TEXT,
link TEXT,
description TEXT,
content TEXT,
author TEXT,
pub_date TIMESTAMP,
discovered_at TIMESTAMP NOT NULL,
discovered_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP,
-- Media attachments
enclosure_url TEXT,
enclosure_type TEXT,
enclosure_length BIGINT,
image_urls TEXT, -- JSON array of image URLs
tags TEXT, -- JSON array of category/tag strings
image_urls JSONB,
tags JSONB,
-- Publishing to PDS
published_at TIMESTAMP,
published_uri TEXT,
-- Full-text search vector
search_vector tsvector GENERATED ALWAYS AS (
setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
setweight(to_tsvector('english', coalesce(description, '')), 'B') ||
setweight(to_tsvector('english', coalesce(content, '')), 'C') ||
setweight(to_tsvector('english', coalesce(author, '')), 'D')
) STORED,
UNIQUE(feed_url, guid)
PRIMARY KEY (guid, feed_url)
);
CREATE INDEX IF NOT EXISTS idx_items_feed_url ON items(feed_url);
CREATE INDEX IF NOT EXISTS idx_items_pub_date ON items(pub_date DESC);
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
CREATE INDEX IF NOT EXISTS idx_items_feed_url_pub_date ON items(feed_url, pub_date DESC);
CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feed_url, published_at) WHERE published_at IS NULL;
CREATE INDEX IF NOT EXISTS idx_items_search ON items USING GIN(search_vector);
-- Indexes will be added as needed based on query patterns
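
With the composite primary key (guid, feed_url) and the JSONB tags column above, item ingestion can be made idempotent with ON CONFLICT. A sketch, not the repo's code; the Item struct here carries only a subset of the columns:

// Sketch: idempotent item insert keyed on the (guid, feed_url) primary key.
package db

import (
    "context"
    "encoding/json"
    "time"

    "github.com/jackc/pgx/v5/pgxpool"
)

type Item struct {
    GUID    string
    FeedURL string
    Title   string
    Link    string
    PubDate time.Time
    Tags    []string // stored in the JSONB tags column
}

func UpsertItem(ctx context.Context, pool *pgxpool.Pool, it Item) error {
    tagsJSON, err := json.Marshal(it.Tags) // marshal explicitly so the jsonb encoding is unambiguous
    if err != nil {
        return err
    }
    _, err = pool.Exec(ctx, `
        INSERT INTO items (guid, feed_url, title, link, pub_date, tags)
        VALUES ($1, $2, $3, $4, $5, $6)
        ON CONFLICT (guid, feed_url) DO UPDATE SET
            title    = EXCLUDED.title,
            link     = EXCLUDED.link,
            pub_date = EXCLUDED.pub_date,
            tags     = EXCLUDED.tags`,
        it.GUID, it.FeedURL, it.Title, it.Link, it.PubDate, tagsJSON)
    return err
}
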
-- URL Shortener tables
CREATE TABLE IF NOT EXISTS short_urls (
code TEXT PRIMARY KEY,
original_url TEXT NOT NULL,
item_id BIGINT REFERENCES items(id),
feed_url TEXT,
created_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
click_count INTEGER DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_short_urls_original ON short_urls(original_url);
CREATE INDEX IF NOT EXISTS idx_short_urls_item_id ON short_urls(item_id);
CREATE INDEX IF NOT EXISTS idx_short_urls_feed_url ON short_urls(feed_url);
CREATE TABLE IF NOT EXISTS clicks (
id BIGSERIAL PRIMARY KEY,
short_code TEXT NOT NULL REFERENCES short_urls(code),
clicked_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
referrer TEXT,
user_agent TEXT,
ip_hash TEXT,
country TEXT
);
CREATE INDEX IF NOT EXISTS idx_clicks_short_code ON clicks(short_code);
CREATE INDEX IF NOT EXISTS idx_clicks_clicked_at ON clicks(clicked_at DESC);
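
A sketch of how the shortener tables above might be used together: resolve a code, bump its counter, and log the click. Imports as in the earlier sketches; the function name and the choice to count in the same statement are assumptions.

// Sketch: resolve a short code, increment its counter, and record the click.
func ResolveAndLogClick(ctx context.Context, pool *pgxpool.Pool, code, referrer, userAgent, ipHash string) (string, error) {
    var original string
    err := pool.QueryRow(ctx,
        `UPDATE short_urls SET click_count = click_count + 1 WHERE code = $1 RETURNING original_url`,
        code).Scan(&original)
    if err != nil {
        return "", err // includes pgx.ErrNoRows when the code is unknown
    }
    _, err = pool.Exec(ctx,
        `INSERT INTO clicks (short_code, referrer, user_agent, ip_hash) VALUES ($1, $2, $3, $4)`,
        code, referrer, userAgent, ipHash)
    return original, err
}
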
-- OAuth sessions (persisted for login persistence across deploys)
CREATE TABLE IF NOT EXISTS oauth_sessions (
-- OAuth sessions
CREATE TABLE IF NOT EXISTS sessions (
id TEXT PRIMARY KEY,
did TEXT NOT NULL,
handle TEXT NOT NULL,
created_at TIMESTAMP NOT NULL,
expires_at TIMESTAMP NOT NULL,
access_token TEXT,
access_token TEXT NOT NULL,
refresh_token TEXT,
token_expiry TIMESTAMP,
dpop_private_jwk TEXT,
dpop_authserver_nonce TEXT,
dpop_pds_nonce TEXT,
pds_url TEXT,
authserver_iss TEXT
token_type TEXT NOT NULL DEFAULT 'DPoP',
expires_at TIMESTAMP NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
dpop_nonce TEXT,
dpop_private_jwk TEXT
);
CREATE INDEX IF NOT EXISTS idx_oauth_sessions_expires_at ON oauth_sessions(expires_at);
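
Because expires_at is NOT NULL on the renamed sessions table, expired sessions can be purged in one statement; a minimal sketch (imports as in the earlier sketches, function name assumed):

// Sketch: periodic cleanup of expired sessions.
func DeleteExpiredSessions(ctx context.Context, pool *pgxpool.Pool) (int64, error) {
    tag, err := pool.Exec(ctx, `DELETE FROM sessions WHERE expires_at < NOW()`)
    return tag.RowsAffected(), err
}
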
-- Trigger to normalize feed URLs on insert/update (strips https://, http://, www.)
CREATE OR REPLACE FUNCTION normalize_feed_url()
RETURNS TRIGGER AS $$
@@ -212,8 +148,8 @@ func OpenDatabase(connString string) (*DB, error) {
// Build from individual env vars
host := getEnvOrDefault("DB_HOST", "atproto-postgres")
port := getEnvOrDefault("DB_PORT", "5432")
user := getEnvOrDefault("DB_USER", "news_1440")
dbname := getEnvOrDefault("DB_NAME", "news_1440")
user := getEnvOrDefault("DB_USER", "dba_1440_news")
dbname := getEnvOrDefault("DB_NAME", "db_1440_news")
// Support Docker secrets (password file) or direct password
password := os.Getenv("DB_PASSWORD")
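
The hunk is cut off here, but the pattern that comment describes (read the password from a Docker secret file, else from the environment) usually looks something like the sketch below. The DB_PASSWORD_FILE variable name and the helper are assumptions, not necessarily what this repo does.

// Sketch of the "password file or direct password" pattern; DB_PASSWORD_FILE is assumed.
package db

import (
    "os"
    "strings"
)

func resolveDBPassword() (string, error) {
    if file := os.Getenv("DB_PASSWORD_FILE"); file != "" { // e.g. a Docker secret mount
        b, err := os.ReadFile(file)
        if err != nil {
            return "", err
        }
        return strings.TrimSpace(string(b)), nil
    }
    return os.Getenv("DB_PASSWORD"), nil
}
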
@@ -271,7 +207,7 @@ func OpenDatabase(connString string) (*DB, error) {
// Indexes must match LOWER() used in queries
pool.Exec(ctx, "CREATE EXTENSION IF NOT EXISTS pg_trgm")
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING gin (LOWER(host) gin_trgm_ops)")
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING gin (LOWER(source_host) gin_trgm_ops)")
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_domain_host_trgm ON feeds USING gin (LOWER(domain_host) gin_trgm_ops)")
// Migration: rename feed columns for consistent terminology
// last_crawled_at -> last_checked_at (feed_check = checking feeds for new items)
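
A note on the LOWER() trigram indexes created a few lines up: they are expression indexes, so a query only benefits when it wraps the column in LOWER() the same way. An illustrative substring search over domain_host (imports as in the earlier sketches; function name assumed):

// Sketch: case-insensitive substring search that can use the
// LOWER(domain_host) gin_trgm_ops expression index created above.
func SearchFeedsByHost(ctx context.Context, pool *pgxpool.Pool, fragment string) ([]string, error) {
    rows, err := pool.Query(ctx, `
        SELECT url
        FROM feeds
        WHERE LOWER(domain_host) LIKE '%' || LOWER($1) || '%'`, fragment)
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    var urls []string
    for rows.Next() {
        var u string
        if err := rows.Scan(&u); err != nil {
            return nil, err
        }
        urls = append(urls, u)
    }
    return urls, rows.Err()
}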