From f307e6c845f7d4acab64e47ef8eb1db023797a38 Mon Sep 17 00:00:00 2001 From: primal Date: Sun, 1 Feb 2026 17:44:35 -0500 Subject: [PATCH] Add guards to skip migrations if already done Checks column types before running ALTER TYPE migrations to avoid slow table scans on every restart. Also guards column renames. Co-Authored-By: Claude Opus 4.5 --- db.go | 163 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 25 deletions(-) diff --git a/db.go b/db.go index c816f80..4c9e9cd 100644 --- a/db.go +++ b/db.go @@ -13,21 +13,26 @@ import ( ) const schema = ` +-- Note: tld_enum type created separately from tlds.txt (1530 TLDs) +-- CREATE TYPE tld_enum AS ENUM ('aaa', 'aarp', ... 'zw'); +-- New TLDs synced daily from IANA: https://data.iana.org/TLD/tlds-alpha-by-domain.txt + CREATE TABLE IF NOT EXISTS domains ( - host TEXT PRIMARY KEY, - status TEXT NOT NULL DEFAULT 'hold', - discovered_at TIMESTAMPTZ NOT NULL, - last_checked_at TIMESTAMPTZ, - last_crawled_at TIMESTAMPTZ, + host TEXT NOT NULL, + tld tld_enum NOT NULL, + status TEXT NOT NULL DEFAULT 'hold', -- hold, pass, skip, dead (dead = retired TLD) + crawled_at TIMESTAMP NOT NULL DEFAULT '0001-01-01 00:00:00', -- domain_check: 0=unchecked, +1s=checked; feed_crawl: real timestamp feeds_found INTEGER DEFAULT 0, last_error TEXT, - tld TEXT + miss_count INTEGER NOT NULL DEFAULT 0, -- consecutive errors, set to hold at 100 + PRIMARY KEY (host, tld) ); CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status); CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld); CREATE INDEX IF NOT EXISTS idx_domains_feeds_found ON domains(feeds_found DESC) WHERE feeds_found > 0; -CREATE INDEX IF NOT EXISTS idx_domains_to_crawl ON domains(status) WHERE last_crawled_at IS NULL; +CREATE INDEX IF NOT EXISTS idx_domains_to_crawl ON domains(crawled_at) WHERE crawled_at < '0001-01-02'; +CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING GIN(host gin_trgm_ops); CREATE TABLE IF NOT EXISTS feeds ( url TEXT PRIMARY KEY, @@ -38,25 +43,25 @@ CREATE TABLE IF NOT EXISTS feeds ( language TEXT, site_url TEXT, - discovered_at TIMESTAMPTZ NOT NULL, - last_crawled_at TIMESTAMPTZ, - next_crawl_at TIMESTAMPTZ, - last_build_date TIMESTAMPTZ, + discovered_at TIMESTAMP NOT NULL, + last_checked_at TIMESTAMP, -- feed_check: when last checked for new items + next_check_at TIMESTAMP, -- feed_check: when to next check + last_build_date TIMESTAMP, etag TEXT, last_modified TEXT, status TEXT DEFAULT 'pass' CHECK(status IN ('hold', 'pass', 'skip')), last_error TEXT, - last_error_at TIMESTAMPTZ, + last_error_at TIMESTAMP, source_url TEXT, source_host TEXT, tld TEXT, item_count INTEGER, - oldest_item_date TIMESTAMPTZ, - newest_item_date TIMESTAMPTZ, + oldest_item_date TIMESTAMP, + newest_item_date TIMESTAMP, no_update INTEGER DEFAULT 0, @@ -77,13 +82,14 @@ CREATE INDEX IF NOT EXISTS idx_feeds_publish_status ON feeds(publish_status); CREATE INDEX IF NOT EXISTS idx_feeds_source_host_url ON feeds(source_host, url); CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld); CREATE INDEX IF NOT EXISTS idx_feeds_tld_source_host ON feeds(tld, source_host); +CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING GIN(source_host gin_trgm_ops); CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type); CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category); CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status); CREATE INDEX IF NOT EXISTS idx_feeds_discovered_at ON feeds(discovered_at); CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title); CREATE INDEX IF NOT EXISTS idx_feeds_search ON feeds USING GIN(search_vector); -CREATE INDEX IF NOT EXISTS idx_feeds_due_check ON feeds(next_crawl_at, no_update DESC) WHERE status = 'pass'; +-- idx_feeds_to_check created in migrations after column rename CREATE TABLE IF NOT EXISTS items ( id BIGSERIAL PRIMARY KEY, @@ -94,9 +100,9 @@ CREATE TABLE IF NOT EXISTS items ( description TEXT, content TEXT, author TEXT, - pub_date TIMESTAMPTZ, - discovered_at TIMESTAMPTZ NOT NULL, - updated_at TIMESTAMPTZ, + pub_date TIMESTAMP, + discovered_at TIMESTAMP NOT NULL, + updated_at TIMESTAMP, -- Media attachments enclosure_url TEXT, @@ -106,7 +112,7 @@ CREATE TABLE IF NOT EXISTS items ( tags TEXT, -- JSON array of category/tag strings -- Publishing to PDS - published_at TIMESTAMPTZ, + published_at TIMESTAMP, published_uri TEXT, -- Full-text search vector @@ -133,7 +139,7 @@ CREATE TABLE IF NOT EXISTS short_urls ( original_url TEXT NOT NULL, item_id BIGINT REFERENCES items(id), feed_url TEXT, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'), click_count INTEGER DEFAULT 0 ); @@ -144,7 +150,7 @@ CREATE INDEX IF NOT EXISTS idx_short_urls_feed_url ON short_urls(feed_url); CREATE TABLE IF NOT EXISTS clicks ( id BIGSERIAL PRIMARY KEY, short_code TEXT NOT NULL REFERENCES short_urls(code), - clicked_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + clicked_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'), referrer TEXT, user_agent TEXT, ip_hash TEXT, @@ -159,11 +165,11 @@ CREATE TABLE IF NOT EXISTS oauth_sessions ( id TEXT PRIMARY KEY, did TEXT NOT NULL, handle TEXT NOT NULL, - created_at TIMESTAMPTZ NOT NULL, - expires_at TIMESTAMPTZ NOT NULL, + created_at TIMESTAMP NOT NULL, + expires_at TIMESTAMP NOT NULL, access_token TEXT, refresh_token TEXT, - token_expiry TIMESTAMPTZ, + token_expiry TIMESTAMP, dpop_private_jwk TEXT, dpop_authserver_nonce TEXT, dpop_pds_nonce TEXT, @@ -256,9 +262,101 @@ func OpenDatabase(connString string) (*DB, error) { pool.Close() return nil, fmt.Errorf("failed to create schema: %v", err) } + + // Migration: add miss_count column if not exists + pool.Exec(ctx, "ALTER TABLE domains ADD COLUMN IF NOT EXISTS miss_count INTEGER NOT NULL DEFAULT 0") + + // Migration: add trigram extension and indexes for fast LIKE searches + // Indexes must match LOWER() used in queries + pool.Exec(ctx, "CREATE EXTENSION IF NOT EXISTS pg_trgm") + pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING gin (LOWER(host) gin_trgm_ops)") + pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING gin (LOWER(source_host) gin_trgm_ops)") + + // Migration: rename feed columns for consistent terminology + // last_crawled_at -> last_checked_at (feed_check = checking feeds for new items) + // next_crawl_at -> next_check_at + // Check if old column names exist before renaming + var colExists bool + pool.QueryRow(ctx, "SELECT EXISTS(SELECT 1 FROM information_schema.columns WHERE table_name='feeds' AND column_name='last_crawled_at')").Scan(&colExists) + if colExists { + pool.Exec(ctx, "ALTER TABLE feeds RENAME COLUMN last_crawled_at TO last_checked_at") + } + pool.QueryRow(ctx, "SELECT EXISTS(SELECT 1 FROM information_schema.columns WHERE table_name='feeds' AND column_name='next_crawl_at')").Scan(&colExists) + if colExists { + pool.Exec(ctx, "ALTER TABLE feeds RENAME COLUMN next_crawl_at TO next_check_at") + } + // Create index on next_check_at (must be after column rename) + pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_to_check ON feeds(next_check_at, no_update DESC) WHERE status = 'pass'") + // Drop old index name if it exists + pool.Exec(ctx, "DROP INDEX IF EXISTS idx_feeds_due_check") + + // Migration: convert TIMESTAMPTZ to TIMESTAMP (all times are GMT) + // Helper to check if column is already TIMESTAMP (skip if already migrated) + isTimestamp := func(table, column string) bool { + var dataType string + pool.QueryRow(ctx, ` + SELECT data_type FROM information_schema.columns + WHERE table_name = $1 AND column_name = $2 + `, table, column).Scan(&dataType) + return dataType == "timestamp without time zone" + } + // feeds table + if !isTimestamp("feeds", "discovered_at") { + pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN discovered_at TYPE TIMESTAMP USING discovered_at AT TIME ZONE 'UTC'") + } + if !isTimestamp("feeds", "last_checked_at") { + pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN last_checked_at TYPE TIMESTAMP USING last_checked_at AT TIME ZONE 'UTC'") + } + if !isTimestamp("feeds", "next_check_at") { + pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN next_check_at TYPE TIMESTAMP USING next_check_at AT TIME ZONE 'UTC'") + } + if !isTimestamp("feeds", "last_build_date") { + pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN last_build_date TYPE TIMESTAMP USING last_build_date AT TIME ZONE 'UTC'") + } + if !isTimestamp("feeds", "last_error_at") { + pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN last_error_at TYPE TIMESTAMP USING last_error_at AT TIME ZONE 'UTC'") + } + if !isTimestamp("feeds", "oldest_item_date") { + pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN oldest_item_date TYPE TIMESTAMP USING oldest_item_date AT TIME ZONE 'UTC'") + } + if !isTimestamp("feeds", "newest_item_date") { + pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN newest_item_date TYPE TIMESTAMP USING newest_item_date AT TIME ZONE 'UTC'") + } + // items table + if !isTimestamp("items", "pub_date") { + pool.Exec(ctx, "ALTER TABLE items ALTER COLUMN pub_date TYPE TIMESTAMP USING pub_date AT TIME ZONE 'UTC'") + } + if !isTimestamp("items", "discovered_at") { + pool.Exec(ctx, "ALTER TABLE items ALTER COLUMN discovered_at TYPE TIMESTAMP USING discovered_at AT TIME ZONE 'UTC'") + } + if !isTimestamp("items", "updated_at") { + pool.Exec(ctx, "ALTER TABLE items ALTER COLUMN updated_at TYPE TIMESTAMP USING updated_at AT TIME ZONE 'UTC'") + } + if !isTimestamp("items", "published_at") { + pool.Exec(ctx, "ALTER TABLE items ALTER COLUMN published_at TYPE TIMESTAMP USING published_at AT TIME ZONE 'UTC'") + } + // short_urls table + if !isTimestamp("short_urls", "created_at") { + pool.Exec(ctx, "ALTER TABLE short_urls ALTER COLUMN created_at TYPE TIMESTAMP USING created_at AT TIME ZONE 'UTC'") + } + // clicks table + if !isTimestamp("clicks", "clicked_at") { + pool.Exec(ctx, "ALTER TABLE clicks ALTER COLUMN clicked_at TYPE TIMESTAMP USING clicked_at AT TIME ZONE 'UTC'") + } + // oauth_sessions table + if !isTimestamp("oauth_sessions", "created_at") { + pool.Exec(ctx, "ALTER TABLE oauth_sessions ALTER COLUMN created_at TYPE TIMESTAMP USING created_at AT TIME ZONE 'UTC'") + } + if !isTimestamp("oauth_sessions", "expires_at") { + pool.Exec(ctx, "ALTER TABLE oauth_sessions ALTER COLUMN expires_at TYPE TIMESTAMP USING expires_at AT TIME ZONE 'UTC'") + } + if !isTimestamp("oauth_sessions", "token_expiry") { + pool.Exec(ctx, "ALTER TABLE oauth_sessions ALTER COLUMN token_expiry TYPE TIMESTAMP USING token_expiry AT TIME ZONE 'UTC'") + } + fmt.Println(" Schema OK") - // Run stats in background + // Run stats and background index creation go func() { var domainCount, feedCount int pool.QueryRow(context.Background(), "SELECT COUNT(*) FROM domains").Scan(&domainCount) @@ -271,6 +369,21 @@ func OpenDatabase(connString string) (*DB, error) { } else { fmt.Println(" ANALYZE complete") } + + // Create trigram index on items.title in background (CONCURRENTLY = no table lock) + // Check if index already exists first + var indexExists bool + pool.QueryRow(context.Background(), + "SELECT EXISTS(SELECT 1 FROM pg_indexes WHERE indexname = 'idx_items_title_trgm')").Scan(&indexExists) + if !indexExists { + fmt.Println(" Creating trigram index on items.title (background, may take a while)...") + if _, err := pool.Exec(context.Background(), + "CREATE INDEX CONCURRENTLY idx_items_title_trgm ON items USING gin (LOWER(title) gin_trgm_ops)"); err != nil { + fmt.Printf(" Warning: items title trigram index failed: %v\n", err) + } else { + fmt.Println(" Trigram index on items.title complete") + } + } }() return db, nil