Add guards to skip migrations if already done
Checks column types before running ALTER TYPE migrations to avoid slow table scans on every restart. Also guards column renames. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -13,21 +13,26 @@ import (
|
||||
)
|
||||
|
||||
const schema = `
|
||||
-- Note: tld_enum type created separately from tlds.txt (1530 TLDs)
|
||||
-- CREATE TYPE tld_enum AS ENUM ('aaa', 'aarp', ... 'zw');
|
||||
-- New TLDs synced daily from IANA: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
|
||||
|
||||
CREATE TABLE IF NOT EXISTS domains (
|
||||
host TEXT PRIMARY KEY,
|
||||
status TEXT NOT NULL DEFAULT 'hold',
|
||||
discovered_at TIMESTAMPTZ NOT NULL,
|
||||
last_checked_at TIMESTAMPTZ,
|
||||
last_crawled_at TIMESTAMPTZ,
|
||||
host TEXT NOT NULL,
|
||||
tld tld_enum NOT NULL,
|
||||
status TEXT NOT NULL DEFAULT 'hold', -- hold, pass, skip, dead (dead = retired TLD)
|
||||
crawled_at TIMESTAMP NOT NULL DEFAULT '0001-01-01 00:00:00', -- domain_check: 0=unchecked, +1s=checked; feed_crawl: real timestamp
|
||||
feeds_found INTEGER DEFAULT 0,
|
||||
last_error TEXT,
|
||||
tld TEXT
|
||||
miss_count INTEGER NOT NULL DEFAULT 0, -- consecutive errors, set to hold at 100
|
||||
PRIMARY KEY (host, tld)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld);
|
||||
CREATE INDEX IF NOT EXISTS idx_domains_feeds_found ON domains(feeds_found DESC) WHERE feeds_found > 0;
|
||||
CREATE INDEX IF NOT EXISTS idx_domains_to_crawl ON domains(status) WHERE last_crawled_at IS NULL;
|
||||
CREATE INDEX IF NOT EXISTS idx_domains_to_crawl ON domains(crawled_at) WHERE crawled_at < '0001-01-02';
|
||||
CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING GIN(host gin_trgm_ops);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS feeds (
|
||||
url TEXT PRIMARY KEY,
|
||||
@@ -38,25 +43,25 @@ CREATE TABLE IF NOT EXISTS feeds (
|
||||
language TEXT,
|
||||
site_url TEXT,
|
||||
|
||||
discovered_at TIMESTAMPTZ NOT NULL,
|
||||
last_crawled_at TIMESTAMPTZ,
|
||||
next_crawl_at TIMESTAMPTZ,
|
||||
last_build_date TIMESTAMPTZ,
|
||||
discovered_at TIMESTAMP NOT NULL,
|
||||
last_checked_at TIMESTAMP, -- feed_check: when last checked for new items
|
||||
next_check_at TIMESTAMP, -- feed_check: when to next check
|
||||
last_build_date TIMESTAMP,
|
||||
|
||||
etag TEXT,
|
||||
last_modified TEXT,
|
||||
|
||||
status TEXT DEFAULT 'pass' CHECK(status IN ('hold', 'pass', 'skip')),
|
||||
last_error TEXT,
|
||||
last_error_at TIMESTAMPTZ,
|
||||
last_error_at TIMESTAMP,
|
||||
|
||||
source_url TEXT,
|
||||
source_host TEXT,
|
||||
tld TEXT,
|
||||
|
||||
item_count INTEGER,
|
||||
oldest_item_date TIMESTAMPTZ,
|
||||
newest_item_date TIMESTAMPTZ,
|
||||
oldest_item_date TIMESTAMP,
|
||||
newest_item_date TIMESTAMP,
|
||||
|
||||
no_update INTEGER DEFAULT 0,
|
||||
|
||||
@@ -77,13 +82,14 @@ CREATE INDEX IF NOT EXISTS idx_feeds_publish_status ON feeds(publish_status);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_source_host_url ON feeds(source_host, url);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_tld_source_host ON feeds(tld, source_host);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING GIN(source_host gin_trgm_ops);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_discovered_at ON feeds(discovered_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_search ON feeds USING GIN(search_vector);
|
||||
CREATE INDEX IF NOT EXISTS idx_feeds_due_check ON feeds(next_crawl_at, no_update DESC) WHERE status = 'pass';
|
||||
-- idx_feeds_to_check created in migrations after column rename
|
||||
|
||||
CREATE TABLE IF NOT EXISTS items (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
@@ -94,9 +100,9 @@ CREATE TABLE IF NOT EXISTS items (
|
||||
description TEXT,
|
||||
content TEXT,
|
||||
author TEXT,
|
||||
pub_date TIMESTAMPTZ,
|
||||
discovered_at TIMESTAMPTZ NOT NULL,
|
||||
updated_at TIMESTAMPTZ,
|
||||
pub_date TIMESTAMP,
|
||||
discovered_at TIMESTAMP NOT NULL,
|
||||
updated_at TIMESTAMP,
|
||||
|
||||
-- Media attachments
|
||||
enclosure_url TEXT,
|
||||
@@ -106,7 +112,7 @@ CREATE TABLE IF NOT EXISTS items (
|
||||
tags TEXT, -- JSON array of category/tag strings
|
||||
|
||||
-- Publishing to PDS
|
||||
published_at TIMESTAMPTZ,
|
||||
published_at TIMESTAMP,
|
||||
published_uri TEXT,
|
||||
|
||||
-- Full-text search vector
|
||||
@@ -133,7 +139,7 @@ CREATE TABLE IF NOT EXISTS short_urls (
|
||||
original_url TEXT NOT NULL,
|
||||
item_id BIGINT REFERENCES items(id),
|
||||
feed_url TEXT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
created_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
|
||||
click_count INTEGER DEFAULT 0
|
||||
);
|
||||
|
||||
@@ -144,7 +150,7 @@ CREATE INDEX IF NOT EXISTS idx_short_urls_feed_url ON short_urls(feed_url);
|
||||
CREATE TABLE IF NOT EXISTS clicks (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
short_code TEXT NOT NULL REFERENCES short_urls(code),
|
||||
clicked_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
clicked_at TIMESTAMP NOT NULL DEFAULT (NOW() AT TIME ZONE 'UTC'),
|
||||
referrer TEXT,
|
||||
user_agent TEXT,
|
||||
ip_hash TEXT,
|
||||
@@ -159,11 +165,11 @@ CREATE TABLE IF NOT EXISTS oauth_sessions (
|
||||
id TEXT PRIMARY KEY,
|
||||
did TEXT NOT NULL,
|
||||
handle TEXT NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL,
|
||||
expires_at TIMESTAMPTZ NOT NULL,
|
||||
created_at TIMESTAMP NOT NULL,
|
||||
expires_at TIMESTAMP NOT NULL,
|
||||
access_token TEXT,
|
||||
refresh_token TEXT,
|
||||
token_expiry TIMESTAMPTZ,
|
||||
token_expiry TIMESTAMP,
|
||||
dpop_private_jwk TEXT,
|
||||
dpop_authserver_nonce TEXT,
|
||||
dpop_pds_nonce TEXT,
|
||||
@@ -256,9 +262,101 @@ func OpenDatabase(connString string) (*DB, error) {
|
||||
pool.Close()
|
||||
return nil, fmt.Errorf("failed to create schema: %v", err)
|
||||
}
|
||||
|
||||
// Migration: add miss_count column if not exists
|
||||
pool.Exec(ctx, "ALTER TABLE domains ADD COLUMN IF NOT EXISTS miss_count INTEGER NOT NULL DEFAULT 0")
|
||||
|
||||
// Migration: add trigram extension and indexes for fast LIKE searches
|
||||
// Indexes must match LOWER() used in queries
|
||||
pool.Exec(ctx, "CREATE EXTENSION IF NOT EXISTS pg_trgm")
|
||||
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_domains_host_trgm ON domains USING gin (LOWER(host) gin_trgm_ops)")
|
||||
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_source_host_trgm ON feeds USING gin (LOWER(source_host) gin_trgm_ops)")
|
||||
|
||||
// Migration: rename feed columns for consistent terminology
|
||||
// last_crawled_at -> last_checked_at (feed_check = checking feeds for new items)
|
||||
// next_crawl_at -> next_check_at
|
||||
// Check if old column names exist before renaming
|
||||
var colExists bool
|
||||
pool.QueryRow(ctx, "SELECT EXISTS(SELECT 1 FROM information_schema.columns WHERE table_name='feeds' AND column_name='last_crawled_at')").Scan(&colExists)
|
||||
if colExists {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds RENAME COLUMN last_crawled_at TO last_checked_at")
|
||||
}
|
||||
pool.QueryRow(ctx, "SELECT EXISTS(SELECT 1 FROM information_schema.columns WHERE table_name='feeds' AND column_name='next_crawl_at')").Scan(&colExists)
|
||||
if colExists {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds RENAME COLUMN next_crawl_at TO next_check_at")
|
||||
}
|
||||
// Create index on next_check_at (must be after column rename)
|
||||
pool.Exec(ctx, "CREATE INDEX IF NOT EXISTS idx_feeds_to_check ON feeds(next_check_at, no_update DESC) WHERE status = 'pass'")
|
||||
// Drop old index name if it exists
|
||||
pool.Exec(ctx, "DROP INDEX IF EXISTS idx_feeds_due_check")
|
||||
|
||||
// Migration: convert TIMESTAMPTZ to TIMESTAMP (all times are GMT)
|
||||
// Helper to check if column is already TIMESTAMP (skip if already migrated)
|
||||
isTimestamp := func(table, column string) bool {
|
||||
var dataType string
|
||||
pool.QueryRow(ctx, `
|
||||
SELECT data_type FROM information_schema.columns
|
||||
WHERE table_name = $1 AND column_name = $2
|
||||
`, table, column).Scan(&dataType)
|
||||
return dataType == "timestamp without time zone"
|
||||
}
|
||||
// feeds table
|
||||
if !isTimestamp("feeds", "discovered_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN discovered_at TYPE TIMESTAMP USING discovered_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("feeds", "last_checked_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN last_checked_at TYPE TIMESTAMP USING last_checked_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("feeds", "next_check_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN next_check_at TYPE TIMESTAMP USING next_check_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("feeds", "last_build_date") {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN last_build_date TYPE TIMESTAMP USING last_build_date AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("feeds", "last_error_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN last_error_at TYPE TIMESTAMP USING last_error_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("feeds", "oldest_item_date") {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN oldest_item_date TYPE TIMESTAMP USING oldest_item_date AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("feeds", "newest_item_date") {
|
||||
pool.Exec(ctx, "ALTER TABLE feeds ALTER COLUMN newest_item_date TYPE TIMESTAMP USING newest_item_date AT TIME ZONE 'UTC'")
|
||||
}
|
||||
// items table
|
||||
if !isTimestamp("items", "pub_date") {
|
||||
pool.Exec(ctx, "ALTER TABLE items ALTER COLUMN pub_date TYPE TIMESTAMP USING pub_date AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("items", "discovered_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE items ALTER COLUMN discovered_at TYPE TIMESTAMP USING discovered_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("items", "updated_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE items ALTER COLUMN updated_at TYPE TIMESTAMP USING updated_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("items", "published_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE items ALTER COLUMN published_at TYPE TIMESTAMP USING published_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
// short_urls table
|
||||
if !isTimestamp("short_urls", "created_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE short_urls ALTER COLUMN created_at TYPE TIMESTAMP USING created_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
// clicks table
|
||||
if !isTimestamp("clicks", "clicked_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE clicks ALTER COLUMN clicked_at TYPE TIMESTAMP USING clicked_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
// oauth_sessions table
|
||||
if !isTimestamp("oauth_sessions", "created_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE oauth_sessions ALTER COLUMN created_at TYPE TIMESTAMP USING created_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("oauth_sessions", "expires_at") {
|
||||
pool.Exec(ctx, "ALTER TABLE oauth_sessions ALTER COLUMN expires_at TYPE TIMESTAMP USING expires_at AT TIME ZONE 'UTC'")
|
||||
}
|
||||
if !isTimestamp("oauth_sessions", "token_expiry") {
|
||||
pool.Exec(ctx, "ALTER TABLE oauth_sessions ALTER COLUMN token_expiry TYPE TIMESTAMP USING token_expiry AT TIME ZONE 'UTC'")
|
||||
}
|
||||
|
||||
fmt.Println(" Schema OK")
|
||||
|
||||
// Run stats in background
|
||||
// Run stats and background index creation
|
||||
go func() {
|
||||
var domainCount, feedCount int
|
||||
pool.QueryRow(context.Background(), "SELECT COUNT(*) FROM domains").Scan(&domainCount)
|
||||
@@ -271,6 +369,21 @@ func OpenDatabase(connString string) (*DB, error) {
|
||||
} else {
|
||||
fmt.Println(" ANALYZE complete")
|
||||
}
|
||||
|
||||
// Create trigram index on items.title in background (CONCURRENTLY = no table lock)
|
||||
// Check if index already exists first
|
||||
var indexExists bool
|
||||
pool.QueryRow(context.Background(),
|
||||
"SELECT EXISTS(SELECT 1 FROM pg_indexes WHERE indexname = 'idx_items_title_trgm')").Scan(&indexExists)
|
||||
if !indexExists {
|
||||
fmt.Println(" Creating trigram index on items.title (background, may take a while)...")
|
||||
if _, err := pool.Exec(context.Background(),
|
||||
"CREATE INDEX CONCURRENTLY idx_items_title_trgm ON items USING gin (LOWER(title) gin_trgm_ops)"); err != nil {
|
||||
fmt.Printf(" Warning: items title trigram index failed: %v\n", err)
|
||||
} else {
|
||||
fmt.Println(" Trigram index on items.title complete")
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return db, nil
|
||||
|
||||
Reference in New Issue
Block a user