crawler/db.go

package main

import (
	"context"
	"fmt"
	"net/url"
	"os"
	"strings"
	"time"

	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"
)

// schema is the bootstrap DDL. It creates the feeds, items, oauth_sessions and
// cdx_parquet_files tables (each with IF NOT EXISTS) and (re)installs the
// normalize_feed_url trigger, which strips a leading "http://" or "https://"
// and then a leading "www." from feeds.url on every insert or update.
//
// OpenDatabase executes this string only when no table named feeds exists yet;
// changes for already-initialized databases are applied by the migration
// statements in OpenDatabase. Note that OpenDatabase also drops
// cdx_parquet_files on every start-up, so that table does not survive even on
// a fresh database.
const schema = `
CREATE TABLE IF NOT EXISTS feeds (
	url TEXT PRIMARY KEY,
	type TEXT,
	title TEXT,
	description TEXT,
	language TEXT,

	last_checked_at TIMESTAMP,  -- feed_check: when last checked for new items

	etag TEXT,
	last_modified TEXT,

	-- Status: PUBLISH, STANDBY, IGNORE
	status TEXT NOT NULL DEFAULT 'STANDBY',
	last_error TEXT,

	miss_count INTEGER NOT NULL DEFAULT 0,

	-- Publishing to PDS
	publish_account TEXT
);

-- Indexes will be added as needed based on query patterns

CREATE TABLE IF NOT EXISTS items (
	link TEXT NOT NULL,
	feed_url TEXT NOT NULL REFERENCES feeds(url) ON DELETE CASCADE,
	title TEXT,
	description TEXT,
	author TEXT,
	pub_date TIMESTAMP,

	-- Media attachments
	enclosure_url TEXT,
	enclosure_type TEXT,
	image_urls JSONB,
	tags JSONB,

	-- Item status: 'pass' (default, eligible for publishing), 'fail' (rejected)
	status TEXT NOT NULL DEFAULT 'pass',

	-- Publishing to PDS
	published_at TIMESTAMP,
	published_uri TEXT,

	PRIMARY KEY (link, feed_url)
);

-- Indexes will be added as needed based on query patterns

-- OAuth sessions
CREATE TABLE IF NOT EXISTS oauth_sessions (
	id TEXT PRIMARY KEY,
	did TEXT NOT NULL,
	handle TEXT NOT NULL,
	access_token TEXT,
	refresh_token TEXT,
	token_type TEXT NOT NULL DEFAULT 'DPoP',
	expires_at TIMESTAMP NOT NULL,
	created_at TIMESTAMP NOT NULL DEFAULT NOW(),
	dpop_private_jwk TEXT,
	dpop_authserver_nonce TEXT,
	dpop_pds_nonce TEXT,
	pds_url TEXT,
	authserver_iss TEXT,
	token_expiry TIMESTAMP
);

-- CDX parquet file processing tracker
CREATE TABLE IF NOT EXISTS cdx_parquet_files (
	crawl_id TEXT NOT NULL,
	file_name TEXT NOT NULL,
	feeds_found INTEGER NOT NULL DEFAULT 0,
	processed_at TIMESTAMP NOT NULL DEFAULT NOW(),
	PRIMARY KEY (crawl_id, file_name)
);

-- Trigger to normalize feed URLs on insert/update (strips https://, http://, www.)
CREATE OR REPLACE FUNCTION normalize_feed_url()
RETURNS TRIGGER AS $$
BEGIN
    NEW.url = regexp_replace(NEW.url, '^https?://', '');
    NEW.url = regexp_replace(NEW.url, '^www\.', '');
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS normalize_feed_url_trigger ON feeds;
CREATE TRIGGER normalize_feed_url_trigger
    BEFORE INSERT OR UPDATE ON feeds
    FOR EACH ROW
    EXECUTE FUNCTION normalize_feed_url();
`

// DB wraps a *pgxpool.Pool with helper methods. The pool is embedded, so its
// methods are promoted, but DB declares its own QueryRow, Query, Exec, Begin
// and Close; those shadow the pool's versions, take no context argument and
// run with context.Background().
type DB struct {
	*pgxpool.Pool
}

// OpenDatabase connects to PostgreSQL, bootstraps the schema when the database
// is empty, applies the in-place migrations and returns the pool wrapped in a
// *DB.
//
// The connection string is resolved in this order:
//  1. connString, if non-empty;
//  2. the DATABASE_URL environment variable;
//  3. a postgres:// URL assembled from DB_HOST, DB_PORT, DB_USER, DB_NAME and
//     DB_PASSWORD (or, when that is empty, the trimmed contents of the file
//     named by DB_PASSWORD_FILE), with sslmode=disable.
//
// An error is returned when the password file cannot be read, the connection
// string cannot be parsed, the pool cannot be created or pinged, the check for
// an existing feeds table fails, or the bootstrap schema fails. Migration
// statements are best-effort: a failure is printed as a warning and start-up
// continues.
//
// Before returning, a goroutine is started that prints the feed count, runs
// ANALYZE and creates the idx_items_title_trgm index if it is missing; it
// keeps using the pool after OpenDatabase has returned.
func OpenDatabase(connString string) (*DB, error) {
	fmt.Printf("Connecting to database...\n")

	// If connection string not provided, try environment variables
	if connString == "" {
		connString = os.Getenv("DATABASE_URL")
	}
	if connString == "" {
		// Build from individual env vars
		host := getEnvOrDefault("DB_HOST", "atproto-postgres")
		port := getEnvOrDefault("DB_PORT", "5432")
		user := getEnvOrDefault("DB_USER", "dba_1440_news")
		dbname := getEnvOrDefault("DB_NAME", "db_1440_news")

		// Support Docker secrets (password file) or direct password
		password := os.Getenv("DB_PASSWORD")
		if password == "" {
			if passwordFile := os.Getenv("DB_PASSWORD_FILE"); passwordFile != "" {
				data, err := os.ReadFile(passwordFile)
				if err != nil {
					return nil, fmt.Errorf("failed to read password file: %w", err)
				}
				password = strings.TrimSpace(string(data))
			}
		}

		// Let net/url do the escaping. Query-escaping the password (as was
		// done before) turns a space into "+", which is a literal plus sign
		// inside URL userinfo, and the user/database names were not escaped
		// at all.
		dsn := url.URL{
			Scheme:   "postgres",
			User:     url.UserPassword(user, password),
			Host:     host + ":" + port,
			Path:     "/" + dbname,
			RawQuery: "sslmode=disable",
		}
		connString = dsn.String()
	}

	config, err := pgxpool.ParseConfig(connString)
	if err != nil {
		return nil, fmt.Errorf("failed to parse connection string: %w", err)
	}

	// Connection pool settings
	config.MaxConns = 10
	config.MinConns = 0 // Don't pre-create connections to avoid schema race conditions
	config.MaxConnLifetime = 5 * time.Minute
	config.MaxConnIdleTime = 1 * time.Minute

	ctx := context.Background()
	pool, err := pgxpool.NewWithConfig(ctx, config)
	if err != nil {
		return nil, fmt.Errorf("failed to connect to database: %w", err)
	}

	// Verify connection
	if err := pool.Ping(ctx); err != nil {
		pool.Close()
		return nil, fmt.Errorf("failed to ping database: %w", err)
	}
	fmt.Println("  Connected to PostgreSQL")

	// Bootstrap the schema only on an empty database, detected by the absence
	// of a feeds table. Everything added later is handled by the migrations
	// below.
	var tableExists bool
	if err := pool.QueryRow(ctx, "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'feeds')").Scan(&tableExists); err != nil {
		pool.Close()
		return nil, fmt.Errorf("failed to check for existing schema: %w", err)
	}
	if !tableExists {
		if _, err := pool.Exec(ctx, schema); err != nil {
			pool.Close()
			return nil, fmt.Errorf("failed to create schema: %w", err)
		}
	}
	fmt.Println("  Schema OK")

	// migrate runs one best-effort migration statement. Failures do not abort
	// start-up, but they are reported instead of being silently dropped.
	migrate := func(stmt string) {
		if _, err := pool.Exec(ctx, stmt); err != nil {
			fmt.Printf("  Warning: migration failed (%s): %v\n", strings.Join(strings.Fields(stmt), " "), err)
		}
	}

	// exists evaluates a "SELECT EXISTS(...)" query. A failed lookup is
	// reported and treated as false, so the migration it guards is skipped
	// rather than run against a stale or unknown answer.
	exists := func(query string) bool {
		var found bool
		if err := pool.QueryRow(ctx, query).Scan(&found); err != nil {
			fmt.Printf("  Warning: schema check failed (%s): %v\n", query, err)
			return false
		}
		return found
	}

	// columnType returns information_schema's data_type for table.column, or
	// "" when the column does not exist or the lookup fails. The COALESCE
	// guarantees exactly one result row, so a missing column is not an error.
	columnType := func(table, column string) string {
		var dataType string
		err := pool.QueryRow(ctx, `
			SELECT COALESCE((
				SELECT data_type FROM information_schema.columns
				WHERE table_name = $1 AND column_name = $2
				LIMIT 1
			), '')
		`, table, column).Scan(&dataType)
		if err != nil {
			fmt.Printf("  Warning: column type lookup failed (%s.%s): %v\n", table, column, err)
			return ""
		}
		return dataType
	}

	// toTimestamp converts table.column to TIMESTAMP (interpreting the old
	// value in UTC) unless the column is absent or already migrated.
	toTimestamp := func(table, column string) {
		switch columnType(table, column) {
		case "", "timestamp without time zone":
			return
		}
		migrate(fmt.Sprintf("ALTER TABLE %s ALTER COLUMN %s TYPE TIMESTAMP USING %s AT TIME ZONE 'UTC'", table, column, column))
	}

	// Migration: add trigram extension for fast LIKE searches
	migrate("CREATE EXTENSION IF NOT EXISTS pg_trgm")

	// Migration: drop domains table (no longer used - feeds are imported from CDX)
	migrate("DROP TABLE IF EXISTS domains CASCADE")

	// Migration: drop domain_host and domain_tld columns from feeds (use URL for domain searches)
	migrate("DROP INDEX IF EXISTS idx_feeds_domain_host_trgm")
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS domain_host")
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS domain_tld")

	// Migration: drop category column from feeds (not used)
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS category")

	// Migration: drop site_url column from feeds (derive from feed URL instead)
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS site_url")

	// Migration: drop last_error_at column from feeds (last_checked_at + miss_count sufficient)
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS last_error_at")

	// Migration: drop item_count column from feeds (compute from items table instead)
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS item_count")

	// Migration: drop content column from items (only description is used for posts)
	migrate("ALTER TABLE items DROP COLUMN IF EXISTS content")

	// Migration: replace guid with link as primary key for items
	if exists("SELECT EXISTS(SELECT 1 FROM information_schema.columns WHERE table_name = 'items' AND column_name = 'guid')") {
		// Delete items without links (useless for publishing)
		migrate("DELETE FROM items WHERE link IS NULL OR link = ''")
		// Drop old primary key, drop guid, add new primary key
		migrate("ALTER TABLE items DROP CONSTRAINT IF EXISTS items_pkey")
		migrate("ALTER TABLE items DROP COLUMN guid")
		migrate("ALTER TABLE items ADD PRIMARY KEY (link, feed_url)")
		// Make link NOT NULL
		migrate("ALTER TABLE items ALTER COLUMN link SET NOT NULL")
	}

	// Migration: rename old 'sessions' table to 'oauth_sessions'
	oldSessionsExists := exists("SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'sessions')")
	newSessionsExists := exists("SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'oauth_sessions')")
	if oldSessionsExists && !newSessionsExists {
		migrate("ALTER TABLE sessions RENAME TO oauth_sessions")
	}
	// Add token_expiry column if missing (used by OAuth library)
	migrate("ALTER TABLE oauth_sessions ADD COLUMN IF NOT EXISTS token_expiry TIMESTAMP")
	// Make access_token nullable (session created before tokens obtained)
	migrate("ALTER TABLE oauth_sessions ALTER COLUMN access_token DROP NOT NULL")
	// Add missing OAuth session columns
	migrate("ALTER TABLE oauth_sessions ADD COLUMN IF NOT EXISTS dpop_authserver_nonce TEXT")
	migrate("ALTER TABLE oauth_sessions ADD COLUMN IF NOT EXISTS dpop_pds_nonce TEXT")
	migrate("ALTER TABLE oauth_sessions ADD COLUMN IF NOT EXISTS pds_url TEXT")
	migrate("ALTER TABLE oauth_sessions ADD COLUMN IF NOT EXISTS authserver_iss TEXT")
	// Drop old dpop_nonce column if it exists
	migrate("ALTER TABLE oauth_sessions DROP COLUMN IF EXISTS dpop_nonce")

	// Migration: rename feed columns for consistent terminology
	// last_crawled_at -> last_checked_at (feed_check = checking feeds for new items)
	if exists("SELECT EXISTS(SELECT 1 FROM information_schema.columns WHERE table_name='feeds' AND column_name='last_crawled_at')") {
		migrate("ALTER TABLE feeds RENAME COLUMN last_crawled_at TO last_checked_at")
	}
	// Drop legacy columns if they exist (no longer used)
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS next_check_at")
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS source_url")
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS discovered_at")
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS last_build_date")
	migrate("ALTER TABLE items DROP COLUMN IF EXISTS discovered_at")
	// Create index for feed check scheduling (dropped first, so it is rebuilt
	// with the current definition on every start-up)
	migrate("DROP INDEX IF EXISTS idx_feeds_to_check")
	migrate("CREATE INDEX IF NOT EXISTS idx_feeds_to_check ON feeds(last_checked_at NULLS FIRST, miss_count) WHERE status IN ('PUBLISH', 'STANDBY')")
	// Drop old index name if it exists
	migrate("DROP INDEX IF EXISTS idx_feeds_due_check")

	// Migration: convert TIMESTAMPTZ to TIMESTAMP (all times are GMT)
	// feeds table
	toTimestamp("feeds", "last_checked_at")
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS oldest_item_date")
	migrate("ALTER TABLE feeds DROP COLUMN IF EXISTS newest_item_date")
	// items table
	toTimestamp("items", "pub_date")
	// Migration: drop updated_at column from items (was never populated by parsers)
	migrate("ALTER TABLE items DROP COLUMN IF EXISTS updated_at")
	// Migration: drop enclosure_length column from items (was never used)
	migrate("ALTER TABLE items DROP COLUMN IF EXISTS enclosure_length")
	toTimestamp("items", "published_at")
	// short_urls and clicks tables (not created by this file; skipped when absent)
	toTimestamp("short_urls", "created_at")
	toTimestamp("clicks", "clicked_at")
	// oauth_sessions table
	toTimestamp("oauth_sessions", "created_at")
	toTimestamp("oauth_sessions", "expires_at")
	toTimestamp("oauth_sessions", "token_expiry")

	// Migration: rename item_id to item_guid in short_urls table (items now use composite PK)
	if exists("SELECT EXISTS(SELECT 1 FROM information_schema.columns WHERE table_name='short_urls' AND column_name='item_id')") {
		// Drop the column and add item_guid instead (can't convert int64 to text meaningfully)
		migrate("ALTER TABLE short_urls DROP COLUMN IF EXISTS item_id")
		migrate("ALTER TABLE short_urls ADD COLUMN IF NOT EXISTS item_guid TEXT")
	}

	// Migration: add status column to items table (pass/fail for publishing)
	migrate("ALTER TABLE items ADD COLUMN IF NOT EXISTS status TEXT NOT NULL DEFAULT 'pass'")

	// Migration: add cdx_progress table to track current import progress
	// current_file empty = done with this crawl
	migrate(`
		CREATE TABLE IF NOT EXISTS cdx_progress (
			id INTEGER PRIMARY KEY DEFAULT 1,
			crawl_id TEXT NOT NULL,
			current_file TEXT NOT NULL DEFAULT '',
			total_feeds INTEGER NOT NULL DEFAULT 0,
			updated_at TIMESTAMP NOT NULL DEFAULT NOW()
		)
	`)
	// Drop old tables if they exist (migrating to simpler approach)
	migrate("DROP TABLE IF EXISTS cdx_imports")
	migrate("DROP TABLE IF EXISTS cdx_parquet_files")

	fmt.Println("  Schema OK")

	// Run stats and background index creation
	go func() {
		bg := context.Background()

		var feedCount int
		if err := pool.QueryRow(bg, "SELECT COUNT(*) FROM feeds").Scan(&feedCount); err != nil {
			fmt.Printf("  Warning: counting feeds failed: %v\n", err)
		} else {
			fmt.Printf("  Existing data: %d feeds\n", feedCount)
		}

		fmt.Println("  Running ANALYZE...")
		if _, err := pool.Exec(bg, "ANALYZE"); err != nil {
			fmt.Printf("  Warning: ANALYZE failed: %v\n", err)
		} else {
			fmt.Println("  ANALYZE complete")
		}

		// Create trigram index on items.title in background (CONCURRENTLY = no table lock)
		// Check if index already exists first
		var indexExists bool
		if err := pool.QueryRow(bg,
			"SELECT EXISTS(SELECT 1 FROM pg_indexes WHERE indexname = 'idx_items_title_trgm')").Scan(&indexExists); err != nil {
			fmt.Printf("  Warning: items title trigram index check failed: %v\n", err)
			return
		}
		if indexExists {
			return
		}
		fmt.Println("  Creating trigram index on items.title (background, may take a while)...")
		if _, err := pool.Exec(bg,
			"CREATE INDEX CONCURRENTLY idx_items_title_trgm ON items USING gin (LOWER(title) gin_trgm_ops)"); err != nil {
			fmt.Printf("  Warning: items title trigram index failed: %v\n", err)
			return
		}
		fmt.Println("  Trigram index on items.title complete")
	}()

	return &DB{Pool: pool}, nil
}

// getEnvOrDefault returns the value of the environment variable key, falling
// back to defaultVal when the variable is unset or set to the empty string.
func getEnvOrDefault(key, defaultVal string) string {
	val := os.Getenv(key)
	if val == "" {
		return defaultVal
	}
	return val
}

// QueryRow runs a query expected to return at most one row on the embedded
// pool, using a background context. It shadows the promoted pool method so
// callers do not have to pass a context.
func (db *DB) QueryRow(query string, args ...interface{}) pgx.Row {
	ctx := context.Background()
	row := db.Pool.QueryRow(ctx, query, args...)
	return row
}

// Query runs a multi-row query on the embedded pool with a background context
// and hands back the pool's rows and error unchanged.
func (db *DB) Query(query string, args ...interface{}) (pgx.Rows, error) {
	ctx := context.Background()
	rows, err := db.Pool.Query(ctx, query, args...)
	return rows, err
}

// Exec executes a statement on the embedded pool with a background context.
// On success it returns the number of rows affected; on failure it returns 0
// and the pool's error.
func (db *DB) Exec(query string, args ...interface{}) (int64, error) {
	tag, err := db.Pool.Exec(context.Background(), query, args...)
	if err == nil {
		return tag.RowsAffected(), nil
	}
	return 0, err
}

// Begin opens a transaction on the embedded pool using a background context
// and returns the pool's transaction and error unchanged.
func (db *DB) Begin() (pgx.Tx, error) {
	ctx := context.Background()
	tx, err := db.Pool.Begin(ctx)
	return tx, err
}

// Close closes the connection pool. The embedded pool's Close is called as a
// plain statement (no result is used), so this method always returns nil.
func (db *DB) Close() error {
	db.Pool.Close()
	return nil
}

// NullableString maps the empty string to nil and any other string to a
// pointer to a copy of it.
func NullableString(s string) *string {
	if len(s) > 0 {
		return &s
	}
	return nil
}

// NullableTime maps the zero time.Time to nil and any other time to a pointer
// to a copy of it.
func NullableTime(t time.Time) *time.Time {
	if !t.IsZero() {
		return &t
	}
	return nil
}

// StringValue dereferences s, treating a nil pointer as the empty string.
func StringValue(s *string) string {
	if s != nil {
		return *s
	}
	return ""
}

// TimeValue dereferences t, treating a nil pointer as the zero time.Time.
func TimeValue(t *time.Time) time.Time {
	if t != nil {
		return *t
	}
	return time.Time{}
}

// ToSearchQuery converts a user query to PostgreSQL tsquery format by
// splitting it on whitespace and joining the words with " & ". A blank query
// yields the empty string. Words are passed through as-is; nothing in them is
// escaped or removed.
func ToSearchQuery(query string) string {
	// Joining zero words already produces "", so no separate empty check is
	// needed.
	return strings.Join(strings.Fields(query), " & ")
}

// CDXProgress represents the current CDX import progress, as read from the
// cdx_progress row with id = 1 by GetCDXProgress.
type CDXProgress struct {
	CrawlID     string // value of the crawl_id column
	CurrentFile string // empty = done
	TotalFeeds  int // value of the total_feeds column
}

// GetCDXProgress reads the CDX import progress from the cdx_progress row with
// id = 1. The Scan error is deliberately discarded: when the row is missing or
// the query fails, the struct is returned as Scan left it (the zero
// CDXProgress if nothing was assigned).
func (db *DB) GetCDXProgress() CDXProgress {
	const query = `
		SELECT COALESCE(crawl_id, ''), COALESCE(current_file, ''), COALESCE(total_feeds, 0)
		FROM cdx_progress WHERE id = 1
	`
	var progress CDXProgress
	row := db.QueryRow(query)
	_ = row.Scan(&progress.CrawlID, &progress.CurrentFile, &progress.TotalFeeds)
	return progress
}

// SetCDXProgress stores the CDX import progress in the single cdx_progress row
// (id = 1): the row is inserted if absent and overwritten otherwise, and
// updated_at is set to NOW() either way. It returns the error from Exec.
func (db *DB) SetCDXProgress(crawlID, currentFile string, totalFeeds int) error {
	const upsert = `
		INSERT INTO cdx_progress (id, crawl_id, current_file, total_feeds, updated_at)
		VALUES (1, $1, $2, $3, NOW())
		ON CONFLICT (id) DO UPDATE SET
			crawl_id = $1,
			current_file = $2,
			total_feeds = $3,
			updated_at = NOW()
	`
	if _, err := db.Exec(upsert, crawlID, currentFile, totalFeeds); err != nil {
		return err
	}
	return nil
}

// CompleteCDXProgress records the crawl as finished by storing an empty
// current_file together with the final feed count.
func (db *DB) CompleteCDXProgress(crawlID string, totalFeeds int) error {
	const noCurrentFile = "" // empty current_file = done with this crawl
	return db.SetCDXProgress(crawlID, noCurrentFile, totalFeeds)
}