package main import ( "context" "fmt" "net/url" "os" "strings" "time" "github.com/jackc/pgx/v5" "github.com/jackc/pgx/v5/pgxpool" ) const schema = ` CREATE TABLE IF NOT EXISTS domains ( host TEXT PRIMARY KEY, status TEXT NOT NULL DEFAULT 'hold', discovered_at TIMESTAMPTZ NOT NULL, last_checked_at TIMESTAMPTZ, last_crawled_at TIMESTAMPTZ, feeds_found INTEGER DEFAULT 0, last_error TEXT, tld TEXT ); CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status); CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld); CREATE INDEX IF NOT EXISTS idx_domains_feeds_found ON domains(feeds_found DESC) WHERE feeds_found > 0; CREATE INDEX IF NOT EXISTS idx_domains_to_check ON domains(status) WHERE last_checked_at IS NULL; CREATE INDEX IF NOT EXISTS idx_domains_to_crawl ON domains(status) WHERE last_checked_at IS NOT NULL AND last_crawled_at IS NULL; CREATE TABLE IF NOT EXISTS feeds ( url TEXT PRIMARY KEY, type TEXT, category TEXT DEFAULT 'main', title TEXT, description TEXT, language TEXT, site_url TEXT, discovered_at TIMESTAMPTZ NOT NULL, last_crawled_at TIMESTAMPTZ, next_crawl_at TIMESTAMPTZ, last_build_date TIMESTAMPTZ, etag TEXT, last_modified TEXT, ttl_minutes INTEGER, update_period TEXT, update_freq INTEGER, status TEXT DEFAULT 'active', error_count INTEGER DEFAULT 0, last_error TEXT, last_error_at TIMESTAMPTZ, source_url TEXT, source_host TEXT, tld TEXT, item_count INTEGER, avg_post_freq_hrs DOUBLE PRECISION, oldest_item_date TIMESTAMPTZ, newest_item_date TIMESTAMPTZ, no_update INTEGER DEFAULT 0, -- Publishing to PDS publish_status TEXT DEFAULT 'hold' CHECK(publish_status IN ('hold', 'pass', 'skip')), publish_account TEXT, -- Full-text search vector search_vector tsvector GENERATED ALWAYS AS ( setweight(to_tsvector('english', coalesce(title, '')), 'A') || setweight(to_tsvector('english', coalesce(description, '')), 'B') || setweight(to_tsvector('english', coalesce(url, '')), 'C') ) STORED ); CREATE INDEX IF NOT EXISTS idx_feeds_source_host ON feeds(source_host); CREATE INDEX IF NOT EXISTS idx_feeds_publish_status ON feeds(publish_status); CREATE INDEX IF NOT EXISTS idx_feeds_source_host_url ON feeds(source_host, url); CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld); CREATE INDEX IF NOT EXISTS idx_feeds_tld_source_host ON feeds(tld, source_host); CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type); CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category); CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status); CREATE INDEX IF NOT EXISTS idx_feeds_discovered_at ON feeds(discovered_at); CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title); CREATE INDEX IF NOT EXISTS idx_feeds_search ON feeds USING GIN(search_vector); CREATE TABLE IF NOT EXISTS items ( id BIGSERIAL PRIMARY KEY, feed_url TEXT NOT NULL, guid TEXT, title TEXT, link TEXT, description TEXT, content TEXT, author TEXT, pub_date TIMESTAMPTZ, discovered_at TIMESTAMPTZ NOT NULL, updated_at TIMESTAMPTZ, -- Media attachments enclosure_url TEXT, enclosure_type TEXT, enclosure_length BIGINT, image_urls TEXT, -- JSON array of image URLs tags TEXT, -- JSON array of category/tag strings -- Publishing to PDS published_at TIMESTAMPTZ, published_uri TEXT, -- Full-text search vector search_vector tsvector GENERATED ALWAYS AS ( setweight(to_tsvector('english', coalesce(title, '')), 'A') || setweight(to_tsvector('english', coalesce(description, '')), 'B') || setweight(to_tsvector('english', coalesce(content, '')), 'C') || setweight(to_tsvector('english', coalesce(author, '')), 'D') ) STORED, UNIQUE(feed_url, guid) ); CREATE INDEX IF NOT EXISTS idx_items_feed_url ON items(feed_url); CREATE INDEX IF NOT EXISTS idx_items_pub_date ON items(pub_date DESC); CREATE INDEX IF NOT EXISTS idx_items_link ON items(link); CREATE INDEX IF NOT EXISTS idx_items_feed_url_pub_date ON items(feed_url, pub_date DESC); CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feed_url, published_at) WHERE published_at IS NULL; CREATE INDEX IF NOT EXISTS idx_items_search ON items USING GIN(search_vector); -- URL Shortener tables CREATE TABLE IF NOT EXISTS short_urls ( code TEXT PRIMARY KEY, original_url TEXT NOT NULL, item_id BIGINT REFERENCES items(id), feed_url TEXT, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), click_count INTEGER DEFAULT 0 ); CREATE INDEX IF NOT EXISTS idx_short_urls_original ON short_urls(original_url); CREATE INDEX IF NOT EXISTS idx_short_urls_item_id ON short_urls(item_id); CREATE INDEX IF NOT EXISTS idx_short_urls_feed_url ON short_urls(feed_url); CREATE TABLE IF NOT EXISTS clicks ( id BIGSERIAL PRIMARY KEY, short_code TEXT NOT NULL REFERENCES short_urls(code), clicked_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), referrer TEXT, user_agent TEXT, ip_hash TEXT, country TEXT ); CREATE INDEX IF NOT EXISTS idx_clicks_short_code ON clicks(short_code); CREATE INDEX IF NOT EXISTS idx_clicks_clicked_at ON clicks(clicked_at DESC); -- OAuth sessions (persisted for login persistence across deploys) CREATE TABLE IF NOT EXISTS oauth_sessions ( id TEXT PRIMARY KEY, did TEXT NOT NULL, handle TEXT NOT NULL, created_at TIMESTAMPTZ NOT NULL, expires_at TIMESTAMPTZ NOT NULL, access_token TEXT, refresh_token TEXT, token_expiry TIMESTAMPTZ, dpop_private_jwk TEXT, dpop_authserver_nonce TEXT, dpop_pds_nonce TEXT, pds_url TEXT, authserver_iss TEXT ); CREATE INDEX IF NOT EXISTS idx_oauth_sessions_expires_at ON oauth_sessions(expires_at); -- Trigger to normalize feed URLs on insert/update (strips https://, http://, www.) CREATE OR REPLACE FUNCTION normalize_feed_url() RETURNS TRIGGER AS $$ BEGIN NEW.url = regexp_replace(NEW.url, '^https?://', ''); NEW.url = regexp_replace(NEW.url, '^www\.', ''); RETURN NEW; END; $$ LANGUAGE plpgsql; DROP TRIGGER IF EXISTS normalize_feed_url_trigger ON feeds; CREATE TRIGGER normalize_feed_url_trigger BEFORE INSERT OR UPDATE ON feeds FOR EACH ROW EXECUTE FUNCTION normalize_feed_url(); ` // DB wraps pgxpool.Pool with helper methods type DB struct { *pgxpool.Pool } func OpenDatabase(connString string) (*DB, error) { fmt.Printf("Connecting to database...\n") // If connection string not provided, try environment variables if connString == "" { connString = os.Getenv("DATABASE_URL") } if connString == "" { // Build from individual env vars host := getEnvOrDefault("DB_HOST", "atproto-postgres") port := getEnvOrDefault("DB_PORT", "5432") user := getEnvOrDefault("DB_USER", "news_1440") dbname := getEnvOrDefault("DB_NAME", "news_1440") // Support Docker secrets (password file) or direct password password := os.Getenv("DB_PASSWORD") if password == "" { if passwordFile := os.Getenv("DB_PASSWORD_FILE"); passwordFile != "" { data, err := os.ReadFile(passwordFile) if err != nil { return nil, fmt.Errorf("failed to read password file: %v", err) } password = strings.TrimSpace(string(data)) } } connString = fmt.Sprintf("postgres://%s:%s@%s:%s/%s?sslmode=disable", user, url.QueryEscape(password), host, port, dbname) } config, err := pgxpool.ParseConfig(connString) if err != nil { return nil, fmt.Errorf("failed to parse connection string: %v", err) } // Connection pool settings config.MaxConns = 10 config.MinConns = 2 config.MaxConnLifetime = 5 * time.Minute config.MaxConnIdleTime = 1 * time.Minute ctx := context.Background() pool, err := pgxpool.NewWithConfig(ctx, config) if err != nil { return nil, fmt.Errorf("failed to connect to database: %v", err) } // Verify connection if err := pool.Ping(ctx); err != nil { pool.Close() return nil, fmt.Errorf("failed to ping database: %v", err) } fmt.Println(" Connected to PostgreSQL") db := &DB{pool} // Create schema if _, err := pool.Exec(ctx, schema); err != nil { pool.Close() return nil, fmt.Errorf("failed to create schema: %v", err) } fmt.Println(" Schema OK") // Run stats in background go func() { var domainCount, feedCount int pool.QueryRow(context.Background(), "SELECT COUNT(*) FROM domains").Scan(&domainCount) pool.QueryRow(context.Background(), "SELECT COUNT(*) FROM feeds").Scan(&feedCount) fmt.Printf(" Existing data: %d domains, %d feeds\n", domainCount, feedCount) fmt.Println(" Running ANALYZE...") if _, err := pool.Exec(context.Background(), "ANALYZE"); err != nil { fmt.Printf(" Warning: ANALYZE failed: %v\n", err) } else { fmt.Println(" ANALYZE complete") } }() return db, nil } func getEnvOrDefault(key, defaultVal string) string { if val := os.Getenv(key); val != "" { return val } return defaultVal } // QueryRow wraps pool.QueryRow for compatibility func (db *DB) QueryRow(query string, args ...interface{}) pgx.Row { return db.Pool.QueryRow(context.Background(), query, args...) } // Query wraps pool.Query for compatibility func (db *DB) Query(query string, args ...interface{}) (pgx.Rows, error) { return db.Pool.Query(context.Background(), query, args...) } // Exec wraps pool.Exec for compatibility func (db *DB) Exec(query string, args ...interface{}) (int64, error) { result, err := db.Pool.Exec(context.Background(), query, args...) if err != nil { return 0, err } return result.RowsAffected(), nil } // Begin starts a transaction func (db *DB) Begin() (pgx.Tx, error) { return db.Pool.Begin(context.Background()) } // Close closes the connection pool func (db *DB) Close() error { db.Pool.Close() return nil } // NullableString returns nil for empty strings, otherwise the string pointer func NullableString(s string) *string { if s == "" { return nil } return &s } // NullableTime returns nil for zero times, otherwise the time pointer func NullableTime(t time.Time) *time.Time { if t.IsZero() { return nil } return &t } // StringValue returns empty string for nil, otherwise the dereferenced value func StringValue(s *string) string { if s == nil { return "" } return *s } // TimeValue returns zero time for nil, otherwise the dereferenced value func TimeValue(t *time.Time) time.Time { if t == nil { return time.Time{} } return *t } // ToSearchQuery converts a user query to PostgreSQL tsquery format func ToSearchQuery(query string) string { // Simple conversion: split on spaces and join with & words := strings.Fields(query) if len(words) == 0 { return "" } return strings.Join(words, " & ") }