Migrate from SQLite to PostgreSQL

- Replace modernc.org/sqlite with jackc/pgx/v5
- Update all SQL queries for PostgreSQL syntax ($1, $2 placeholders)
- Use snake_case column names throughout
- Replace SQLite FTS5 with PostgreSQL tsvector/tsquery full-text search
- Add connection pooling with pgxpool
- Support Docker secrets for database password
- Add trigger to normalize feed URLs (strip https://, http://, www.)
- Fix anchor feed detection regex to avoid false positives
- Connect app container to atproto network for PostgreSQL access
- Add version indicator to dashboard UI

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
primal
2026-01-28 20:38:13 -05:00
parent 75835d771d
commit f4afb29980
11 changed files with 1525 additions and 1137 deletions
+104 -105
View File
@@ -3,13 +3,15 @@ package main
import (
"bufio"
"compress/gzip"
"database/sql"
"context"
"fmt"
"io"
"os"
"strings"
"sync/atomic"
"time"
"github.com/jackc/pgx/v5"
)
// Domain represents a host to be crawled for feeds
@@ -23,78 +25,74 @@ type Domain struct {
TLD string `json:"tld,omitempty"`
}
// saveDomain stores a domain in SQLite
// saveDomain stores a domain in PostgreSQL
func (c *Crawler) saveDomain(domain *Domain) error {
_, err := c.db.Exec(`
INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
VALUES (?, ?, ?, ?, ?, ?, ?)
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
VALUES ($1, $2, $3, $4, $5, $6, $7)
ON CONFLICT(host) DO UPDATE SET
status = excluded.status,
lastCrawledAt = excluded.lastCrawledAt,
feedsFound = excluded.feedsFound,
lastError = excluded.lastError,
tld = excluded.tld
`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
domain.FeedsFound, nullString(domain.LastError), domain.TLD)
status = EXCLUDED.status,
last_crawled_at = EXCLUDED.last_crawled_at,
feeds_found = EXCLUDED.feeds_found,
last_error = EXCLUDED.last_error,
tld = EXCLUDED.tld
`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
return err
}
// saveDomainTx stores a domain using a transaction
func (c *Crawler) saveDomainTx(tx *sql.Tx, domain *Domain) error {
_, err := tx.Exec(`
INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
VALUES (?, ?, ?, ?, ?, ?, ?)
func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
_, err := tx.Exec(context.Background(), `
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
VALUES ($1, $2, $3, $4, $5, $6, $7)
ON CONFLICT(host) DO NOTHING
`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
domain.FeedsFound, nullString(domain.LastError), domain.TLD)
`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
return err
}
// domainExists reports whether a row already exists for the normalized
// form of host. A query error is treated the same as "not found".
func (c *Crawler) domainExists(host string) bool {
	var exists bool
	err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = $1)", normalizeHost(host)).Scan(&exists)
	return err == nil && exists
}
// getDomain retrieves a domain from SQLite
// getDomain retrieves a domain from PostgreSQL
func (c *Crawler) getDomain(host string) (*Domain, error) {
domain := &Domain{}
var lastCrawledAt sql.NullTime
var lastError sql.NullString
var lastCrawledAt *time.Time
var lastError *string
err := c.db.QueryRow(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
FROM domains WHERE host = ?
SELECT host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld
FROM domains WHERE host = $1
`, normalizeHost(host)).Scan(
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
&domain.FeedsFound, &lastError, &domain.TLD,
)
if err == sql.ErrNoRows {
if err == pgx.ErrNoRows {
return nil, nil
}
if err != nil {
return nil, err
}
if lastCrawledAt.Valid {
domain.LastCrawledAt = lastCrawledAt.Time
}
if lastError.Valid {
domain.LastError = lastError.String
}
domain.LastCrawledAt = TimeValue(lastCrawledAt)
domain.LastError = StringValue(lastError)
return domain, nil
}
// GetUncheckedDomains returns up to limit unchecked domains ordered by discoveredAt (FIFO)
// GetUncheckedDomains returns up to limit unchecked domains ordered by discovered_at (FIFO)
func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
rows, err := c.db.Query(`
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
SELECT host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld
FROM domains WHERE status = 'unchecked'
ORDER BY discoveredAt ASC
LIMIT ?
ORDER BY discovered_at ASC
LIMIT $1
`, limit)
if err != nil {
return nil, err
@@ -105,12 +103,12 @@ func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
}
// scanDomains is a helper to scan multiple domain rows
func (c *Crawler) scanDomains(rows *sql.Rows) ([]*Domain, error) {
func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
var domains []*Domain
for rows.Next() {
domain := &Domain{}
var lastCrawledAt sql.NullTime
var lastError sql.NullString
var lastCrawledAt *time.Time
var lastError *string
if err := rows.Scan(
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
@@ -119,12 +117,8 @@ func (c *Crawler) scanDomains(rows *sql.Rows) ([]*Domain, error) {
continue
}
if lastCrawledAt.Valid {
domain.LastCrawledAt = lastCrawledAt.Time
}
if lastError.Valid {
domain.LastError = lastError.String
}
domain.LastCrawledAt = TimeValue(lastCrawledAt)
domain.LastError = StringValue(lastError)
domains = append(domains, domain)
}
@@ -142,13 +136,13 @@ func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError strin
var err error
if lastError != "" {
_, err = c.db.Exec(`
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = ?
WHERE host = ?
UPDATE domains SET status = $1, last_crawled_at = $2, feeds_found = $3, last_error = $4
WHERE host = $5
`, status, time.Now(), feedsFound, lastError, normalizeHost(host))
} else {
_, err = c.db.Exec(`
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = NULL
WHERE host = ?
UPDATE domains SET status = $1, last_crawled_at = $2, feeds_found = $3, last_error = NULL
WHERE host = $4
`, status, time.Now(), feedsFound, normalizeHost(host))
}
return err
@@ -164,6 +158,23 @@ func (c *Crawler) GetDomainCount() (total int, unchecked int, err error) {
return total, unchecked, err
}
// ImportTestDomains adds a list of specific domains for testing.
// Each host is inserted with status "unchecked" and the current
// timestamp; hosts already present are left untouched thanks to
// ON CONFLICT DO NOTHING. Results are reported to stdout rather than
// returned — this is a test/debug helper, not production import logic.
func (c *Crawler) ImportTestDomains(domains []string) {
	// One shared timestamp so the whole batch sorts together by discovered_at.
	now := time.Now()
	for _, host := range domains {
		_, err := c.db.Exec(`
			INSERT INTO domains (host, status, discovered_at, tld)
			VALUES ($1, 'unchecked', $2, $3)
			ON CONFLICT(host) DO NOTHING
		`, host, now, getTLD(host))
		if err != nil {
			fmt.Printf("Error adding test domain %s: %v\n", host, err)
		} else {
			// NOTE(review): "Added" is also printed when the row already
			// existed (DO NOTHING) — the Exec result is not inspected.
			fmt.Printf("Added test domain: %s\n", host)
		}
	}
}
// ImportDomainsFromFile reads a vertices file and stores new domains as "unchecked"
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
file, err := os.Open(filename)
@@ -212,7 +223,6 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
const batchSize = 1000
now := time.Now()
nowStr := now.Format("2006-01-02 15:04:05")
totalImported := 0
batchCount := 0
@@ -240,31 +250,43 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
break
}
// Build bulk INSERT statement
var sb strings.Builder
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
args := make([]interface{}, 0, len(domains)*4)
for i, d := range domains {
if i > 0 {
sb.WriteString(",")
}
sb.WriteString("(?, 'unchecked', ?, ?)")
args = append(args, d.host, nowStr, d.tld)
}
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
// Execute bulk insert
result, err := c.db.Exec(sb.String(), args...)
imported := 0
// Use COPY for bulk insert (much faster than individual INSERTs)
ctx := context.Background()
conn, err := c.db.Acquire(ctx)
if err != nil {
fmt.Printf("Bulk insert error: %v\n", err)
} else {
rowsAffected, _ := result.RowsAffected()
imported = int(rowsAffected)
fmt.Printf("Failed to acquire connection: %v\n", err)
break
}
// Build rows for copy
rows := make([][]interface{}, len(domains))
for i, d := range domains {
rows[i] = []interface{}{d.host, "unchecked", now, d.tld}
}
// Use CopyFrom for bulk insert
imported, err := conn.CopyFrom(
ctx,
pgx.Identifier{"domains"},
[]string{"host", "status", "discovered_at", "tld"},
pgx.CopyFromRows(rows),
)
conn.Release()
if err != nil {
// Fall back to individual inserts with ON CONFLICT
for _, d := range domains {
c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, tld)
VALUES ($1, 'unchecked', $2, $3)
ON CONFLICT(host) DO NOTHING
`, d.host, now, d.tld)
}
imported = int64(len(domains))
}
batchCount++
totalImported += imported
totalImported += int(imported)
atomic.AddInt32(&c.domainsImported, int32(imported))
// Wait 1 second before the next batch
@@ -304,7 +326,6 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
scanner.Buffer(buf, 1024*1024)
now := time.Now()
nowStr := now.Format("2006-01-02 15:04:05")
count := 0
const batchSize = 1000
@@ -336,28 +357,21 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
break
}
// Build bulk INSERT statement
var sb strings.Builder
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
args := make([]interface{}, 0, len(domains)*4)
for i, d := range domains {
if i > 0 {
sb.WriteString(",")
// Insert with ON CONFLICT
for _, d := range domains {
result, err := c.db.Exec(`
INSERT INTO domains (host, status, discovered_at, tld)
VALUES ($1, 'unchecked', $2, $3)
ON CONFLICT(host) DO NOTHING
`, d.host, now, d.tld)
if err != nil {
skipped++
} else if result > 0 {
imported++
} else {
skipped++
}
sb.WriteString("(?, 'unchecked', ?, ?)")
args = append(args, d.host, nowStr, d.tld)
}
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
// Execute bulk insert
result, execErr := c.db.Exec(sb.String(), args...)
if execErr != nil {
skipped += len(domains)
continue
}
rowsAffected, _ := result.RowsAffected()
imported += int(rowsAffected)
skipped += len(domains) - int(rowsAffected)
if limit > 0 && count >= limit {
break
@@ -370,18 +384,3 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
return imported, skipped, nil
}
// Helper functions for SQL null handling
func nullTime(t time.Time) sql.NullTime {
if t.IsZero() {
return sql.NullTime{}
}
return sql.NullTime{Time: t, Valid: true}
}
// nullString wraps s in a sql.NullString, mapping "" to SQL NULL.
func nullString(s string) sql.NullString {
	if s != "" {
		return sql.NullString{String: s, Valid: true}
	}
	return sql.NullString{}
}