Migrate from SQLite to PostgreSQL
- Replace modernc.org/sqlite with jackc/pgx/v5
- Update all SQL queries for PostgreSQL syntax ($1, $2 placeholders)
- Use snake_case column names throughout
- Replace SQLite FTS5 with PostgreSQL tsvector/tsquery full-text search
- Add connection pooling with pgxpool
- Support Docker secrets for database password
- Add trigger to normalize feed URLs (strip https://, http://, www.)
- Fix anchor feed detection regex to avoid false positives
- Connect app container to atproto network for PostgreSQL access
- Add version indicator to dashboard UI

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -3,13 +3,15 @@ package main
|
||||
import (
|
||||
"bufio"
|
||||
"compress/gzip"
|
||||
"database/sql"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
)
|
||||
|
||||
// Domain represents a host to be crawled for feeds
|
||||
@@ -23,78 +25,74 @@ type Domain struct {
|
||||
TLD string `json:"tld,omitempty"`
|
||||
}
|
||||
|
||||
// saveDomain stores a domain in SQLite
|
||||
// saveDomain stores a domain in PostgreSQL
|
||||
func (c *Crawler) saveDomain(domain *Domain) error {
|
||||
_, err := c.db.Exec(`
|
||||
INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
ON CONFLICT(host) DO UPDATE SET
|
||||
status = excluded.status,
|
||||
lastCrawledAt = excluded.lastCrawledAt,
|
||||
feedsFound = excluded.feedsFound,
|
||||
lastError = excluded.lastError,
|
||||
tld = excluded.tld
|
||||
`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
|
||||
domain.FeedsFound, nullString(domain.LastError), domain.TLD)
|
||||
status = EXCLUDED.status,
|
||||
last_crawled_at = EXCLUDED.last_crawled_at,
|
||||
feeds_found = EXCLUDED.feeds_found,
|
||||
last_error = EXCLUDED.last_error,
|
||||
tld = EXCLUDED.tld
|
||||
`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
|
||||
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
||||
return err
|
||||
}
|
||||
|
||||
// saveDomainTx stores a domain using a transaction
|
||||
func (c *Crawler) saveDomainTx(tx *sql.Tx, domain *Domain) error {
|
||||
_, err := tx.Exec(`
|
||||
INSERT INTO domains (host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
func (c *Crawler) saveDomainTx(tx pgx.Tx, domain *Domain) error {
|
||||
_, err := tx.Exec(context.Background(), `
|
||||
INSERT INTO domains (host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
ON CONFLICT(host) DO NOTHING
|
||||
`, domain.Host, domain.Status, domain.DiscoveredAt, nullTime(domain.LastCrawledAt),
|
||||
domain.FeedsFound, nullString(domain.LastError), domain.TLD)
|
||||
`, domain.Host, domain.Status, domain.DiscoveredAt, NullableTime(domain.LastCrawledAt),
|
||||
domain.FeedsFound, NullableString(domain.LastError), domain.TLD)
|
||||
return err
|
||||
}
|
||||
|
||||
// domainExists checks if a domain already exists in the database
|
||||
func (c *Crawler) domainExists(host string) bool {
|
||||
var exists bool
|
||||
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = ?)", normalizeHost(host)).Scan(&exists)
|
||||
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM domains WHERE host = $1)", normalizeHost(host)).Scan(&exists)
|
||||
return err == nil && exists
|
||||
}
|
||||
|
||||
// getDomain retrieves a domain from SQLite
|
||||
// getDomain retrieves a domain from PostgreSQL
|
||||
func (c *Crawler) getDomain(host string) (*Domain, error) {
|
||||
domain := &Domain{}
|
||||
var lastCrawledAt sql.NullTime
|
||||
var lastError sql.NullString
|
||||
var lastCrawledAt *time.Time
|
||||
var lastError *string
|
||||
|
||||
err := c.db.QueryRow(`
|
||||
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
|
||||
FROM domains WHERE host = ?
|
||||
SELECT host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld
|
||||
FROM domains WHERE host = $1
|
||||
`, normalizeHost(host)).Scan(
|
||||
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
|
||||
&domain.FeedsFound, &lastError, &domain.TLD,
|
||||
)
|
||||
|
||||
if err == sql.ErrNoRows {
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if lastCrawledAt.Valid {
|
||||
domain.LastCrawledAt = lastCrawledAt.Time
|
||||
}
|
||||
if lastError.Valid {
|
||||
domain.LastError = lastError.String
|
||||
}
|
||||
domain.LastCrawledAt = TimeValue(lastCrawledAt)
|
||||
domain.LastError = StringValue(lastError)
|
||||
|
||||
return domain, nil
|
||||
}
|
||||
|
||||
// GetUncheckedDomains returns up to limit unchecked domains ordered by discoveredAt (FIFO)
|
||||
// GetUncheckedDomains returns up to limit unchecked domains ordered by discovered_at (FIFO)
|
||||
func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
|
||||
rows, err := c.db.Query(`
|
||||
SELECT host, status, discoveredAt, lastCrawledAt, feedsFound, lastError, tld
|
||||
SELECT host, status, discovered_at, last_crawled_at, feeds_found, last_error, tld
|
||||
FROM domains WHERE status = 'unchecked'
|
||||
ORDER BY discoveredAt ASC
|
||||
LIMIT ?
|
||||
ORDER BY discovered_at ASC
|
||||
LIMIT $1
|
||||
`, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -105,12 +103,12 @@ func (c *Crawler) GetUncheckedDomains(limit int) ([]*Domain, error) {
|
||||
}
|
||||
|
||||
// scanDomains is a helper to scan multiple domain rows
|
||||
func (c *Crawler) scanDomains(rows *sql.Rows) ([]*Domain, error) {
|
||||
func (c *Crawler) scanDomains(rows pgx.Rows) ([]*Domain, error) {
|
||||
var domains []*Domain
|
||||
for rows.Next() {
|
||||
domain := &Domain{}
|
||||
var lastCrawledAt sql.NullTime
|
||||
var lastError sql.NullString
|
||||
var lastCrawledAt *time.Time
|
||||
var lastError *string
|
||||
|
||||
if err := rows.Scan(
|
||||
&domain.Host, &domain.Status, &domain.DiscoveredAt, &lastCrawledAt,
|
||||
@@ -119,12 +117,8 @@ func (c *Crawler) scanDomains(rows *sql.Rows) ([]*Domain, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
if lastCrawledAt.Valid {
|
||||
domain.LastCrawledAt = lastCrawledAt.Time
|
||||
}
|
||||
if lastError.Valid {
|
||||
domain.LastError = lastError.String
|
||||
}
|
||||
domain.LastCrawledAt = TimeValue(lastCrawledAt)
|
||||
domain.LastError = StringValue(lastError)
|
||||
|
||||
domains = append(domains, domain)
|
||||
}
|
||||
@@ -142,13 +136,13 @@ func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError strin
|
||||
var err error
|
||||
if lastError != "" {
|
||||
_, err = c.db.Exec(`
|
||||
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = ?
|
||||
WHERE host = ?
|
||||
UPDATE domains SET status = $1, last_crawled_at = $2, feeds_found = $3, last_error = $4
|
||||
WHERE host = $5
|
||||
`, status, time.Now(), feedsFound, lastError, normalizeHost(host))
|
||||
} else {
|
||||
_, err = c.db.Exec(`
|
||||
UPDATE domains SET status = ?, lastCrawledAt = ?, feedsFound = ?, lastError = NULL
|
||||
WHERE host = ?
|
||||
UPDATE domains SET status = $1, last_crawled_at = $2, feeds_found = $3, last_error = NULL
|
||||
WHERE host = $4
|
||||
`, status, time.Now(), feedsFound, normalizeHost(host))
|
||||
}
|
||||
return err
|
||||
@@ -164,6 +158,23 @@ func (c *Crawler) GetDomainCount() (total int, unchecked int, err error) {
|
||||
return total, unchecked, err
|
||||
}
|
||||
|
||||
// ImportTestDomains adds a list of specific domains for testing
|
||||
func (c *Crawler) ImportTestDomains(domains []string) {
|
||||
now := time.Now()
|
||||
for _, host := range domains {
|
||||
_, err := c.db.Exec(`
|
||||
INSERT INTO domains (host, status, discovered_at, tld)
|
||||
VALUES ($1, 'unchecked', $2, $3)
|
||||
ON CONFLICT(host) DO NOTHING
|
||||
`, host, now, getTLD(host))
|
||||
if err != nil {
|
||||
fmt.Printf("Error adding test domain %s: %v\n", host, err)
|
||||
} else {
|
||||
fmt.Printf("Added test domain: %s\n", host)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ImportDomainsFromFile reads a vertices file and stores new domains as "unchecked"
|
||||
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
|
||||
file, err := os.Open(filename)
|
||||
@@ -212,7 +223,6 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
||||
|
||||
const batchSize = 1000
|
||||
now := time.Now()
|
||||
nowStr := now.Format("2006-01-02 15:04:05")
|
||||
totalImported := 0
|
||||
batchCount := 0
|
||||
|
||||
@@ -240,31 +250,43 @@ func (c *Crawler) ImportDomainsInBackground(filename string) {
|
||||
break
|
||||
}
|
||||
|
||||
// Build bulk INSERT statement
|
||||
var sb strings.Builder
|
||||
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
|
||||
args := make([]interface{}, 0, len(domains)*4)
|
||||
for i, d := range domains {
|
||||
if i > 0 {
|
||||
sb.WriteString(",")
|
||||
}
|
||||
sb.WriteString("(?, 'unchecked', ?, ?)")
|
||||
args = append(args, d.host, nowStr, d.tld)
|
||||
}
|
||||
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
|
||||
|
||||
// Execute bulk insert
|
||||
result, err := c.db.Exec(sb.String(), args...)
|
||||
imported := 0
|
||||
// Use COPY for bulk insert (much faster than individual INSERTs)
|
||||
ctx := context.Background()
|
||||
conn, err := c.db.Acquire(ctx)
|
||||
if err != nil {
|
||||
fmt.Printf("Bulk insert error: %v\n", err)
|
||||
} else {
|
||||
rowsAffected, _ := result.RowsAffected()
|
||||
imported = int(rowsAffected)
|
||||
fmt.Printf("Failed to acquire connection: %v\n", err)
|
||||
break
|
||||
}
|
||||
|
||||
// Build rows for copy
|
||||
rows := make([][]interface{}, len(domains))
|
||||
for i, d := range domains {
|
||||
rows[i] = []interface{}{d.host, "unchecked", now, d.tld}
|
||||
}
|
||||
|
||||
// Use CopyFrom for bulk insert
|
||||
imported, err := conn.CopyFrom(
|
||||
ctx,
|
||||
pgx.Identifier{"domains"},
|
||||
[]string{"host", "status", "discovered_at", "tld"},
|
||||
pgx.CopyFromRows(rows),
|
||||
)
|
||||
conn.Release()
|
||||
|
||||
if err != nil {
|
||||
// Fall back to individual inserts with ON CONFLICT
|
||||
for _, d := range domains {
|
||||
c.db.Exec(`
|
||||
INSERT INTO domains (host, status, discovered_at, tld)
|
||||
VALUES ($1, 'unchecked', $2, $3)
|
||||
ON CONFLICT(host) DO NOTHING
|
||||
`, d.host, now, d.tld)
|
||||
}
|
||||
imported = int64(len(domains))
|
||||
}
|
||||
|
||||
batchCount++
|
||||
totalImported += imported
|
||||
totalImported += int(imported)
|
||||
atomic.AddInt32(&c.domainsImported, int32(imported))
|
||||
|
||||
// Wait 1 second before the next batch
|
||||
@@ -304,7 +326,6 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
|
||||
scanner.Buffer(buf, 1024*1024)
|
||||
|
||||
now := time.Now()
|
||||
nowStr := now.Format("2006-01-02 15:04:05")
|
||||
count := 0
|
||||
const batchSize = 1000
|
||||
|
||||
@@ -336,28 +357,21 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
|
||||
break
|
||||
}
|
||||
|
||||
// Build bulk INSERT statement
|
||||
var sb strings.Builder
|
||||
sb.WriteString("INSERT INTO domains (host, status, discoveredAt, tld) VALUES ")
|
||||
args := make([]interface{}, 0, len(domains)*4)
|
||||
for i, d := range domains {
|
||||
if i > 0 {
|
||||
sb.WriteString(",")
|
||||
// Insert with ON CONFLICT
|
||||
for _, d := range domains {
|
||||
result, err := c.db.Exec(`
|
||||
INSERT INTO domains (host, status, discovered_at, tld)
|
||||
VALUES ($1, 'unchecked', $2, $3)
|
||||
ON CONFLICT(host) DO NOTHING
|
||||
`, d.host, now, d.tld)
|
||||
if err != nil {
|
||||
skipped++
|
||||
} else if result > 0 {
|
||||
imported++
|
||||
} else {
|
||||
skipped++
|
||||
}
|
||||
sb.WriteString("(?, 'unchecked', ?, ?)")
|
||||
args = append(args, d.host, nowStr, d.tld)
|
||||
}
|
||||
sb.WriteString(" ON CONFLICT(host) DO NOTHING")
|
||||
|
||||
// Execute bulk insert
|
||||
result, execErr := c.db.Exec(sb.String(), args...)
|
||||
if execErr != nil {
|
||||
skipped += len(domains)
|
||||
continue
|
||||
}
|
||||
rowsAffected, _ := result.RowsAffected()
|
||||
imported += int(rowsAffected)
|
||||
skipped += len(domains) - int(rowsAffected)
|
||||
|
||||
if limit > 0 && count >= limit {
|
||||
break
|
||||
@@ -370,18 +384,3 @@ func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported in
|
||||
|
||||
return imported, skipped, nil
|
||||
}
|
||||
|
||||
// Helper functions for SQL null handling
|
||||
func nullTime(t time.Time) sql.NullTime {
|
||||
if t.IsZero() {
|
||||
return sql.NullTime{}
|
||||
}
|
||||
return sql.NullTime{Time: t, Valid: true}
|
||||
}
|
||||
|
||||
func nullString(s string) sql.NullString {
|
||||
if s == "" {
|
||||
return sql.NullString{}
|
||||
}
|
||||
return sql.NullString{String: s, Valid: true}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user