crawler/db.go
primal 75835d771d Add AT Protocol publishing, media support, and SQLite stability
Publishing:
- Add publisher.go for posting feed items to AT Protocol PDS
- Support deterministic rkeys from SHA256(guid + discoveredAt); see the sketch below
- Handle multiple URLs in posts with facets for each link
- Image embed support (app.bsky.embed.images) for up to 4 images
- External embed with thumbnail fallback
- Podcast/audio enclosure URLs included in post text
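
A minimal sketch of the rkey derivation, assuming hex encoding truncated to 16 characters (the exact timestamp format and encoding used in publisher.go are not shown here):

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"time"
)

// rkeyFor derives a stable record key from an item's guid and discovery time.
// The concatenation follows the description above; the RFC 3339 timestamp
// format and the hex truncation are assumptions.
func rkeyFor(guid string, discoveredAt time.Time) string {
	sum := sha256.Sum256([]byte(guid + discoveredAt.UTC().Format(time.RFC3339)))
	return hex.EncodeToString(sum[:])[:16] // hex characters are valid in AT Protocol rkeys
}

func main() {
	fmt.Println(rkeyFor("https://example.com/post-1", time.Unix(1700000000, 0)))
}

Because the key is stable for a given item, re-publishing the same item targets the same record instead of creating a duplicate.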

Media extraction:
- Parse RSS enclosures (audio, video, images); a parsing sketch follows this list
- Extract Media RSS content and thumbnails
- Extract images from HTML content in descriptions
- Store enclosure and imageUrls in items table
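
A rough sketch of the item-level media parsing with encoding/xml; the struct is illustrative and much smaller than the crawler's real types:

package main

import (
	"encoding/xml"
	"fmt"
)

// rssItem models only the media-related parts of an RSS <item>.
type rssItem struct {
	Enclosure struct {
		URL    string `xml:"url,attr"`
		Type   string `xml:"type,attr"`
		Length int64  `xml:"length,attr"`
	} `xml:"enclosure"`
	MediaContents []struct {
		URL  string `xml:"url,attr"`
		Type string `xml:"type,attr"`
	} `xml:"http://search.yahoo.com/mrss/ content"`
	MediaThumbnails []struct {
		URL string `xml:"url,attr"`
	} `xml:"http://search.yahoo.com/mrss/ thumbnail"`
}

func main() {
	const sample = `<item><enclosure url="https://example.com/ep1.mp3" type="audio/mpeg" length="1234"/></item>`
	var it rssItem
	if err := xml.Unmarshal([]byte(sample), &it); err != nil {
		panic(err)
	}
	fmt.Println(it.Enclosure.URL, it.Enclosure.Type, it.Enclosure.Length)
}

Images found in the HTML of descriptions are a separate extraction pass and end up in the imageUrls JSON column on the items table.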

SQLite stability improvements:
- Add synchronous=NORMAL and wal_autocheckpoint pragmas
- Connection pool tuning (idle conns, max lifetime)
- Periodic WAL checkpoint every 5 minutes (maintenance loop sketched after this list)
- Hourly integrity checks with PRAGMA quick_check
- Daily hot backup via VACUUM INTO
- Docker stop_grace_period: 30s for graceful shutdown
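
The checkpoint, integrity-check, and backup loops live outside db.go; a condensed sketch of the pattern, with the intervals, database path, and backup path as placeholders:

package main

import (
	"database/sql"
	"fmt"
	"log"
	"os"
	"time"

	_ "modernc.org/sqlite"
)

// maintainDB is an illustrative maintenance loop, not the crawler's actual code.
func maintainDB(db *sql.DB, backupPath string) {
	checkpoint := time.NewTicker(5 * time.Minute)
	integrity := time.NewTicker(time.Hour)
	backup := time.NewTicker(24 * time.Hour)
	for {
		select {
		case <-checkpoint.C:
			// Fold the WAL back into the main database file.
			var busy, logFrames, checkpointed int
			if err := db.QueryRow("PRAGMA wal_checkpoint(TRUNCATE)").Scan(&busy, &logFrames, &checkpointed); err != nil {
				log.Printf("wal_checkpoint: %v", err)
			}
		case <-integrity.C:
			var result string
			if err := db.QueryRow("PRAGMA quick_check").Scan(&result); err != nil || result != "ok" {
				log.Printf("quick_check: result=%q err=%v", result, err)
			}
		case <-backup.C:
			// VACUUM INTO refuses to overwrite, so clear the previous backup first.
			os.Remove(backupPath)
			if _, err := db.Exec(fmt.Sprintf("VACUUM INTO '%s'", backupPath)); err != nil {
				log.Printf("backup: %v", err)
			}
		}
	}
}

func main() {
	// Placeholder paths; the crawler's real config supplies these.
	db, err := sql.Open("sqlite", "crawler.db?_pragma=journal_mode(WAL)")
	if err != nil {
		log.Fatal(err)
	}
	go maintainDB(db, "crawler-backup.db")
	select {} // block forever; real code ties this to shutdown handling
}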

Dashboard:
- Feed publishing UI and API endpoints
- Account creation with invite codes (see the sketch below)
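
Account creation ultimately goes through the PDS; a minimal sketch against the standard com.atproto.server.createAccount XRPC endpoint, with the host and all field values as placeholders (the dashboard's own endpoint names are not shown here):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// createAccount registers a new account on the PDS using an invite code.
func createAccount(pdsHost, handle, email, password, inviteCode string) error {
	body, _ := json.Marshal(map[string]string{
		"handle":     handle,
		"email":      email,
		"password":   password,
		"inviteCode": inviteCode,
	})
	resp, err := http.Post(pdsHost+"/xrpc/com.atproto.server.createAccount",
		"application/json", bytes.NewReader(body))
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("createAccount failed: %s", resp.Status)
	}
	return nil
}

func main() {
	// Placeholder values; the dashboard collects these from its own form/API.
	err := createAccount("https://pds.example.com", "feed.example.com", "ops@example.com", "app-password", "invite-code-123")
	fmt.Println(err)
}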

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-28 15:30:02 -05:00

package main

import (
"database/sql"
"fmt"
"time"
_ "modernc.org/sqlite"
)
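
// schema creates the crawler's tables, indexes, FTS5 full-text search tables,
// and the triggers that keep the FTS indexes in sync. Every statement uses
// IF NOT EXISTS, so executing it on every startup is safe.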
const schema = `
CREATE TABLE IF NOT EXISTS domains (
host TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'unchecked',
discoveredAt DATETIME NOT NULL,
lastCrawledAt DATETIME,
feedsFound INTEGER DEFAULT 0,
lastError TEXT,
tld TEXT
);
CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status);
CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld);
CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WHERE feedsFound > 0;
CREATE TABLE IF NOT EXISTS feeds (
url TEXT PRIMARY KEY,
type TEXT,
category TEXT DEFAULT 'main',
title TEXT,
description TEXT,
language TEXT,
siteUrl TEXT,
discoveredAt DATETIME NOT NULL,
lastCrawledAt DATETIME,
nextCrawlAt DATETIME,
lastBuildDate DATETIME,
etag TEXT,
lastModified TEXT,
ttlMinutes INTEGER,
updatePeriod TEXT,
updateFreq INTEGER,
status TEXT DEFAULT 'active',
errorCount INTEGER DEFAULT 0,
lastError TEXT,
lastErrorAt DATETIME,
sourceUrl TEXT,
sourceHost TEXT,
tld TEXT,
itemCount INTEGER,
avgPostFreqHrs REAL,
oldestItemDate DATETIME,
newestItemDate DATETIME,
noUpdate INTEGER DEFAULT 0,
-- Publishing to PDS
publishStatus TEXT DEFAULT 'held' CHECK(publishStatus IN ('held', 'pass', 'fail')),
publishAccount TEXT
);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_publishStatus ON feeds(publishStatus);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url);
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
CREATE INDEX IF NOT EXISTS idx_feeds_category ON feeds(category);
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt);
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
CREATE TABLE IF NOT EXISTS items (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feedUrl TEXT NOT NULL,
guid TEXT,
title TEXT,
link TEXT,
description TEXT,
content TEXT,
author TEXT,
pubDate DATETIME,
discoveredAt DATETIME NOT NULL,
updatedAt DATETIME,
-- Media attachments
enclosureUrl TEXT,
enclosureType TEXT,
enclosureLength INTEGER,
imageUrls TEXT, -- JSON array of image URLs
-- Publishing to PDS
publishedAt DATETIME,
publishedUri TEXT,
UNIQUE(feedUrl, guid)
);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl);
CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC);
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC);
CREATE INDEX IF NOT EXISTS idx_items_unpublished ON items(feedUrl, publishedAt) WHERE publishedAt IS NULL;
-- Full-text search for feeds
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
url,
title,
description,
content='feeds',
content_rowid='rowid'
);
-- Triggers to keep FTS in sync
CREATE TRIGGER IF NOT EXISTS feeds_ai AFTER INSERT ON feeds BEGIN
INSERT INTO feeds_fts(rowid, url, title, description)
VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
END;
CREATE TRIGGER IF NOT EXISTS feeds_ad AFTER DELETE ON feeds BEGIN
INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
END;
CREATE TRIGGER IF NOT EXISTS feeds_au AFTER UPDATE ON feeds BEGIN
INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
INSERT INTO feeds_fts(rowid, url, title, description)
VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
END;
-- Full-text search for items
CREATE VIRTUAL TABLE IF NOT EXISTS items_fts USING fts5(
title,
description,
content,
author,
content='items',
content_rowid='id'
);
-- Triggers to keep items FTS in sync
CREATE TRIGGER IF NOT EXISTS items_ai AFTER INSERT ON items BEGIN
INSERT INTO items_fts(rowid, title, description, content, author)
VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
END;
CREATE TRIGGER IF NOT EXISTS items_ad AFTER DELETE ON items BEGIN
INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
END;
CREATE TRIGGER IF NOT EXISTS items_au AFTER UPDATE ON items BEGIN
INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
INSERT INTO items_fts(rowid, title, description, content, author)
VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
END;
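-- Example full-text query (illustrative; the search terms are placeholders):
--   SELECT rowid, title FROM items_fts WHERE items_fts MATCH 'climate podcast' ORDER BY rank;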
`
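
// OpenDatabase opens (or creates) the SQLite database at dbPath with WAL mode
// and related pragmas set via the connection string, tunes the connection pool,
// ensures the schema exists, applies lightweight column migrations, and runs
// COUNT/ANALYZE work in the background so startup is not blocked on large
// databases.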
func OpenDatabase(dbPath string) (*sql.DB, error) {
fmt.Printf("Opening database: %s\n", dbPath)
// Use pragmas in connection string for consistent application
// - busy_timeout: wait up to 10s for locks instead of failing immediately
// - journal_mode: WAL for better concurrency and crash recovery
// - synchronous: NORMAL is safe with WAL (fsync at checkpoint, not every commit)
// - wal_autocheckpoint: checkpoint every 1000 pages (~4MB) to prevent WAL bloat
// - foreign_keys: enforce referential integrity
connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=wal_autocheckpoint(1000)&_pragma=foreign_keys(ON)"
db, err := sql.Open("sqlite", connStr)
if err != nil {
return nil, fmt.Errorf("failed to open database: %v", err)
}
// Connection pool settings for stability
db.SetMaxOpenConns(4) // Limit concurrent connections
db.SetMaxIdleConns(2) // Keep some connections warm
db.SetConnMaxLifetime(5 * time.Minute) // Recycle connections periodically
db.SetConnMaxIdleTime(1 * time.Minute) // Close idle connections
// Verify connection and show journal mode
var journalMode string
if err := db.QueryRow("PRAGMA journal_mode").Scan(&journalMode); err != nil {
fmt.Printf(" Warning: could not query journal_mode: %v\n", err)
} else {
fmt.Printf(" Journal mode: %s\n", journalMode)
}
// Create schema
if _, err := db.Exec(schema); err != nil {
db.Close()
return nil, fmt.Errorf("failed to create schema: %v", err)
}
fmt.Println(" Schema OK")
// Migrations for existing databases
migrations := []string{
"ALTER TABLE items ADD COLUMN enclosureUrl TEXT",
"ALTER TABLE items ADD COLUMN enclosureType TEXT",
"ALTER TABLE items ADD COLUMN enclosureLength INTEGER",
"ALTER TABLE items ADD COLUMN imageUrls TEXT",
}
for _, m := range migrations {
db.Exec(m) // Ignore errors (column may already exist)
}
// Run stats and ANALYZE in background to avoid blocking startup with large databases
go func() {
var domainCount, feedCount int
db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&domainCount)
db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&feedCount)
fmt.Printf(" Existing data: %d domains, %d feeds\n", domainCount, feedCount)
fmt.Println(" Running ANALYZE...")
if _, err := db.Exec("ANALYZE"); err != nil {
fmt.Printf(" Warning: ANALYZE failed: %v\n", err)
} else {
fmt.Println(" ANALYZE complete")
}
}()
return db, nil
}