// File: crawler/db.go
// Last modified: 2026-01-26 16:02:05 -05:00
// 193 lines, 5.6 KiB, Go
package main
import (
"database/sql"
"fmt"
_ "modernc.org/sqlite"
)
// schema is the idempotent DDL applied on every startup: the crawl-state
// tables (domains, feeds, items), their query indexes, and FTS5 full-text
// indexes for feeds and items kept in sync with the base tables via
// external-content ('delete' command) triggers. Every statement uses
// IF NOT EXISTS, so executing it against an existing database is safe.
const schema = `
CREATE TABLE IF NOT EXISTS domains (
host TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'unchecked',
discoveredAt DATETIME NOT NULL,
lastCrawledAt DATETIME,
feedsFound INTEGER DEFAULT 0,
lastError TEXT,
tld TEXT
);
CREATE INDEX IF NOT EXISTS idx_domains_status ON domains(status);
CREATE INDEX IF NOT EXISTS idx_domains_tld ON domains(tld);
CREATE INDEX IF NOT EXISTS idx_domains_feedsFound ON domains(feedsFound DESC) WHERE feedsFound > 0;
CREATE TABLE IF NOT EXISTS feeds (
url TEXT PRIMARY KEY,
type TEXT,
title TEXT,
description TEXT,
language TEXT,
siteUrl TEXT,
discoveredAt DATETIME NOT NULL,
lastCrawledAt DATETIME,
nextCrawlAt DATETIME,
lastBuildDate DATETIME,
etag TEXT,
lastModified TEXT,
ttlMinutes INTEGER,
updatePeriod TEXT,
updateFreq INTEGER,
status TEXT DEFAULT 'active',
errorCount INTEGER DEFAULT 0,
lastError TEXT,
lastErrorAt DATETIME,
sourceUrl TEXT,
sourceHost TEXT,
tld TEXT,
itemCount INTEGER,
avgPostFreqHrs REAL,
oldestItemDate DATETIME,
newestItemDate DATETIME,
noUpdate INTEGER DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost ON feeds(sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_sourceHost_url ON feeds(sourceHost, url);
CREATE INDEX IF NOT EXISTS idx_feeds_tld ON feeds(tld);
CREATE INDEX IF NOT EXISTS idx_feeds_tld_sourceHost ON feeds(tld, sourceHost);
CREATE INDEX IF NOT EXISTS idx_feeds_type ON feeds(type);
CREATE INDEX IF NOT EXISTS idx_feeds_status ON feeds(status);
CREATE INDEX IF NOT EXISTS idx_feeds_discoveredAt ON feeds(discoveredAt);
CREATE INDEX IF NOT EXISTS idx_feeds_title ON feeds(title);
CREATE TABLE IF NOT EXISTS items (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feedUrl TEXT NOT NULL,
guid TEXT,
title TEXT,
link TEXT,
description TEXT,
content TEXT,
author TEXT,
pubDate DATETIME,
discoveredAt DATETIME NOT NULL,
updatedAt DATETIME,
UNIQUE(feedUrl, guid)
);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl ON items(feedUrl);
CREATE INDEX IF NOT EXISTS idx_items_pubDate ON items(pubDate DESC);
CREATE INDEX IF NOT EXISTS idx_items_link ON items(link);
CREATE INDEX IF NOT EXISTS idx_items_feedUrl_pubDate ON items(feedUrl, pubDate DESC);
-- Full-text search for feeds
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
url,
title,
description,
content='feeds',
content_rowid='rowid'
);
-- Triggers to keep FTS in sync
CREATE TRIGGER IF NOT EXISTS feeds_ai AFTER INSERT ON feeds BEGIN
INSERT INTO feeds_fts(rowid, url, title, description)
VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
END;
CREATE TRIGGER IF NOT EXISTS feeds_ad AFTER DELETE ON feeds BEGIN
INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
END;
CREATE TRIGGER IF NOT EXISTS feeds_au AFTER UPDATE ON feeds BEGIN
INSERT INTO feeds_fts(feeds_fts, rowid, url, title, description)
VALUES ('delete', OLD.rowid, OLD.url, OLD.title, OLD.description);
INSERT INTO feeds_fts(rowid, url, title, description)
VALUES (NEW.rowid, NEW.url, NEW.title, NEW.description);
END;
-- Full-text search for items
CREATE VIRTUAL TABLE IF NOT EXISTS items_fts USING fts5(
title,
description,
content,
author,
content='items',
content_rowid='id'
);
-- Triggers to keep items FTS in sync
CREATE TRIGGER IF NOT EXISTS items_ai AFTER INSERT ON items BEGIN
INSERT INTO items_fts(rowid, title, description, content, author)
VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
END;
CREATE TRIGGER IF NOT EXISTS items_ad AFTER DELETE ON items BEGIN
INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
END;
CREATE TRIGGER IF NOT EXISTS items_au AFTER UPDATE ON items BEGIN
INSERT INTO items_fts(items_fts, rowid, title, description, content, author)
VALUES ('delete', OLD.id, OLD.title, OLD.description, OLD.content, OLD.author);
INSERT INTO items_fts(rowid, title, description, content, author)
VALUES (NEW.id, NEW.title, NEW.description, NEW.content, NEW.author);
END;
`
// OpenDatabase opens (creating if necessary) the SQLite database at dbPath,
// applies connection pragmas (10s busy timeout, WAL journaling, foreign keys),
// ensures the schema exists, and kicks off a background goroutine that reports
// row counts and runs ANALYZE so startup is not blocked on large databases.
//
// The returned *sql.DB is capped at 4 open connections: WAL mode allows
// concurrent readers, while SQLite itself serializes writes.
// Errors are wrapped with %w so callers can use errors.Is/errors.As.
func OpenDatabase(dbPath string) (*sql.DB, error) {
	fmt.Printf("Opening database: %s\n", dbPath)
	// Pragmas ride in the connection string so every pooled connection
	// gets them applied consistently.
	connStr := dbPath + "?_pragma=busy_timeout(10000)&_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)"
	db, err := sql.Open("sqlite", connStr)
	if err != nil {
		return nil, fmt.Errorf("failed to open database: %w", err)
	}
	// Allow multiple readers (WAL mode supports concurrent reads)
	// SQLite is single-writer, but reads can happen concurrently
	db.SetMaxOpenConns(4)
	// Verify the connection actually works and report the journal mode in use.
	var journalMode string
	if err := db.QueryRow("PRAGMA journal_mode").Scan(&journalMode); err != nil {
		fmt.Printf(" Warning: could not query journal_mode: %v\n", err)
	} else {
		fmt.Printf(" Journal mode: %s\n", journalMode)
	}
	// Create schema (all statements are IF NOT EXISTS, so this is
	// safe to run against an existing database).
	if _, err := db.Exec(schema); err != nil {
		db.Close()
		return nil, fmt.Errorf("failed to create schema: %w", err)
	}
	fmt.Println(" Schema OK")
	// Run stats and ANALYZE in background to avoid blocking startup with
	// large databases. NOTE(review): this goroutine is fire-and-forget; if
	// the caller closes the DB immediately, these queries simply error out.
	go func() {
		var domainCount, feedCount int
		if err := db.QueryRow("SELECT COUNT(*) FROM domains").Scan(&domainCount); err != nil {
			fmt.Printf(" Warning: could not count domains: %v\n", err)
		}
		if err := db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&feedCount); err != nil {
			fmt.Printf(" Warning: could not count feeds: %v\n", err)
		}
		fmt.Printf(" Existing data: %d domains, %d feeds\n", domainCount, feedCount)
		fmt.Println(" Running ANALYZE...")
		if _, err := db.Exec("ANALYZE"); err != nil {
			fmt.Printf(" Warning: ANALYZE failed: %v\n", err)
		} else {
			fmt.Println(" ANALYZE complete")
		}
	}()
	return db, nil
}