- Add BulkImportFeedsFromTSV() for fast direct insertion from TSV
- Skip HTTP verification, insert feeds with status='hold'
- Check for pending TSV files on startup and auto-import
- Imported 4.7M feeds in ~26 minutes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	// Connection string from environment (DATABASE_URL or DB_* vars)
	crawler, err := NewCrawler("")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
		os.Exit(1)
	}

	// Setup graceful shutdown
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	// Check for CDX TSV files to bulk import on startup
	crawler.checkAndImportCDXFeeds()

	// Start background loops
	fmt.Println("Starting crawler loops...")

	// Stats loop - updates crawler_stats table every second
	go crawler.StartStatsLoop()

	// CDX monthly import - discovers feeds from Common Crawl on 1st of each month
	go crawler.StartCDXMonthlyLoop()

	// Feed check loop - re-checks known feeds for new items
	go crawler.StartFeedCheckLoop()

	// Cleanup loop - removes items older than 12 months (weekly)
	go crawler.StartCleanupLoop()

	// Maintenance loop - ANALYZE hourly, VACUUM daily
	go crawler.StartMaintenanceLoop()

	// Wait for shutdown signal
	sig := <-sigChan
	fmt.Printf("\nReceived %v, shutting down gracefully...\n", sig)

	// Close crawler (checkpoints WAL and closes database)
	if err := crawler.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "Error closing crawler: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Shutdown complete")
}
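
For reference, a minimal sketch of the bulk-import path described in the commit message. The Crawler fields, the feeds table columns, the SQLite-style ? placeholders, and the glob pattern are all assumptions, not details taken from the repository; only the behaviour (read TSV rows, insert with status='hold', skip HTTP verification, scan for pending TSV files on startup) comes from the commit message above.

// bulkimport_sketch.go - illustrative only; the real Crawler, schema, and
// file layout live elsewhere in the repository and may differ.
package crawler

import (
	"bufio"
	"database/sql"
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// Crawler is a minimal stand-in for the real type constructed by NewCrawler.
type Crawler struct {
	db *sql.DB
}

// BulkImportFeedsFromTSV reads one feed URL per line (first tab-separated
// column) and inserts it directly with status='hold', skipping any HTTP
// verification. Returns the number of rows inserted.
func (c *Crawler) BulkImportFeedsFromTSV(path string) (int64, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	tx, err := c.db.Begin()
	if err != nil {
		return 0, err
	}
	// Column names and placeholder style are assumptions (SQLite-like driver).
	stmt, err := tx.Prepare(`INSERT INTO feeds (url, status) VALUES (?, 'hold')`)
	if err != nil {
		tx.Rollback()
		return 0, err
	}
	defer stmt.Close()

	var n int64
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		fields := strings.Split(scanner.Text(), "\t")
		if fields[0] == "" {
			continue // skip blank lines; duplicate handling is omitted here
		}
		if _, err := stmt.Exec(fields[0]); err != nil {
			tx.Rollback()
			return 0, err
		}
		n++
	}
	if err := scanner.Err(); err != nil {
		tx.Rollback()
		return 0, err
	}
	return n, tx.Commit()
}

// checkAndImportCDXFeeds scans for pending TSV files on startup and imports
// each one. The glob pattern is a placeholder, not the repository's actual one.
func (c *Crawler) checkAndImportCDXFeeds() {
	paths, _ := filepath.Glob("cdx-feeds-*.tsv")
	for _, p := range paths {
		n, err := c.BulkImportFeedsFromTSV(p)
		if err != nil {
			fmt.Fprintf(os.Stderr, "bulk import of %s failed: %v\n", p, err)
			continue
		}
		fmt.Printf("Imported %d feeds from %s (status='hold')\n", n, p)
	}
}

Batching every row into a single transaction with one prepared statement is what makes a multi-million-row import feasible in this style; committing per row would be orders of magnitude slower.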