Files
crawler/main.go
primal 02378950f4 Add bulk import for CDX feeds
- Add BulkImportFeedsFromTSV() for fast direct insertion from TSV
- Skip HTTP verification, insert feeds with status='hold'
- Check for pending TSV files on startup and auto-import
- Imported 4.7M feeds in ~26 minutes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 10:42:52 -05:00
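
The bulk-import routine named in this commit lives outside main.go; only the startup hook is visible in the file below. A rough, hypothetical sketch of what BulkImportFeedsFromTSV() could look like, assuming a database/sql handle, a feeds(url, status) table, and Postgres-style placeholders (none of which are confirmed by this file):

package main

import (
	"bufio"
	"database/sql"
	"os"
	"strings"
)

// bulkImportFeedsFromTSV is an illustrative stand-in for the BulkImportFeedsFromTSV()
// added by this commit: it reads one candidate feed URL per TSV line and inserts it
// directly with status='hold', skipping HTTP verification. The feeds(url, status)
// table and the $1 placeholder syntax are assumptions, not the real schema.
func bulkImportFeedsFromTSV(db *sql.DB, path string) (int64, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	// A single transaction keeps millions of small inserts fast.
	tx, err := db.Begin()
	if err != nil {
		return 0, err
	}
	defer tx.Rollback() // no-op once Commit has succeeded

	stmt, err := tx.Prepare(
		"INSERT INTO feeds (url, status) VALUES ($1, 'hold') ON CONFLICT (url) DO NOTHING")
	if err != nil {
		return 0, err
	}
	defer stmt.Close()

	var imported int64
	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) // tolerate long TSV lines
	for scanner.Scan() {
		fields := strings.Split(scanner.Text(), "\t")
		if len(fields) == 0 || fields[0] == "" {
			continue // skip blank lines
		}
		if _, err := stmt.Exec(fields[0]); err != nil {
			return imported, err
		}
		imported++
	}
	if err := scanner.Err(); err != nil {
		return imported, err
	}
	return imported, tx.Commit()
}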

55 lines
1.3 KiB
Go

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	// Connection string from environment (DATABASE_URL or DB_* vars)
	crawler, err := NewCrawler("")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
		os.Exit(1)
	}

	// Setup graceful shutdown
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	// Check for CDX TSV files to bulk import on startup
	crawler.checkAndImportCDXFeeds()

	// Start background loops
	fmt.Println("Starting crawler loops...")

	// Stats loop - updates crawler_stats table every second
	go crawler.StartStatsLoop()

	// CDX monthly import - discovers feeds from Common Crawl on 1st of each month
	go crawler.StartCDXMonthlyLoop()

	// Feed check loop - re-checks known feeds for new items
	go crawler.StartFeedCheckLoop()

	// Cleanup loop - removes items older than 12 months (weekly)
	go crawler.StartCleanupLoop()

	// Maintenance loop - ANALYZE hourly, VACUUM daily
	go crawler.StartMaintenanceLoop()

	// Wait for shutdown signal
	sig := <-sigChan
	fmt.Printf("\nReceived %v, shutting down gracefully...\n", sig)

	// Close crawler (checkpoints WAL and closes database)
	if err := crawler.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "Error closing crawler: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Shutdown complete")
}
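
The checkAndImportCDXFeeds() call near the top of main() is defined elsewhere in the package. A minimal sketch of what it might do, under the assumptions stated in the comments (the glob directory, the rename-on-success convention, and the importer's signature are all illustrative, not taken from the repository):

// Illustrative sketch of the startup check called from main(); the real method lives
// elsewhere in this package. The cdx-feeds/ directory, the .done rename convention,
// and the (count, error) signature of BulkImportFeedsFromTSV are assumptions.
// Needs "path/filepath" in addition to the imports main.go already uses.
func (c *Crawler) checkAndImportCDXFeeds() {
	pending, err := filepath.Glob("cdx-feeds/*.tsv")
	if err != nil || len(pending) == 0 {
		return // nothing pending; continue normal startup
	}
	for _, path := range pending {
		n, err := c.BulkImportFeedsFromTSV(path)
		if err != nil {
			fmt.Fprintf(os.Stderr, "CDX import of %s failed after %d feeds: %v\n", path, n, err)
			continue
		}
		fmt.Printf("Imported %d feeds from %s\n", n, path)
		// Rename the file so it is not re-imported on the next start.
		if err := os.Rename(path, path+".done"); err != nil {
			fmt.Fprintf(os.Stderr, "Could not rename %s: %v\n", path, err)
		}
	}
}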