crawler/crawler.go
primal d4a1928fa6 Increase feed check parallelism for 1Gbps bandwidth
- Workers: 1000 -> 4000
- Work channel buffer: 1000 -> 4000
- Fetch batch size: 1000 -> 4000
- MaxIdleConns: 100 -> 2000

Should improve throughput from ~15 feeds/sec to ~50-60 feeds/sec.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 12:11:58 -05:00
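
The throughput estimate follows from simple arithmetic: with long-lived requests, steady-state throughput is roughly the worker count divided by the average time to check one feed. A minimal sketch of that back-of-envelope calculation follows; the ~70 s average check time is an assumption picked to match the observed ~15 feeds/sec at 1000 workers, not a measured value.

package main

import "fmt"

func main() {
	const avgCheckSeconds = 70.0 // assumed average time to fetch and parse one feed

	for _, workers := range []int{1000, 4000} {
		// Steady state: each worker completes roughly 1/avgCheckSeconds feeds per second.
		fmt.Printf("%d workers -> ~%.0f feeds/sec\n", workers, float64(workers)/avgCheckSeconds)
	}
}

Under that assumption, 1000 workers gives ~14 feeds/sec and 4000 workers gives ~57 feeds/sec, consistent with the ~50-60 feeds/sec target. The 1-second poll with a 4000-feed batch can enqueue far faster than the workers drain, so worker count and per-feed latency remain the binding limit.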


package main

import (
	"crypto/tls"
	"fmt"
	"net"
	"net/http"
	"sync"
	"sync/atomic"
	"time"
)

// Crawler holds the shared HTTP client, database handle, and shutdown signal
// used by the background loops below.
type Crawler struct {
	UserAgent    string
	client       *http.Client
	feedsMu      sync.Mutex    // protects feed insertion
	feedsChecked int32         // feed_check: feeds checked for new items
	db           *DB
	shutdownCh   chan struct{} // closed on shutdown to signal goroutines
}

// NewCrawler opens the database and configures the shared HTTP client.
func NewCrawler(connString string) (*Crawler, error) {
	db, err := OpenDatabase(connString)
	if err != nil {
		return nil, fmt.Errorf("failed to open database: %v", err)
	}

	// Custom transport with longer timeouts (HTTP/2 disabled for compatibility)
	transport := &http.Transport{
		TLSClientConfig: &tls.Config{
			MinVersion: tls.VersionTLS12,
			NextProtos: []string{"http/1.1"}, // Force HTTP/1.1 for compatibility
		},
		DialContext: (&net.Dialer{
			Timeout:   30 * time.Second,
			KeepAlive: 30 * time.Second,
		}).DialContext,
		ForceAttemptHTTP2:     false,
		MaxIdleConns:          2000,
		IdleConnTimeout:       90 * time.Second,
		TLSHandshakeTimeout:   30 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
		ResponseHeaderTimeout: 60 * time.Second,
	}

	return &Crawler{
		UserAgent:  "Mozilla/5.0 (compatible; FeedCrawler/1.0; +https://1440.news)",
		db:         db,
		shutdownCh: make(chan struct{}),
		client: &http.Client{
			Timeout:   60 * time.Second,
			Transport: transport,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 10 {
					return fmt.Errorf("stopped after 10 redirects")
				}
				return nil
			},
		},
	}, nil
}

// IsShuttingDown returns true if shutdown has been initiated
func (c *Crawler) IsShuttingDown() bool {
	select {
	case <-c.shutdownCh:
		return true
	default:
		return false
	}
}

// Close signals all goroutines to stop and then closes the database.
func (c *Crawler) Close() error {
	// Signal all goroutines to stop
	close(c.shutdownCh)

	// Give goroutines time to finish current operations
	fmt.Println("Waiting for goroutines to finish...")
	time.Sleep(2 * time.Second)

	if c.db != nil {
		fmt.Println("Closing database...")
		return c.db.Close()
	}
	return nil
}

// StartStatsLoop updates the crawler_stats table every second with per-second rates
func (c *Crawler) StartStatsLoop() {
	var prevFeedsChecked int32
	ticker := time.NewTicker(1 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-c.shutdownCh:
			return
		case <-ticker.C:
			feedsChecked := atomic.LoadInt32(&c.feedsChecked)
			feedsCheckedRate := feedsChecked - prevFeedsChecked
			prevFeedsChecked = feedsChecked

			// Write to database
			_, err := c.db.Exec(`
				INSERT INTO crawler_stats (id, feeds_checked, updated_at)
				VALUES (1, $1, NOW())
				ON CONFLICT (id) DO UPDATE SET
					feeds_checked = $1,
					updated_at = NOW()
			`, feedsCheckedRate)
			if err != nil {
				fmt.Printf("Error updating crawler_stats: %v\n", err)
			}
		}
	}
}
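
// The crawler_stats table is created elsewhere (not in this file). Judging from
// the upsert in StartStatsLoop, its assumed shape is roughly the following;
// ON CONFLICT (id) requires a primary key or unique constraint on id:
//
//	CREATE TABLE IF NOT EXISTS crawler_stats (
//	    id            integer PRIMARY KEY,
//	    feeds_checked integer     NOT NULL,
//	    updated_at    timestamptz NOT NULL
//	);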

// StartCleanupLoop runs item cleanup once per week
func (c *Crawler) StartCleanupLoop() {
	for {
		if c.IsShuttingDown() {
			return
		}
		deleted, err := c.CleanupOldItems()
		if err != nil {
			fmt.Printf("Cleanup error: %v\n", err)
		} else if deleted > 0 {
			fmt.Printf("Cleanup: removed %d old items\n", deleted)
		}
		time.Sleep(7 * 24 * time.Hour)
	}
}

// StartMaintenanceLoop performs periodic database maintenance
func (c *Crawler) StartMaintenanceLoop() {
	vacuumTicker := time.NewTicker(24 * time.Hour)
	analyzeTicker := time.NewTicker(1 * time.Hour)
	defer vacuumTicker.Stop()
	defer analyzeTicker.Stop()

	for {
		select {
		case <-c.shutdownCh:
			// Exit on shutdown like the other loops, instead of running
			// ANALYZE/VACUUM against a database that is about to close
			return
		case <-analyzeTicker.C:
			// Update statistics for query planner
			if _, err := c.db.Exec("ANALYZE"); err != nil {
				fmt.Printf("ANALYZE error: %v\n", err)
			}
		case <-vacuumTicker.C:
			// Reclaim dead tuple space (VACUUM is lighter than VACUUM FULL)
			fmt.Println("Running VACUUM...")
			if _, err := c.db.Exec("VACUUM"); err != nil {
				fmt.Printf("VACUUM error: %v\n", err)
			} else {
				fmt.Println("VACUUM complete")
			}
		}
	}
}

// StartFeedCheckLoop runs the feed_check loop (checking feeds for new items)
func (c *Crawler) StartFeedCheckLoop() {
	numWorkers := 4000

	// Buffered channel for feed work
	workChan := make(chan *Feed, 4000)

	// Start workers
	for i := 0; i < numWorkers; i++ {
		go func() {
			for feed := range workChan {
				c.CheckFeed(feed)
			}
		}()
	}

	const fetchSize = 4000
	for {
		if c.IsShuttingDown() {
			close(workChan)
			return
		}
		feeds, err := c.GetFeedsDueForCheck(fetchSize)
		if err != nil {
			fmt.Printf("Error fetching feeds: %v\n", err)
		}
		if len(feeds) == 0 {
			time.Sleep(1 * time.Second)
			continue
		}
		fmt.Printf("%s feed_check: %d feeds\n", time.Now().Format("15:04:05"), len(feeds))
		for _, feed := range feeds {
			workChan <- feed
		}
		time.Sleep(1 * time.Second)
	}
}
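
CheckFeed, CleanupOldItems, GetFeedsDueForCheck, OpenDatabase, and the DB and Feed types live in other files of this package and are not shown here. As a rough sketch only, the loops above would be wired together by a main function along these lines; the DATABASE_URL environment variable, the signal handling, and the extra os, os/signal, and syscall imports are assumptions, not code from this repository.

func runCrawler() error {
	crawler, err := NewCrawler(os.Getenv("DATABASE_URL")) // assumed connection source
	if err != nil {
		return err
	}

	// Each loop blocks until shutdown, so each runs in its own goroutine.
	go crawler.StartStatsLoop()
	go crawler.StartCleanupLoop()
	go crawler.StartMaintenanceLoop()
	go crawler.StartFeedCheckLoop()

	// Wait for Ctrl-C or SIGTERM, then let Close broadcast on shutdownCh.
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM)
	<-sigCh

	return crawler.Close()
}

One loose end worth noting: StartStatsLoop reads feedsChecked, but nothing in this file increments it; presumably CheckFeed (defined elsewhere) bumps it with atomic.AddInt32(&c.feedsChecked, 1) after each check.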