This commit is contained in:
primal
2026-01-30 22:35:08 -05:00
parent f49fc2f0ad
commit be595cb403
14 changed files with 341 additions and 544 deletions
+21 -25
View File
@@ -32,7 +32,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
Category: classifyFeed(feedURL),
DiscoveredAt: now,
LastCrawledAt: now,
Status: "active",
Status: "pass",
SourceHost: sourceHost,
TLD: getTLD(sourceHost),
ETag: headers.Get("ETag"),
@@ -88,7 +88,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
Type: feedType,
Category: classifyFeed(feedURL),
DiscoveredAt: now,
Status: "active",
Status: "pass",
SourceURL: normalizeURL(sourceURL),
SourceHost: sourceHost,
TLD: getTLD(sourceHost),
@@ -149,16 +149,15 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
}
now := time.Now()
feed.LastCrawledAt = now
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.Status = "hold"
// Auto-hold feeds after 1000 consecutive failures/no-changes
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL)
}
c.saveFeed(feed)
return false, err
@@ -173,29 +172,28 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
feed.NoUpdate++
// Adaptive backoff: 100s base + 100s per consecutive no-change
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
feed.Status = "pass"
// Auto-hold feeds after 1000 consecutive no-changes
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL)
}
c.saveFeed(feed)
return false, nil
}
// Non-200 response
if resp.StatusCode != http.StatusOK {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = resp.Status
feed.LastErrorAt = now
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
feed.Status = "dead"
} else {
feed.Status = "error"
}
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.Status = "hold"
// Auto-hold feeds after 1000 consecutive failures/no-changes
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL)
}
c.saveFeed(feed)
return false, nil
@@ -204,16 +202,15 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
// 200 OK - feed has new content
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
// Auto-hold feeds that fail 100+ times
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
feed.Status = "hold"
// Auto-hold feeds after 1000 consecutive failures/no-changes
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" {
feed.PublishStatus = "hold"
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL)
}
c.saveFeed(feed)
return false, err
@@ -242,9 +239,8 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
// Content changed - reset backoff
feed.NoUpdate = 0
feed.NextCrawlAt = now.Add(100 * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
feed.Status = "pass"
c.saveFeed(feed)
// Save items