v100
This commit is contained in:
+21
-25
@@ -32,7 +32,7 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
|
||||
Category: classifyFeed(feedURL),
|
||||
DiscoveredAt: now,
|
||||
LastCrawledAt: now,
|
||||
Status: "active",
|
||||
Status: "pass",
|
||||
SourceHost: sourceHost,
|
||||
TLD: getTLD(sourceHost),
|
||||
ETag: headers.Get("ETag"),
|
||||
@@ -88,7 +88,7 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
||||
Type: feedType,
|
||||
Category: classifyFeed(feedURL),
|
||||
DiscoveredAt: now,
|
||||
Status: "active",
|
||||
Status: "pass",
|
||||
SourceURL: normalizeURL(sourceURL),
|
||||
SourceHost: sourceHost,
|
||||
TLD: getTLD(sourceHost),
|
||||
@@ -149,16 +149,15 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
}
|
||||
now := time.Now()
|
||||
feed.LastCrawledAt = now
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = err.Error()
|
||||
feed.LastErrorAt = now
|
||||
feed.Status = "error"
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.Status = "hold"
|
||||
// Auto-hold feeds after 1000 consecutive failures/no-changes
|
||||
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, err
|
||||
@@ -173,29 +172,28 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
feed.NoUpdate++
|
||||
// Adaptive backoff: 100s base + 100s per consecutive no-change
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.ErrorCount = 0
|
||||
feed.LastError = ""
|
||||
feed.Status = "active"
|
||||
feed.Status = "pass"
|
||||
// Auto-hold feeds after 1000 consecutive no-changes
|
||||
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Non-200 response
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = resp.Status
|
||||
feed.LastErrorAt = now
|
||||
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
|
||||
feed.Status = "dead"
|
||||
} else {
|
||||
feed.Status = "error"
|
||||
}
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.Status = "hold"
|
||||
// Auto-hold feeds after 1000 consecutive failures/no-changes
|
||||
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, nil
|
||||
@@ -204,16 +202,15 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
// 200 OK - feed has new content
|
||||
bodyBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
feed.ErrorCount++
|
||||
feed.NoUpdate++
|
||||
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
|
||||
feed.LastError = err.Error()
|
||||
feed.LastErrorAt = now
|
||||
feed.Status = "error"
|
||||
// Auto-hold feeds that fail 100+ times
|
||||
if feed.ErrorCount >= 100 && feed.PublishStatus == "pass" {
|
||||
feed.Status = "hold"
|
||||
// Auto-hold feeds after 1000 consecutive failures/no-changes
|
||||
if feed.NoUpdate >= 1000 && feed.PublishStatus == "pass" {
|
||||
feed.PublishStatus = "hold"
|
||||
fmt.Printf("Feed auto-held after %d errors: %s\n", feed.ErrorCount, feed.URL)
|
||||
fmt.Printf("Feed auto-held after %d no-updates: %s\n", feed.NoUpdate, feed.URL)
|
||||
}
|
||||
c.saveFeed(feed)
|
||||
return false, err
|
||||
@@ -242,9 +239,8 @@ func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
|
||||
// Content changed - reset backoff
|
||||
feed.NoUpdate = 0
|
||||
feed.NextCrawlAt = now.Add(100 * time.Second)
|
||||
feed.ErrorCount = 0
|
||||
feed.LastError = ""
|
||||
feed.Status = "active"
|
||||
feed.Status = "pass"
|
||||
c.saveFeed(feed)
|
||||
|
||||
// Save items
|
||||
|
||||
Reference in New Issue
Block a user