primal
2026-01-30 22:35:08 -05:00
parent f49fc2f0ad
commit be595cb403
14 changed files with 341 additions and 544 deletions
@@ -163,9 +163,6 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
 	feed.Description = ch.Description
 	feed.Language = ch.Language
 	feed.SiteURL = normalizeURL(ch.Link)
-	feed.TTLMinutes = ch.TTL
-	feed.UpdatePeriod = ch.UpdatePeriod
-	feed.UpdateFreq = ch.UpdateFreq
 	feed.ItemCount = len(ch.Items)
 	// Detect podcast
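
Note: the three deleted assignments were the consumers of the publisher-supplied scheduling hints — RSS 2.0's <ttl> element (minutes) and the syndication module's sy:updatePeriod / sy:updateFrequency. A minimal sketch of how a channel struct could expose them via encoding/xml; the struct name and tag layout here are assumptions, since only ch.TTL, ch.UpdatePeriod, and ch.UpdateFreq are visible in the diff:

	// Hypothetical channel struct, not this project's actual type.
	type rssChannel struct {
		TTL          int    `xml:"ttl"` // RSS 2.0 <ttl>, minutes between refreshes
		UpdatePeriod string `xml:"http://purl.org/rss/1.0/modules/syndication/ updatePeriod"`
		UpdateFreq   int    `xml:"http://purl.org/rss/1.0/modules/syndication/ updateFrequency"`
	}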
@@ -251,10 +248,6 @@ func (c *Crawler) parseRSSMetadata(body string, feed *Feed) []*Item {
 		feed.OldestItemDate = oldest
 		feed.NewestItemDate = newest
-		if len(dates) > 1 {
-			totalHours := newest.Sub(oldest).Hours()
-			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
-		}
 	}
 	return items
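
The same average-interval calculation is deleted again in the Atom and JSON Feed parsers below. As a standalone sketch of what it computed (the function name and packaging are mine): with n item dates, the span newest−oldest contains n−1 gaps, so e.g. five items over 48 hours average 12 hours between posts.

	package sketch

	import "time"

	// avgPostFreqHrs reproduces the removed logic: mean gap between posts,
	// in hours, as total span divided by the number of gaps (n-1).
	func avgPostFreqHrs(dates []time.Time) float64 {
		if len(dates) < 2 {
			return 0
		}
		oldest, newest := dates[0], dates[0]
		for _, d := range dates[1:] {
			if d.Before(oldest) {
				oldest = d
			}
			if d.After(newest) {
				newest = d
			}
		}
		return newest.Sub(oldest).Hours() / float64(len(dates)-1)
	}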
@@ -367,10 +360,6 @@ func (c *Crawler) parseAtomMetadata(body string, feed *Feed) []*Item {
 		feed.OldestItemDate = oldest
 		feed.NewestItemDate = newest
-		if len(dates) > 1 {
-			totalHours := newest.Sub(oldest).Hours()
-			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
-		}
 	}
 	return items
@@ -399,48 +388,8 @@ func parseRSSDate(s string) (time.Time, error) {
 // calculateNextCrawl determines when to next crawl this feed
 func (c *Crawler) calculateNextCrawl(feed *Feed) time.Time {
-	now := time.Now()
-	// If TTL is specified, use it
-	if feed.TTLMinutes > 0 {
-		return now.Add(time.Duration(feed.TTLMinutes) * time.Minute)
-	}
-	// If updatePeriod is specified
-	if feed.UpdatePeriod != "" {
-		freq := feed.UpdateFreq
-		if freq == 0 {
-			freq = 1
-		}
-		switch strings.ToLower(feed.UpdatePeriod) {
-		case "hourly":
-			return now.Add(time.Duration(freq) * time.Hour)
-		case "daily":
-			return now.Add(time.Duration(freq) * 24 * time.Hour)
-		case "weekly":
-			return now.Add(time.Duration(freq) * 7 * 24 * time.Hour)
-		case "monthly":
-			return now.Add(time.Duration(freq) * 30 * 24 * time.Hour)
-		case "yearly":
-			return now.Add(time.Duration(freq) * 365 * 24 * time.Hour)
-		}
-	}
-	// If we have average post frequency, use that
-	if feed.AvgPostFreqHrs > 0 {
-		// Crawl at half the average frequency, but at least every hour and at most once per day
-		crawlInterval := feed.AvgPostFreqHrs / 2
-		if crawlInterval < 1 {
-			crawlInterval = 1
-		}
-		if crawlInterval > 24 {
-			crawlInterval = 24
-		}
-		return now.Add(time.Duration(crawlInterval * float64(time.Hour)))
-	}
-	// Default: crawl every 6 hours
-	return now.Add(6 * time.Hour)
+	// Adaptive backoff: 100s base + 100s per consecutive no-change
+	return time.Now().Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
 }
 // extractItemImages extracts image URLs from an RSS item
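
The replacement collapses the whole TTL / updatePeriod / average-frequency ladder into a single linear backoff. A hedged sketch of the policy as committed, assuming feed.NoUpdate is an int counting consecutive crawls that found no new items (its maintenance is not shown in this hunk):

	package sketch

	import "time"

	// nextCrawl mirrors the added line: 100s after a change, then +100s per
	// unchanged crawl — 200s, 300s, ... with no upper bound in this diff.
	func nextCrawl(noUpdate int) time.Time {
		return time.Now().Add(time.Duration(100+100*noUpdate) * time.Second)
	}

So a feed that just changed is re-checked after 100 s, and after ten unchanged crawls the interval has grown to 1100 s (about 18 min). Nothing in this hunk caps the growth, so quiet feeds keep backing off until a change presumably resets the counter elsewhere.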
@@ -661,10 +610,6 @@ func (c *Crawler) parseJSONFeedMetadata(body string, feed *Feed) []*Item {
 		feed.OldestItemDate = oldest
 		feed.NewestItemDate = newest
-		if len(dates) > 1 {
-			totalHours := newest.Sub(oldest).Hours()
-			feed.AvgPostFreqHrs = totalHours / float64(len(dates)-1)
-		}
 	}
 	return items