Add Docker support and refactor data layer

primal
2026-01-26 16:02:05 -05:00
parent 398e7b3969
commit 143807378f
12 changed files with 2642 additions and 518 deletions
@@ -1,15 +1,86 @@
package main
import (
"encoding/json"
"database/sql"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"sync/atomic"
"time"
"github.com/cockroachdb/pebble"
)
// shouldSkipFeed checks if a feed URL should be filtered out
// Returns true (and a reason) if the feed should be skipped
func shouldSkipFeed(feedURL string) (bool, string) {
lower := strings.ToLower(feedURL)
// Skip explicit comment feeds
if strings.Contains(lower, "/comment") {
return true, "comment feed"
}
u, err := url.Parse(feedURL)
if err != nil {
return false, ""
}
path := strings.ToLower(strings.TrimSuffix(u.Path, "/"))
// Skip category/tag feeds
categoryPatterns := []string{"/category/", "/tag/", "/tags/", "/categories/", "/topic/", "/topics/", "/author/"}
for _, pattern := range categoryPatterns {
if strings.Contains(path, pattern) {
return true, "category/tag feed"
}
}
// Check for article comment feeds (path ending in /feed with content before it)
if strings.HasSuffix(path, "/feed") {
basePath := strings.TrimSuffix(path, "/feed")
basePath = strings.Trim(basePath, "/")
if basePath == "" {
return false, "" // Just /feed - legitimate main feed
}
// Skip if path contains date patterns (likely article)
if matched, _ := regexp.MatchString(`/\d{4}/\d{2}`, basePath); matched {
return true, "article feed (date pattern)"
}
// Skip if path has multiple segments (likely article or nested content)
segments := strings.Split(basePath, "/")
if len(segments) >= 2 {
return true, "article feed (nested path)"
}
// Skip if single segment looks like an article slug (contains hyphens, is long)
if len(segments) == 1 && (strings.Contains(segments[0], "-") && len(segments[0]) > 20) {
return true, "article feed (slug pattern)"
}
}
return false, ""
}
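For reference, a few hypothetical URLs and how the filter above classifies them (illustrative sketch only; the function name and sample URLs below are not from the commit):

func exampleShouldSkipFeed() {
	samples := []string{
		"https://example.com/feed",                        // kept: bare /feed is treated as the main feed
		"https://example.com/comments/feed",               // skipped: comment feed
		"https://example.com/category/tech/feed",          // skipped: category/tag feed
		"https://example.com/blog/2024/05/some-post/feed", // skipped: article feed (date pattern)
	}
	for _, u := range samples {
		skip, reason := shouldSkipFeed(u)
		fmt.Printf("%-50s skip=%v reason=%q\n", u, skip, reason)
	}
}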
// Item represents an individual entry/article from a feed
type Item struct {
ID int64 `json:"id,omitempty"`
FeedURL string `json:"feed_url"`
GUID string `json:"guid,omitempty"`
Title string `json:"title,omitempty"`
Link string `json:"link,omitempty"`
Description string `json:"description,omitempty"`
Content string `json:"content,omitempty"`
Author string `json:"author,omitempty"`
PubDate time.Time `json:"pub_date,omitempty"`
DiscoveredAt time.Time `json:"discovered_at"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
}
// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
URL string `json:"url"`
@@ -50,99 +121,548 @@ type Feed struct {
AvgPostFreqHrs float64 `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
NewestItemDate time.Time `json:"newest_item_date,omitempty"`
// Adaptive check interval
NoUpdate int `json:"no_update"` // Consecutive checks with no change
}
// saveFeed stores a feed in SQLite
func (c *Crawler) saveFeed(feed *Feed) error {
_, err := c.db.Exec(`
INSERT INTO feeds (
url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
type = excluded.type,
title = excluded.title,
description = excluded.description,
language = excluded.language,
siteUrl = excluded.siteUrl,
lastCrawledAt = excluded.lastCrawledAt,
nextCrawlAt = excluded.nextCrawlAt,
lastBuildDate = excluded.lastBuildDate,
etag = excluded.etag,
lastModified = excluded.lastModified,
ttlMinutes = excluded.ttlMinutes,
updatePeriod = excluded.updatePeriod,
updateFreq = excluded.updateFreq,
status = excluded.status,
errorCount = excluded.errorCount,
lastError = excluded.lastError,
lastErrorAt = excluded.lastErrorAt,
itemCount = excluded.itemCount,
avgPostFreqHrs = excluded.avgPostFreqHrs,
oldestItemDate = excluded.oldestItemDate,
newestItemDate = excluded.newestItemDate,
noUpdate = excluded.noUpdate
`,
feed.URL, feed.Type, nullString(feed.Title), nullString(feed.Description),
nullString(feed.Language), nullString(feed.SiteURL),
feed.DiscoveredAt, nullTime(feed.LastCrawledAt), nullTime(feed.NextCrawlAt), nullTime(feed.LastBuildDate),
nullString(feed.ETag), nullString(feed.LastModified),
feed.TTLMinutes, nullString(feed.UpdatePeriod), feed.UpdateFreq,
feed.Status, feed.ErrorCount, nullString(feed.LastError), nullTime(feed.LastErrorAt),
nullString(feed.SourceURL), nullString(feed.SourceHost), nullString(feed.TLD),
feed.ItemCount, feed.AvgPostFreqHrs, nullTime(feed.OldestItemDate), nullTime(feed.NewestItemDate),
feed.NoUpdate,
)
return err
}
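The nullString and nullTime helpers used above are not part of this hunk; presumably they map Go zero values to SQL NULL so empty strings and zero times are stored as NULL rather than as "" or 0001-01-01. A minimal sketch under that assumption:

// Sketch only: assumed behavior of the helpers referenced in saveFeed/saveItem.
func nullString(s string) interface{} {
	if s == "" {
		return nil // driver writes NULL
	}
	return s
}

func nullTime(t time.Time) interface{} {
	if t.IsZero() {
		return nil // driver writes NULL
	}
	return t
}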
// getFeed retrieves a feed from SQLite
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
feed := &Feed{}
var title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
err := c.db.QueryRow(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds WHERE url = ?
`, normalizeURL(feedURL)).Scan(
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
)
if err == sql.ErrNoRows {
return nil, nil
}
if err != nil {
return nil, err
}
// Handle nullable fields
if title.Valid {
feed.Title = title.String
}
if description.Valid {
feed.Description = description.String
}
if language.Valid {
feed.Language = language.String
}
if siteURL.Valid {
feed.SiteURL = siteURL.String
}
if lastCrawledAt.Valid {
feed.LastCrawledAt = lastCrawledAt.Time
}
if nextCrawlAt.Valid {
feed.NextCrawlAt = nextCrawlAt.Time
}
if lastBuildDate.Valid {
feed.LastBuildDate = lastBuildDate.Time
}
if etag.Valid {
feed.ETag = etag.String
}
if lastModified.Valid {
feed.LastModified = lastModified.String
}
if updatePeriod.Valid {
feed.UpdatePeriod = updatePeriod.String
}
if lastError.Valid {
feed.LastError = lastError.String
}
if lastErrorAt.Valid {
feed.LastErrorAt = lastErrorAt.Time
}
if sourceURL.Valid {
feed.SourceURL = sourceURL.String
}
if sourceHost.Valid {
feed.SourceHost = sourceHost.String
}
if tld.Valid {
feed.TLD = tld.String
}
if avgPostFreqHrs.Valid {
feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
feed.OldestItemDate = oldestItemDate.Time
}
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
return feed, nil
}
// feedExists checks if a feed URL already exists in the database
func (c *Crawler) feedExists(feedURL string) bool {
var exists bool
err := c.db.QueryRow("SELECT EXISTS(SELECT 1 FROM feeds WHERE url = ?)", normalizeURL(feedURL)).Scan(&exists)
return err == nil && exists
}
// GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds
`)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// GetFeedCount returns the total number of feeds in the database
func (c *Crawler) GetFeedCount() (int, error) {
var count int
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds").Scan(&count)
return count, err
}
// GetFeedCountByHost returns the number of feeds for a specific host
func (c *Crawler) GetFeedCountByHost(host string) (int, error) {
var count int
err := c.db.QueryRow("SELECT COUNT(*) FROM feeds WHERE sourceHost = ?", host).Scan(&count)
return count, err
}
// GetFeedsDueForCheck returns feeds where nextCrawlAt <= now, ordered randomly, limited to n
func (c *Crawler) GetFeedsDueForCheck(limit int) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds
WHERE nextCrawlAt <= datetime('now') AND status != 'dead'
ORDER BY RANDOM()
LIMIT ?
`, limit)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// GetFeedsByHost returns all feeds from a specific host
func (c *Crawler) GetFeedsByHost(host string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT url, type, title, description, language, siteUrl,
discoveredAt, lastCrawledAt, nextCrawlAt, lastBuildDate,
etag, lastModified,
ttlMinutes, updatePeriod, updateFreq,
status, errorCount, lastError, lastErrorAt,
sourceUrl, sourceHost, tld,
itemCount, avgPostFreqHrs, oldestItemDate, newestItemDate,
noUpdate
FROM feeds WHERE sourceHost = ?
`, host)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
// SearchFeeds performs a full-text search on feeds
func (c *Crawler) SearchFeeds(query string) ([]*Feed, error) {
rows, err := c.db.Query(`
SELECT f.url, f.type, f.title, f.description, f.language, f.siteUrl,
f.discoveredAt, f.lastCrawledAt, f.nextCrawlAt, f.lastBuildDate,
f.etag, f.lastModified,
f.ttlMinutes, f.updatePeriod, f.updateFreq,
f.status, f.errorCount, f.lastError, f.lastErrorAt,
f.sourceUrl, f.sourceHost, f.tld,
f.itemCount, f.avgPostFreqHrs, f.oldestItemDate, f.newestItemDate,
f.noUpdate
FROM feeds f
JOIN feeds_fts fts ON f.rowid = fts.rowid
WHERE feeds_fts MATCH ?
`, query)
if err != nil {
return nil, err
}
defer rows.Close()
return scanFeeds(rows)
}
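feeds_fts is not defined in this file; the MATCH query above assumes an SQLite FTS5 table whose rowid tracks the feeds table. One plausible shape, using external content and an insert trigger (a sketch, not the commit's actual schema; column choice and trigger name are guesses, and matching update/delete triggers would also be needed):

const feedsFTSSketch = `
CREATE VIRTUAL TABLE IF NOT EXISTS feeds_fts USING fts5(
    title, description,
    content='feeds'
);
CREATE TRIGGER IF NOT EXISTS feeds_fts_insert AFTER INSERT ON feeds BEGIN
    INSERT INTO feeds_fts(rowid, title, description)
    VALUES (new.rowid, new.title, new.description);
END;
`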
// scanFeeds is a helper to scan multiple feed rows
func scanFeeds(rows *sql.Rows) ([]*Feed, error) {
var feeds []*Feed
for rows.Next() {
feed := &Feed{}
var title, description, language, siteURL sql.NullString
var lastCrawledAt, nextCrawlAt, lastBuildDate, lastErrorAt, oldestItemDate, newestItemDate sql.NullTime
var etag, lastModified, updatePeriod, lastError, sourceURL, sourceHost, tld sql.NullString
var avgPostFreqHrs sql.NullFloat64
if err := rows.Scan(
&feed.URL, &feed.Type, &title, &description, &language, &siteURL,
&feed.DiscoveredAt, &lastCrawledAt, &nextCrawlAt, &lastBuildDate,
&etag, &lastModified,
&feed.TTLMinutes, &updatePeriod, &feed.UpdateFreq,
&feed.Status, &feed.ErrorCount, &lastError, &lastErrorAt,
&sourceURL, &sourceHost, &tld,
&feed.ItemCount, &avgPostFreqHrs, &oldestItemDate, &newestItemDate,
&feed.NoUpdate,
); err != nil {
continue
}
// Handle nullable fields
if title.Valid {
feed.Title = title.String
}
if description.Valid {
feed.Description = description.String
}
if language.Valid {
feed.Language = language.String
}
if siteURL.Valid {
feed.SiteURL = siteURL.String
}
if lastCrawledAt.Valid {
feed.LastCrawledAt = lastCrawledAt.Time
}
if nextCrawlAt.Valid {
feed.NextCrawlAt = nextCrawlAt.Time
}
if lastBuildDate.Valid {
feed.LastBuildDate = lastBuildDate.Time
}
if etag.Valid {
feed.ETag = etag.String
}
if lastModified.Valid {
feed.LastModified = lastModified.String
}
if updatePeriod.Valid {
feed.UpdatePeriod = updatePeriod.String
}
if lastError.Valid {
feed.LastError = lastError.String
}
if lastErrorAt.Valid {
feed.LastErrorAt = lastErrorAt.Time
}
if sourceURL.Valid {
feed.SourceURL = sourceURL.String
}
if sourceHost.Valid {
feed.SourceHost = sourceHost.String
}
if tld.Valid {
feed.TLD = tld.String
}
if avgPostFreqHrs.Valid {
feed.AvgPostFreqHrs = avgPostFreqHrs.Float64
}
if oldestItemDate.Valid {
feed.OldestItemDate = oldestItemDate.Time
}
if newestItemDate.Valid {
feed.NewestItemDate = newestItemDate.Time
}
feeds = append(feeds, feed)
}
return feeds, rows.Err()
}
// saveItem stores an item in SQLite (upsert by feedUrl + guid)
func (c *Crawler) saveItem(item *Item) error {
_, err := c.db.Exec(`
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
description = excluded.description,
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
updatedAt = excluded.updatedAt
`,
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
)
return err
}
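The items table itself is created elsewhere in the commit; a sketch of a schema consistent with the upsert above and the Item struct (column types are assumptions; the ON CONFLICT(feedUrl, guid) target needs a matching unique constraint):

const itemsSchemaSketch = `
CREATE TABLE IF NOT EXISTS items (
    id           INTEGER PRIMARY KEY AUTOINCREMENT,
    feedUrl      TEXT NOT NULL,
    guid         TEXT NOT NULL,
    title        TEXT,
    link         TEXT,
    description  TEXT,
    content      TEXT,
    author       TEXT,
    pubDate      TIMESTAMP,
    discoveredAt TIMESTAMP NOT NULL,
    updatedAt    TIMESTAMP,
    UNIQUE(feedUrl, guid)
);
`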
// saveItems stores multiple items efficiently
func (c *Crawler) saveItems(items []*Item) error {
if len(items) == 0 {
return nil
}
tx, err := c.db.Begin()
if err != nil {
return err
}
defer tx.Rollback()
stmt, err := tx.Prepare(`
INSERT INTO items (feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(feedUrl, guid) DO UPDATE SET
title = excluded.title,
link = excluded.link,
description = excluded.description,
content = excluded.content,
author = excluded.author,
pubDate = excluded.pubDate,
updatedAt = excluded.updatedAt
`)
if err != nil {
return err
}
defer stmt.Close()
for _, item := range items {
if item == nil || item.GUID == "" {
continue // Skip nil items or items without GUID
}
_, err := stmt.Exec(
item.FeedURL, item.GUID, nullString(item.Title), nullString(item.Link),
nullString(item.Description), nullString(item.Content), nullString(item.Author),
nullTime(item.PubDate), item.DiscoveredAt, nullTime(item.UpdatedAt),
)
if err != nil {
continue // Skip failed items
}
}
return tx.Commit()
}
// GetItemsByFeed returns all items for a specific feed
func (c *Crawler) GetItemsByFeed(feedURL string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT id, feedUrl, guid, title, link, description, content, author, pubDate, discoveredAt, updatedAt
FROM items
WHERE feedUrl = ?
ORDER BY pubDate DESC
LIMIT ?
`, feedURL, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
var pubDate, updatedAt sql.NullTime
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
); err != nil {
continue
}
if guid.Valid {
item.GUID = guid.String
}
if title.Valid {
item.Title = title.String
}
if link.Valid {
item.Link = link.String
}
if description.Valid {
item.Description = description.String
}
if content.Valid {
item.Content = content.String
}
if author.Valid {
item.Author = author.String
}
if pubDate.Valid {
item.PubDate = pubDate.Time
}
if updatedAt.Valid {
item.UpdatedAt = updatedAt.Time
}
items = append(items, item)
}
return items, rows.Err()
}
// SearchItems performs a full-text search on items
func (c *Crawler) SearchItems(query string, limit int) ([]*Item, error) {
rows, err := c.db.Query(`
SELECT i.id, i.feedUrl, i.guid, i.title, i.link, i.description, i.content, i.author, i.pubDate, i.discoveredAt, i.updatedAt
FROM items i
JOIN items_fts fts ON i.id = fts.rowid
WHERE items_fts MATCH ?
ORDER BY i.pubDate DESC
LIMIT ?
`, query, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var items []*Item
for rows.Next() {
item := &Item{}
var guid, title, link, description, content, author sql.NullString
var pubDate, updatedAt sql.NullTime
if err := rows.Scan(
&item.ID, &item.FeedURL, &guid, &title, &link,
&description, &content, &author, &pubDate,
&item.DiscoveredAt, &updatedAt,
); err != nil {
continue
}
if guid.Valid {
item.GUID = guid.String
}
if title.Valid {
item.Title = title.String
}
if link.Valid {
item.Link = link.String
}
if description.Valid {
item.Description = description.String
}
if content.Valid {
item.Content = content.String
}
if author.Valid {
item.Author = author.String
}
if pubDate.Valid {
item.PubDate = pubDate.Time
}
if updatedAt.Valid {
item.UpdatedAt = updatedAt.Time
}
items = append(items, item)
}
return items, rows.Err()
}
// CleanupOldItems removes items older than 12 months
func (c *Crawler) CleanupOldItems() (int64, error) {
cutoff := time.Now().AddDate(-1, 0, 0) // 12 months ago
result, err := c.db.Exec(`
DELETE FROM items WHERE pubDate < ? AND pubDate IS NOT NULL
`, cutoff)
if err != nil {
return 0, err
}
return result.RowsAffected()
}
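A hypothetical way to wire the retention pass into the crawler; the method name, daily interval, and log output are assumptions, not part of this commit:

// Sketch: run the 12-month retention cleanup once a day in the background.
func (c *Crawler) startCleanupLoop() {
	go func() {
		ticker := time.NewTicker(24 * time.Hour)
		defer ticker.Stop()
		for range ticker.C {
			if n, err := c.CleanupOldItems(); err == nil {
				fmt.Printf("cleanup: removed %d items older than 12 months\n", n)
			}
		}
	}()
}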
// processFeed parses and stores a feed with full metadata
@@ -179,12 +699,13 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
LastModified: headers.Get("Last-Modified"),
}
// Parse feed-specific metadata and items
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
}
// Calculate next crawl time
@@ -193,11 +714,17 @@ func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Hea
if err := c.saveFeed(feed); err != nil {
return
}
// Save items
if len(items) > 0 {
c.saveItems(items)
}
}
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
// Skip comment, category, and article feeds
if skip, _ := shouldSkipFeed(feedURL); skip {
return
}
@@ -231,3 +758,141 @@ func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
return
}
}
// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: changed (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
atomic.AddInt32(&c.feedsChecked, 1)
// Try different scheme/www combinations since we store URLs without scheme
urlVariants := []string{
"https://" + feed.URL,
"http://" + feed.URL,
"https://www." + feed.URL,
"http://www." + feed.URL,
}
var resp *http.Response
var err error
var successURL string
for _, tryURL := range urlVariants {
req, reqErr := http.NewRequest("GET", tryURL, nil)
if reqErr != nil {
continue
}
req.Header.Set("User-Agent", c.UserAgent)
// Add conditional headers if we have them
if feed.ETag != "" {
req.Header.Set("If-None-Match", feed.ETag)
}
if feed.LastModified != "" {
req.Header.Set("If-Modified-Since", feed.LastModified)
}
resp, err = c.client.Do(req)
if err == nil {
successURL = tryURL
break
}
}
_ = successURL // May be used later for logging/debugging
// If no request succeeded, resp will be nil
if resp == nil {
if err == nil {
err = fmt.Errorf("all URL variants failed")
}
now := time.Now()
feed.LastCrawledAt = now
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
c.saveFeed(feed)
return false, err
}
defer resp.Body.Close()
now := time.Now()
feed.LastCrawledAt = now
// 304 Not Modified - feed hasn't changed
if resp.StatusCode == http.StatusNotModified {
feed.NoUpdate++
// Adaptive backoff: 100s base + 100s per consecutive no-change
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
return false, nil
}
// Non-200 response
if resp.StatusCode != http.StatusOK {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = resp.Status
feed.LastErrorAt = now
if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusGone {
feed.Status = "dead"
} else {
feed.Status = "error"
}
c.saveFeed(feed)
return false, nil
}
// 200 OK - feed has new content
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
feed.ErrorCount++
feed.NoUpdate++
feed.NextCrawlAt = now.Add(time.Duration(100+100*feed.NoUpdate) * time.Second)
feed.LastError = err.Error()
feed.LastErrorAt = now
feed.Status = "error"
c.saveFeed(feed)
return false, err
}
body := string(bodyBytes)
// Update cache headers
feed.ETag = resp.Header.Get("ETag")
feed.LastModified = resp.Header.Get("Last-Modified")
// Re-detect type and parse metadata
feedType := c.detectFeedType(body)
feed.Type = feedType
var items []*Item
switch feedType {
case "rss":
items = c.parseRSSMetadata(body, feed)
case "atom":
items = c.parseAtomMetadata(body, feed)
}
// Content changed - reset backoff
feed.NoUpdate = 0
feed.NextCrawlAt = now.Add(100 * time.Second)
feed.ErrorCount = 0
feed.LastError = ""
feed.Status = "active"
c.saveFeed(feed)
// Save items
if len(items) > 0 {
c.saveItems(items)
}
return true, nil
}
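Putting the pieces together, a hypothetical recheck pass could combine GetFeedsDueForCheck with CheckFeed; the function name and batch size below are illustrative, not from the commit:

func (c *Crawler) recheckDueFeeds() error {
	feeds, err := c.GetFeedsDueForCheck(100) // batch size is an assumption
	if err != nil {
		return err
	}
	for _, f := range feeds {
		changed, err := c.CheckFeed(f)
		if err != nil {
			continue // CheckFeed already recorded the error state on the feed
		}
		if changed {
			fmt.Printf("updated: %s\n", f.URL)
		}
	}
	return nil
}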