crawler/feed_check.go

package main

import (
	"fmt"
	"io"
	"net/http"
	"sync/atomic"
	"time"
)
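
// Crawler, Feed and Item, along with helpers such as detectFeedType,
// normalizeURL, getDomainHost, isSpam, saveFeed and saveItems, are defined
// elsewhere in the package; this file only references them. The parse*Metadata
// helpers are expected to populate feed metadata (including Language) and
// return the feed's items.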

// processFeed parses and stores a feed with full metadata.
// It is called during discovery - all feeds start as STANDBY.
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	feedType := c.detectFeedType(body)
	host := getDomainHost(sourceHost) // for spam check

	feed := &Feed{
		URL:          normalizeURL(feedURL),
		Type:         feedType,
		ETag:         headers.Get("ETag"),
		LastModified: headers.Get("Last-Modified"),
		MissCount:    0,
	}

	// Parse feed-specific metadata and items
	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	}

	// Determine status based on the spam check
	if isSpam(host, feed.Language, feedType) {
		feed.Status = "IGNORE"
	} else {
		feed.Status = "STANDBY" // all non-spam feeds start as STANDBY for testing
	}

	if err := c.saveFeed(feed); err != nil {
		return
	}

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}
}
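
// processDiscoveredFeed is an illustrative sketch only, not part of the
// original file: it shows the shape of a discovery-side call into processFeed,
// assuming the caller has already fetched a candidate feed URL. The function
// name and signature are assumptions.
func (c *Crawler) processDiscoveredFeed(feedURL, sourceHost string, resp *http.Response) {
	defer resp.Body.Close()
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return
	}
	// processFeed handles dedup, type detection, the spam check and persistence
	c.processFeed(feedURL, sourceHost, string(bodyBytes), resp.Header)
}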

// CheckFeed performs a conditional request (GET with If-None-Match /
// If-Modified-Since) to check whether a feed has been updated.
// It returns whether new items were found, plus any request error.
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
	atomic.AddInt32(&c.feedsChecked, 1)

	// Try different scheme/www combinations since we store URLs without a scheme
	urlVariants := []string{
		"https://" + feed.URL,
		"http://" + feed.URL,
		"https://www." + feed.URL,
		"http://www." + feed.URL,
	}

	var resp *http.Response
	var err error
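
	// Try each variant in order; the first one that yields a response (of any
	// status) wins, while err keeps the most recent failure.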
	for _, tryURL := range urlVariants {
		req, reqErr := http.NewRequest("GET", tryURL, nil)
		if reqErr != nil {
			continue
		}
		req.Header.Set("User-Agent", c.UserAgent)

		// Add conditional headers if we have them so the server can answer 304
		if feed.ETag != "" {
			req.Header.Set("If-None-Match", feed.ETag)
		}
		if feed.LastModified != "" {
			req.Header.Set("If-Modified-Since", feed.LastModified)
		}

		resp, err = c.client.Do(req)
		if err == nil {
			break
		}
	}
	now := time.Now()
	feed.LastCheckedAt = now

	// If no request succeeded, treat it as a miss
	if resp == nil {
		if err == nil {
			err = fmt.Errorf("all URL variants failed")
		}
		feed.MissCount++
		feed.LastError = err.Error()
		c.saveFeed(feed)
		return false, err
	}
	defer resp.Body.Close()

	// 304 Not Modified - nothing new; this still counts as a miss, but any
	// previous error is cleared
	if resp.StatusCode == http.StatusNotModified {
		feed.MissCount++
		feed.LastError = ""
		c.saveFeed(feed)
		return false, nil
	}

	// Non-200 response - treat as a miss and record the status text
	if resp.StatusCode != http.StatusOK {
		feed.MissCount++
		feed.LastError = resp.Status
		c.saveFeed(feed)
		return false, nil
	}

	// 200 OK - read the body
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		feed.MissCount++
		feed.LastError = err.Error()
		c.saveFeed(feed)
		return false, err
	}
	body := string(bodyBytes)

	// Update cache headers for the next conditional request
	feed.ETag = resp.Header.Get("ETag")
	feed.LastModified = resp.Header.Get("Last-Modified")

	// Re-detect the type and parse metadata. This handles feeds imported with
	// type="html" that need content sniffing.
	feedType := c.detectFeedType(body)
	if feedType != "unknown" {
		feed.Type = feedType // update the type if we detected a real feed
	}

	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	case "unknown":
		// Not a feed - mark as IGNORE if it was imported as html. Return here
		// so the no-items path below does not wipe LastError.
		if feed.Type == "html" {
			feed.Status = "IGNORE"
			feed.LastError = "not a feed (html content)"
			feed.MissCount++
			c.saveFeed(feed)
			return false, nil
		}
	}
	// Save items and check if we got any
	if len(items) > 0 {
		if err := c.saveItems(items); err != nil {
			feed.MissCount++
			feed.LastError = err.Error()
			c.saveFeed(feed)
			return false, err
		}
		// Items found - reset miss count
		feed.MissCount = 0
		feed.LastError = ""
		c.saveFeed(feed)
		return true, nil
	}

	// No items - increment miss count
	feed.MissCount++
	feed.LastError = ""
	c.saveFeed(feed)
	return false, nil
}
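
// checkFeeds is an illustrative sketch only, not part of the original file: it
// shows one way a scheduler loop might consume CheckFeed's (newItems, error)
// return together with MissCount. The function name, the feeds argument and
// the miss threshold of 10 are all assumptions, not values from this codebase.
func (c *Crawler) checkFeeds(feeds []*Feed) {
	for _, feed := range feeds {
		gotNew, err := c.CheckFeed(feed)
		if err != nil {
			// CheckFeed has already recorded the error and the miss on the feed
			continue
		}
		if gotNew {
			// New items were saved and MissCount was reset inside CheckFeed
			continue
		}
		// Repeated misses could be used to demote a feed (threshold assumed)
		if feed.MissCount >= 10 {
			feed.Status = "STANDBY"
			c.saveFeed(feed)
		}
	}
}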