package main

import (
	"fmt"
	"io"
	"net/http"
	"sync/atomic"
	"time"
)

// processFeed parses and stores a feed with full metadata.
// It is called during discovery - non-spam feeds start as STANDBY, spam feeds
// are marked IGNORE.
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	feedType := c.detectFeedType(body)
	host := getDomainHost(sourceHost) // for spam check

	feed := &Feed{
		URL:          normalizeURL(feedURL),
		Type:         feedType,
		ETag:         headers.Get("ETag"),
		LastModified: headers.Get("Last-Modified"),
		MissCount:    0,
	}

	// Parse feed-specific metadata and items
	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	}

	// Determine status based on spam check
	if isSpam(host, feed.Language, feedType) {
		feed.Status = "IGNORE"
	} else {
		feed.Status = "STANDBY" // All non-spam start as STANDBY for testing
	}

	if err := c.saveFeed(feed); err != nil {
		return
	}

	// Save items
	if len(items) > 0 {
		c.saveItems(items)
	}
}
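
// fetchAndProcess is an illustrative sketch, not part of the crawler's actual
// discovery path: it only shows how a caller might fetch a candidate feed URL
// and hand the body plus response headers to processFeed. The helper name and
// the bare status check are assumptions made for this example.
func (c *Crawler) fetchAndProcess(feedURL, sourceHost string) error {
	req, err := http.NewRequest("GET", feedURL, nil)
	if err != nil {
		return err
	}
	req.Header.Set("User-Agent", c.UserAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status %q for %s", resp.Status, feedURL)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}

	// processFeed deduplicates, type-detects, spam-checks, and stores the feed.
	c.processFeed(feedURL, sourceHost, string(body), resp.Header)
	return nil
}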

// CheckFeed performs a conditional request to check if a feed has been updated
// Returns: newItems (bool), error
func (c *Crawler) CheckFeed(feed *Feed) (bool, error) {
	atomic.AddInt32(&c.feedsChecked, 1)

	// Try different scheme/www combinations since we store URLs without scheme
	urlVariants := []string{
		"https://" + feed.URL,
		"http://" + feed.URL,
		"https://www." + feed.URL,
		"http://www." + feed.URL,
	}

	var resp *http.Response
	var err error

	for _, tryURL := range urlVariants {
		req, reqErr := http.NewRequest("GET", tryURL, nil)
		if reqErr != nil {
			continue
		}

		req.Header.Set("User-Agent", c.UserAgent)

		// Add conditional headers if we have them
		if feed.ETag != "" {
			req.Header.Set("If-None-Match", feed.ETag)
		}
		if feed.LastModified != "" {
			req.Header.Set("If-Modified-Since", feed.LastModified)
		}
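
		// If the server supports conditional requests and nothing has changed,
		// it replies 304 Not Modified with an empty body; that case is handled
		// below and counted as a miss without recording an error.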
		resp, err = c.client.Do(req)
		if err == nil {
			break
		}
	}

	now := time.Now()
	feed.LastCheckedAt = now

	// If no request succeeded, treat as miss
	if resp == nil {
		if err == nil {
			err = fmt.Errorf("all URL variants failed")
		}
		feed.MissCount++
		feed.LastError = err.Error()
		c.saveFeed(feed)
		return false, err
	}
	defer resp.Body.Close()

	// 304 Not Modified - no new items
	if resp.StatusCode == http.StatusNotModified {
		feed.MissCount++
		feed.LastError = ""
		c.saveFeed(feed)
		return false, nil
	}

	// Non-200 response - treat as miss
	if resp.StatusCode != http.StatusOK {
		feed.MissCount++
		feed.LastError = resp.Status
		c.saveFeed(feed)
		return false, nil
	}

	// 200 OK - read body
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		feed.MissCount++
		feed.LastError = err.Error()
		c.saveFeed(feed)
		return false, err
	}

	body := string(bodyBytes)

	// Update cache headers
	feed.ETag = resp.Header.Get("ETag")
	feed.LastModified = resp.Header.Get("Last-Modified")

	// Re-detect type and parse metadata
	// This handles feeds imported with type="html" that need content sniffing
	feedType := c.detectFeedType(body)
	if feedType != "unknown" {
		feed.Type = feedType // Update type if we detected a real feed
	}

	var items []*Item
	switch feedType {
	case "rss":
		items = c.parseRSSMetadata(body, feed)
	case "atom":
		items = c.parseAtomMetadata(body, feed)
	case "json":
		items = c.parseJSONFeedMetadata(body, feed)
	case "unknown":
		// Not a feed - mark as IGNORE if it was imported as html
		if feed.Type == "html" {
			feed.Status = "IGNORE"
			feed.LastError = "not a feed (html content)"
		}
	}

	// Save items and check if we got any
	if len(items) > 0 {
		if err := c.saveItems(items); err != nil {
			feed.MissCount++
			feed.LastError = err.Error()
			c.saveFeed(feed)
			return false, err
		}
		// Items found - reset miss count
		feed.MissCount = 0
		feed.LastError = ""
		c.saveFeed(feed)
		return true, nil
	}

	// No items - increment miss count. Keep LastError when the feed was just
	// marked IGNORE above so the "not a feed" reason is not lost on save.
	feed.MissCount++
	if feed.Status != "IGNORE" {
		feed.LastError = ""
	}
	c.saveFeed(feed)
	return false, nil
}
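
// pollFeeds is an illustrative sketch only: the scheduler that actually drives
// CheckFeed lives elsewhere, so the loop below is an assumed example of how a
// caller might use the boolean result together with MissCount. The helper
// name, the baseInterval parameter, and the linear backoff are hypothetical.
func (c *Crawler) pollFeeds(feeds []*Feed, baseInterval time.Duration) int {
	updated := 0
	for _, feed := range feeds {
		if feed.Status == "IGNORE" {
			continue // never re-check feeds marked as spam or non-feeds
		}

		// Back off on repeated misses so quiet feeds are polled less often.
		wait := baseInterval * time.Duration(feed.MissCount+1)
		if time.Since(feed.LastCheckedAt) < wait {
			continue
		}

		newItems, err := c.CheckFeed(feed)
		if err != nil {
			continue // CheckFeed already recorded the error on the feed
		}
		if newItems {
			updated++
		}
	}
	return updated
}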