crawler/feed.go
primal 219b49352e Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization:
  crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 16:29:00 -05:00

234 lines
5.7 KiB
Go

package main

import (
    "encoding/json"
    "fmt"
    "net/http"
    "strings"
    "time"

    "github.com/cockroachdb/pebble"
)

// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
    URL         string `json:"url"`
    Type        string `json:"type"` // "rss", "atom", or "unknown"
    Title       string `json:"title,omitempty"`
    Description string `json:"description,omitempty"`
    Language    string `json:"language,omitempty"`
    SiteURL     string `json:"site_url,omitempty"` // The website the feed belongs to

    // Timing
    DiscoveredAt  time.Time `json:"discovered_at"`
    LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
    NextCrawlAt   time.Time `json:"next_crawl_at,omitempty"`
    LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated

    // Cache headers for conditional requests
    ETag         string `json:"etag,omitempty"`
    LastModified string `json:"last_modified,omitempty"`

    // Feed hints for crawl scheduling
    TTLMinutes   int    `json:"ttl_minutes,omitempty"`   // From RSS <ttl> element
    UpdatePeriod string `json:"update_period,omitempty"` // From sy:updatePeriod (hourly, daily, weekly, monthly, yearly)
    UpdateFreq   int    `json:"update_freq,omitempty"`   // From sy:updateFrequency

    // Health tracking
    Status      string    `json:"status"` // "active", "dead", "redirect", "error"
    ErrorCount  int       `json:"error_count"`
    LastError   string    `json:"last_error,omitempty"`
    LastErrorAt time.Time `json:"last_error_at,omitempty"`

    // Discovery source
    SourceURL  string `json:"source_url,omitempty"` // Where we found this feed
    SourceHost string `json:"source_host,omitempty"`
    TLD        string `json:"tld,omitempty"`

    // Content stats
    ItemCount      int       `json:"item_count,omitempty"`        // Number of items in last crawl
    AvgPostFreqHrs float64   `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
    OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
    NewestItemDate time.Time `json:"newest_item_date,omitempty"`
}
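
// Note: encoding/json's omitempty has no effect on struct types such as
// time.Time, so the timestamp fields tagged omitempty above still serialize
// as "0001-01-01T00:00:00Z" when unset. Switching them to *time.Time (or to
// the omitzero tag option added in Go 1.24) would actually omit them.
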
// saveFeed stores a feed in PebbleDB
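// under the key "feed:"+feed.URL. Both callers (processFeed, addFeed)
// normalize the URL before saving, matching the normalized lookups in
// getFeed and feedExists. pebble.Sync forces an fsync before Set returns;
// pebble.NoSync would trade durability for bulk-crawl throughput.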
func (c *Crawler) saveFeed(feed *Feed) error {
    data, err := json.Marshal(feed)
    if err != nil {
        return fmt.Errorf("failed to marshal feed: %v", err)
    }
    key := []byte("feed:" + feed.URL)
    return c.db.Set(key, data, pebble.Sync)
}

// getFeed retrieves a feed from PebbleDB. A missing key yields (nil, nil),
// so callers must nil-check the result rather than rely on the error.
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
    key := []byte("feed:" + normalizeURL(feedURL))
    data, closer, err := c.db.Get(key)
    if err != nil {
        if err == pebble.ErrNotFound {
            return nil, nil
        }
        return nil, err
    }
    defer closer.Close()

    var feed Feed
    if err := json.Unmarshal(data, &feed); err != nil {
        return nil, fmt.Errorf("failed to unmarshal feed: %v", err)
    }
    return &feed, nil
}

// feedExists checks if a feed URL already exists in the database
func (c *Crawler) feedExists(feedURL string) bool {
    key := []byte("feed:" + normalizeURL(feedURL))
    _, closer, err := c.db.Get(key)
    if err != nil {
        return false
    }
    closer.Close()
    return true
}
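
// normalizeURL is defined in util.go and is not shown in this file. Per the
// commit message it strips the scheme and a leading "www." prefix. The
// following is a hypothetical sketch of that behavior (illustrative only,
// not the real implementation):
func normalizeURLSketch(rawURL string) string {
    u := strings.TrimPrefix(rawURL, "https://")
    u = strings.TrimPrefix(u, "http://")
    return strings.TrimPrefix(u, "www.") // "https://www.example.com/feed" -> "example.com/feed"
}
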
// GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
    var feeds []*Feed
    iter, err := c.db.NewIter(&pebble.IterOptions{
        LowerBound: []byte("feed:"),
        UpperBound: []byte("feed:\xff"),
    })
    if err != nil {
        return nil, err
    }
    defer iter.Close()

    for iter.First(); iter.Valid(); iter.Next() {
        var feed Feed
        if err := json.Unmarshal(iter.Value(), &feed); err != nil {
            continue
        }
        feeds = append(feeds, &feed)
    }
    if err := iter.Error(); err != nil {
        return nil, err
    }
    return feeds, nil
}

// GetFeedCount returns the total number of feeds in the database
func (c *Crawler) GetFeedCount() (int, error) {
    count := 0
    iter, err := c.db.NewIter(&pebble.IterOptions{
        LowerBound: []byte("feed:"),
        UpperBound: []byte("feed:\xff"),
    })
    if err != nil {
        return 0, err
    }
    defer iter.Close()

    for iter.First(); iter.Valid(); iter.Next() {
        count++
    }
    if err := iter.Error(); err != nil {
        return 0, err
    }
    return count, nil
}
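
// Note: Pebble, like most LSM-tree stores, exposes no exact key count, so
// GetFeedCount is a full scan of the "feed:" keyspace. If the dashboard
// polls it frequently, a cached counter updated on save would be cheaper.
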
// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
    if strings.Contains(feedURL, "/comment") {
        return
    }
    // Fast path: check without lock
    if c.feedExists(feedURL) {
        return
    }
    c.feedsMu.Lock()
    defer c.feedsMu.Unlock()
    // Double-check after acquiring lock
    if c.feedExists(feedURL) {
        return
    }

    feedType := c.detectFeedType(body)
    now := time.Now()
    feed := &Feed{
        URL:           normalizeURL(feedURL),
        Type:          feedType,
        DiscoveredAt:  now,
        LastCrawledAt: now,
        Status:        "active",
        SourceHost:    sourceHost,
        TLD:           getTLD(sourceHost),
        ETag:          headers.Get("ETag"),
        LastModified:  headers.Get("Last-Modified"),
    }

    // Parse feed-specific metadata
    switch feedType {
    case "rss":
        c.parseRSSMetadata(body, feed)
    case "atom":
        c.parseAtomMetadata(body, feed)
    }

    // Calculate next crawl time
    feed.NextCrawlAt = c.calculateNextCrawl(feed)

    if err := c.saveFeed(feed); err != nil {
        return
    }
}
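
// calculateNextCrawl is defined elsewhere (crawler.go, per the commit
// message). A hypothetical sketch of how the stored hints could drive the
// schedule -- RSS <ttl> first, then the sy: syndication-module hints, where
// sy:updateFrequency means "this many updates per sy:updatePeriod" -- is
// given below for illustration only:
func nextCrawlSketch(feed *Feed, now time.Time) time.Time {
    if feed.TTLMinutes > 0 {
        return now.Add(time.Duration(feed.TTLMinutes) * time.Minute)
    }
    periods := map[string]time.Duration{
        "hourly":  time.Hour,
        "daily":   24 * time.Hour,
        "weekly":  7 * 24 * time.Hour,
        "monthly": 30 * 24 * time.Hour,
        "yearly":  365 * 24 * time.Hour,
    }
    if p, ok := periods[feed.UpdatePeriod]; ok {
        n := feed.UpdateFreq
        if n <= 0 {
            n = 1 // sy:updateFrequency defaults to 1
        }
        return now.Add(p / time.Duration(n))
    }
    return now.Add(24 * time.Hour) // no hints: assume a daily revisit
}
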
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
    if strings.Contains(feedURL, "/comment") {
        return
    }
    // Fast path: check without lock
    if c.feedExists(feedURL) {
        return
    }
    c.feedsMu.Lock()
    defer c.feedsMu.Unlock()
    // Double-check after acquiring lock
    if c.feedExists(feedURL) {
        return
    }

    now := time.Now()
    normalizedURL := normalizeURL(feedURL)
    feed := &Feed{
        URL:          normalizedURL,
        Type:         feedType,
        DiscoveredAt: now,
        Status:       "active",
        SourceURL:    normalizeURL(sourceURL),
        SourceHost:   sourceHost,
        TLD:          getTLD(sourceHost),
        NextCrawlAt:  now, // Should be crawled immediately
    }
    if err := c.saveFeed(feed); err != nil {
        return
    }
}
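
// getTLD (util.go) backs the dashboard's per-TLD counts. A hypothetical
// sketch that takes the last dot-separated label -- deliberately naive, and
// wrong for multi-label suffixes such as "co.uk":
func getTLDSketch(host string) string {
    if i := strings.LastIndex(host, "."); i >= 0 {
        return host[i+1:]
    }
    return ""
}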