- Split main.go into separate files for better organization: crawler.go, domain.go, feed.go, parser.go, html.go, util.go - Add PebbleDB for persistent storage of feeds and domains - Store feeds with metadata: title, TTL, update frequency, ETag, etc. - Track domains with crawl status (uncrawled/crawled/error) - Normalize URLs by stripping scheme and www. prefix - Add web dashboard on port 4321 with real-time stats: - Crawl progress with completion percentage - Feed counts by type (RSS/Atom) - Top TLDs and domains by feed count - Recent feeds table - Filter out comment feeds from results Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
234 lines
5.7 KiB
Go
234 lines
5.7 KiB
Go
package main
|
|
|
|
import (
	"encoding/json"
	"errors"
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/cockroachdb/pebble"
)
|
|
|
|
// Feed represents a discovered RSS/Atom feed with metadata
|
|
// Feed represents a discovered RSS/Atom feed with metadata. It is stored
// as JSON in PebbleDB under the key "feed:<URL>" and carries everything
// needed for conditional re-fetching, crawl scheduling, and dashboard stats.
type Feed struct {
	// URL is the normalized feed URL (scheme and "www." stripped by
	// normalizeURL); it doubles as the database key suffix.
	URL         string `json:"url"`
	Type        string `json:"type"` // "rss", "atom", or "unknown"
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"`
	SiteURL     string `json:"site_url,omitempty"` // The website the feed belongs to

	// Timing
	DiscoveredAt  time.Time `json:"discovered_at"`
	LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
	NextCrawlAt   time.Time `json:"next_crawl_at,omitempty"` // computed by calculateNextCrawl
	LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated

	// Cache headers for conditional requests (copied from the HTTP
	// response so later fetches can send If-None-Match / If-Modified-Since).
	ETag         string `json:"etag,omitempty"`
	LastModified string `json:"last_modified,omitempty"`

	// Feed hints for crawl scheduling
	TTLMinutes   int    `json:"ttl_minutes,omitempty"`   // From RSS <ttl> element
	UpdatePeriod string `json:"update_period,omitempty"` // From sy:updatePeriod (hourly, daily, weekly, monthly, yearly)
	UpdateFreq   int    `json:"update_freq,omitempty"`   // From sy:updateFrequency

	// Health tracking
	Status      string    `json:"status"` // "active", "dead", "redirect", "error"
	ErrorCount  int       `json:"error_count"`
	LastError   string    `json:"last_error,omitempty"`
	LastErrorAt time.Time `json:"last_error_at,omitempty"`

	// Discovery source
	SourceURL  string `json:"source_url,omitempty"` // Where we found this feed
	SourceHost string `json:"source_host,omitempty"`
	TLD        string `json:"tld,omitempty"`

	// Content stats
	ItemCount      int       `json:"item_count,omitempty"`        // Number of items in last crawl
	AvgPostFreqHrs float64   `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
	OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
	NewestItemDate time.Time `json:"newest_item_date,omitempty"`
}
|
|
|
|
// saveFeed stores a feed in PebbleDB
|
|
func (c *Crawler) saveFeed(feed *Feed) error {
|
|
data, err := json.Marshal(feed)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal feed: %v", err)
|
|
}
|
|
|
|
key := []byte("feed:" + feed.URL)
|
|
return c.db.Set(key, data, pebble.Sync)
|
|
}
|
|
|
|
// getFeed retrieves a feed from PebbleDB
|
|
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
|
|
key := []byte("feed:" + normalizeURL(feedURL))
|
|
data, closer, err := c.db.Get(key)
|
|
if err != nil {
|
|
if err == pebble.ErrNotFound {
|
|
return nil, nil
|
|
}
|
|
return nil, err
|
|
}
|
|
defer closer.Close()
|
|
|
|
var feed Feed
|
|
if err := json.Unmarshal(data, &feed); err != nil {
|
|
return nil, fmt.Errorf("failed to unmarshal feed: %v", err)
|
|
}
|
|
return &feed, nil
|
|
}
|
|
|
|
// feedExists checks if a feed URL already exists in the database
|
|
func (c *Crawler) feedExists(feedURL string) bool {
|
|
key := []byte("feed:" + normalizeURL(feedURL))
|
|
_, closer, err := c.db.Get(key)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
closer.Close()
|
|
return true
|
|
}
|
|
|
|
// GetAllFeeds returns all feeds from the database
|
|
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
|
|
var feeds []*Feed
|
|
|
|
iter, err := c.db.NewIter(&pebble.IterOptions{
|
|
LowerBound: []byte("feed:"),
|
|
UpperBound: []byte("feed:\xff"),
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer iter.Close()
|
|
|
|
for iter.First(); iter.Valid(); iter.Next() {
|
|
var feed Feed
|
|
if err := json.Unmarshal(iter.Value(), &feed); err != nil {
|
|
continue
|
|
}
|
|
feeds = append(feeds, &feed)
|
|
}
|
|
|
|
if err := iter.Error(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return feeds, nil
|
|
}
|
|
|
|
// GetFeedCount returns the total number of feeds in the database
|
|
func (c *Crawler) GetFeedCount() (int, error) {
|
|
count := 0
|
|
|
|
iter, err := c.db.NewIter(&pebble.IterOptions{
|
|
LowerBound: []byte("feed:"),
|
|
UpperBound: []byte("feed:\xff"),
|
|
})
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
defer iter.Close()
|
|
|
|
for iter.First(); iter.Valid(); iter.Next() {
|
|
count++
|
|
}
|
|
|
|
if err := iter.Error(); err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
return count, nil
|
|
}
|
|
|
|
// processFeed parses and stores a feed with full metadata
|
|
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
|
|
if strings.Contains(feedURL, "/comment") {
|
|
return
|
|
}
|
|
|
|
// Fast path: check without lock
|
|
if c.feedExists(feedURL) {
|
|
return
|
|
}
|
|
|
|
c.feedsMu.Lock()
|
|
defer c.feedsMu.Unlock()
|
|
|
|
// Double-check after acquiring lock
|
|
if c.feedExists(feedURL) {
|
|
return
|
|
}
|
|
|
|
feedType := c.detectFeedType(body)
|
|
now := time.Now()
|
|
|
|
feed := &Feed{
|
|
URL: normalizeURL(feedURL),
|
|
Type: feedType,
|
|
DiscoveredAt: now,
|
|
LastCrawledAt: now,
|
|
Status: "active",
|
|
SourceHost: sourceHost,
|
|
TLD: getTLD(sourceHost),
|
|
ETag: headers.Get("ETag"),
|
|
LastModified: headers.Get("Last-Modified"),
|
|
}
|
|
|
|
// Parse feed-specific metadata
|
|
switch feedType {
|
|
case "rss":
|
|
c.parseRSSMetadata(body, feed)
|
|
case "atom":
|
|
c.parseAtomMetadata(body, feed)
|
|
}
|
|
|
|
// Calculate next crawl time
|
|
feed.NextCrawlAt = c.calculateNextCrawl(feed)
|
|
|
|
if err := c.saveFeed(feed); err != nil {
|
|
return
|
|
}
|
|
}
|
|
|
|
// addFeed adds a discovered feed URL (not yet fetched)
|
|
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
|
|
if strings.Contains(feedURL, "/comment") {
|
|
return
|
|
}
|
|
|
|
// Fast path: check without lock
|
|
if c.feedExists(feedURL) {
|
|
return
|
|
}
|
|
|
|
c.feedsMu.Lock()
|
|
defer c.feedsMu.Unlock()
|
|
|
|
// Double-check after acquiring lock
|
|
if c.feedExists(feedURL) {
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
normalizedURL := normalizeURL(feedURL)
|
|
feed := &Feed{
|
|
URL: normalizedURL,
|
|
Type: feedType,
|
|
DiscoveredAt: now,
|
|
Status: "active",
|
|
SourceURL: normalizeURL(sourceURL),
|
|
SourceHost: sourceHost,
|
|
TLD: getTLD(sourceHost),
|
|
NextCrawlAt: now, // Should be crawled immediately
|
|
}
|
|
|
|
if err := c.saveFeed(feed); err != nil {
|
|
return
|
|
}
|
|
}
|