Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization: crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

@@ -0,0 +1,233 @@
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/cockroachdb/pebble"
)

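The methods below hang off a Crawler defined in crawler.go, which is not part of this file. A minimal sketch of the two fields this code relies on; the field names are taken from the calls below, the types are assumptions, and the sketch would also need "sync" in the import list:

// Sketch only: the real definition lives in crawler.go and likely
// carries more crawler state. sync.Mutex here is an assumption.
type Crawler struct {
	db      *pebble.DB // persistent store for feeds and domains
	feedsMu sync.Mutex // guards the check-then-insert paths below
}
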
// Feed represents a discovered RSS/Atom feed with metadata
type Feed struct {
	URL         string `json:"url"`
	Type        string `json:"type"` // "rss", "atom", or "unknown"
	Title       string `json:"title,omitempty"`
	Description string `json:"description,omitempty"`
	Language    string `json:"language,omitempty"`
	SiteURL     string `json:"site_url,omitempty"` // The website the feed belongs to

	// Timing (note: encoding/json does not treat the zero time.Time as
	// empty, so omitempty is a no-op on the time fields in this struct)
	DiscoveredAt  time.Time `json:"discovered_at"`
	LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
	NextCrawlAt   time.Time `json:"next_crawl_at,omitempty"`
	LastBuildDate time.Time `json:"last_build_date,omitempty"` // From feed's lastBuildDate/updated

	// Cache headers for conditional requests
	ETag         string `json:"etag,omitempty"`
	LastModified string `json:"last_modified,omitempty"`

	// Feed hints for crawl scheduling
	TTLMinutes   int    `json:"ttl_minutes,omitempty"`   // From RSS <ttl> element
	UpdatePeriod string `json:"update_period,omitempty"` // From sy:updatePeriod (hourly, daily, weekly, monthly, yearly)
	UpdateFreq   int    `json:"update_freq,omitempty"`   // From sy:updateFrequency

	// Health tracking
	Status      string    `json:"status"` // "active", "dead", "redirect", "error"
	ErrorCount  int       `json:"error_count"`
	LastError   string    `json:"last_error,omitempty"`
	LastErrorAt time.Time `json:"last_error_at,omitempty"`

	// Discovery source
	SourceURL  string `json:"source_url,omitempty"` // Where we found this feed
	SourceHost string `json:"source_host,omitempty"`
	TLD        string `json:"tld,omitempty"`

	// Content stats
	ItemCount      int       `json:"item_count,omitempty"`        // Number of items in last crawl
	AvgPostFreqHrs float64   `json:"avg_post_freq_hrs,omitempty"` // Average hours between posts
	OldestItemDate time.Time `json:"oldest_item_date,omitempty"`
	NewestItemDate time.Time `json:"newest_item_date,omitempty"`
}

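For illustration only, a round trip through the JSON encoding used by the storage helpers below, with made-up values (the function itself is hypothetical, not part of this commit):

// Hypothetical example: how a Feed is encoded before hitting PebbleDB.
func exampleFeedJSON() {
	f := Feed{URL: "example.com/feed.xml", Type: "rss", Status: "active"}
	b, _ := json.Marshal(f)
	fmt.Println(string(b))
	// {"url":"example.com/feed.xml","type":"rss","discovered_at":"0001-01-01T00:00:00Z",...}
	// The zero time fields appear despite omitempty (see note above).
}
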
// saveFeed stores a feed in PebbleDB, keyed by its URL
func (c *Crawler) saveFeed(feed *Feed) error {
	data, err := json.Marshal(feed)
	if err != nil {
		return fmt.Errorf("failed to marshal feed: %w", err)
	}

	// Callers (processFeed/addFeed) set feed.URL via normalizeURL, so the
	// key written here matches the normalized lookups in getFeed/feedExists.
	key := []byte("feed:" + feed.URL)
	return c.db.Set(key, data, pebble.Sync)
}

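For illustration, the resulting keyspace with made-up URLs; every record shares the "feed:" prefix, which is what the range scans further down rely on:

// Hypothetical keys after a few saveFeed calls:
//   feed:example.com/feed.xml
//   feed:example.org/blog/atom.xml
// Scanning LowerBound "feed:" to UpperBound "feed:\xff" (see
// GetAllFeeds/GetFeedCount) visits exactly these records.
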
// getFeed retrieves a feed from PebbleDB
func (c *Crawler) getFeed(feedURL string) (*Feed, error) {
	key := []byte("feed:" + normalizeURL(feedURL))
	data, closer, err := c.db.Get(key)
	if err != nil {
		if err == pebble.ErrNotFound {
			return nil, nil
		}
		return nil, err
	}
	defer closer.Close()

	var feed Feed
	if err := json.Unmarshal(data, &feed); err != nil {
		return nil, fmt.Errorf("failed to unmarshal feed: %w", err)
	}
	return &feed, nil
}

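normalizeURL lives in util.go and is not shown in this diff; a minimal sketch consistent with the commit message's "stripping scheme and www. prefix" (the real implementation may differ):

// Sketch only: the real normalizeURL is in util.go. It maps URL
// variants to one stable PebbleDB key, e.g.
// "https://www.example.com/feed.xml" -> "example.com/feed.xml".
func normalizeURL(raw string) string {
	s := strings.TrimPrefix(raw, "https://")
	s = strings.TrimPrefix(s, "http://")
	return strings.TrimPrefix(s, "www.")
}
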
// feedExists checks if a feed URL already exists in the database
func (c *Crawler) feedExists(feedURL string) bool {
	key := []byte("feed:" + normalizeURL(feedURL))
	_, closer, err := c.db.Get(key)
	if err != nil {
		return false
	}
	closer.Close()
	return true
}

// GetAllFeeds returns all feeds from the database
func (c *Crawler) GetAllFeeds() ([]*Feed, error) {
	var feeds []*Feed

	iter, err := c.db.NewIter(&pebble.IterOptions{
		LowerBound: []byte("feed:"),
		UpperBound: []byte("feed:\xff"),
	})
	if err != nil {
		return nil, err
	}
	defer iter.Close()

	for iter.First(); iter.Valid(); iter.Next() {
		var feed Feed
		if err := json.Unmarshal(iter.Value(), &feed); err != nil {
			continue
		}
		feeds = append(feeds, &feed)
	}

	if err := iter.Error(); err != nil {
		return nil, err
	}

	return feeds, nil
}

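The commit message lists dashboard stats such as feed counts by type; a hypothetical helper built on GetAllFeeds (not part of this commit) could back that stat:

// Hypothetical dashboard helper: tallies feeds by Type
// ("rss", "atom", "unknown") for the stats page.
func (c *Crawler) feedCountsByType() (map[string]int, error) {
	feeds, err := c.GetAllFeeds()
	if err != nil {
		return nil, err
	}
	counts := make(map[string]int)
	for _, f := range feeds {
		counts[f.Type]++
	}
	return counts, nil
}
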
// GetFeedCount returns the total number of feeds in the database
func (c *Crawler) GetFeedCount() (int, error) {
	count := 0

	iter, err := c.db.NewIter(&pebble.IterOptions{
		LowerBound: []byte("feed:"),
		UpperBound: []byte("feed:\xff"),
	})
	if err != nil {
		return 0, err
	}
	defer iter.Close()

	for iter.First(); iter.Valid(); iter.Next() {
		count++
	}

	if err := iter.Error(); err != nil {
		return 0, err
	}

	return count, nil
}

// processFeed parses and stores a feed with full metadata
func (c *Crawler) processFeed(feedURL, sourceHost, body string, headers http.Header) {
	if strings.Contains(feedURL, "/comment") {
		return
	}

	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	feedType := c.detectFeedType(body)
	now := time.Now()

	feed := &Feed{
		URL:           normalizeURL(feedURL),
		Type:          feedType,
		DiscoveredAt:  now,
		LastCrawledAt: now,
		Status:        "active",
		SourceHost:    sourceHost,
		TLD:           getTLD(sourceHost),
		ETag:          headers.Get("ETag"),
		LastModified:  headers.Get("Last-Modified"),
	}

	// Parse feed-specific metadata
	switch feedType {
	case "rss":
		c.parseRSSMetadata(body, feed)
	case "atom":
		c.parseAtomMetadata(body, feed)
	}

	// Calculate next crawl time
	feed.NextCrawlAt = c.calculateNextCrawl(feed)

	if err := c.saveFeed(feed); err != nil {
		return
	}
}

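calculateNextCrawl is defined elsewhere in this commit; below is a sketch of plausible scheduling logic built from the stored hints, where the intervals and the fallback are assumptions rather than the commit's actual values:

// Sketch only: honor RSS <ttl> first, then the sy:updatePeriod /
// sy:updateFrequency pair, then fall back to a default interval.
func calculateNextCrawlSketch(feed *Feed) time.Time {
	now := time.Now()
	if feed.TTLMinutes > 0 {
		return now.Add(time.Duration(feed.TTLMinutes) * time.Minute)
	}
	periods := map[string]time.Duration{
		"hourly":  time.Hour,
		"daily":   24 * time.Hour,
		"weekly":  7 * 24 * time.Hour,
		"monthly": 30 * 24 * time.Hour,
		"yearly":  365 * 24 * time.Hour,
	}
	if d, ok := periods[feed.UpdatePeriod]; ok {
		freq := feed.UpdateFreq
		if freq < 1 {
			freq = 1
		}
		return now.Add(d / time.Duration(freq)) // e.g. "daily" with freq 2 = every 12h
	}
	return now.Add(time.Hour) // assumed default
}
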
// addFeed adds a discovered feed URL (not yet fetched)
func (c *Crawler) addFeed(feedURL, feedType, sourceHost, sourceURL string) {
	if strings.Contains(feedURL, "/comment") {
		return
	}

	// Fast path: check without lock
	if c.feedExists(feedURL) {
		return
	}

	c.feedsMu.Lock()
	defer c.feedsMu.Unlock()

	// Double-check after acquiring lock
	if c.feedExists(feedURL) {
		return
	}

	now := time.Now()
	normalizedURL := normalizeURL(feedURL)
	feed := &Feed{
		URL:          normalizedURL,
		Type:         feedType,
		DiscoveredAt: now,
		Status:       "active",
		SourceURL:    normalizeURL(sourceURL),
		SourceHost:   sourceHost,
		TLD:          getTLD(sourceHost),
		NextCrawlAt:  now, // Should be crawled immediately
	}

	if err := c.saveFeed(feed); err != nil {
		return
	}
}
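
The ETag and Last-Modified fields stored above exist for conditional re-crawls; a sketch of how a later fetch would use them (hypothetical, not part of this commit; "https://" is prepended here because normalizeURL strips the scheme):

// Sketch only: an unchanged feed answers 304 Not Modified, so the
// crawler can skip parsing entirely.
func conditionalGet(client *http.Client, feed *Feed) (*http.Response, error) {
	req, err := http.NewRequest("GET", "https://"+feed.URL, nil)
	if err != nil {
		return nil, err
	}
	if feed.ETag != "" {
		req.Header.Set("If-None-Match", feed.ETag)
	}
	if feed.LastModified != "" {
		req.Header.Set("If-Modified-Since", feed.LastModified)
	}
	return client.Do(req)
}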