Add PebbleDB storage, domain tracking, and web dashboard

- Split main.go into separate files for better organization:
  crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping the scheme and any www. prefix (sketched
  after this list)
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results
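
A minimal sketch of that normalization, assuming the committed
normalizeHost in util.go behaves as the bullet describes (the helper
itself is not shown in this diff):

	// normalizeHost lowercases a host and strips the URL scheme, a
	// leading "www." prefix, and any trailing slash.
	func normalizeHost(host string) string {
		host = strings.ToLower(strings.TrimSpace(host))
		host = strings.TrimPrefix(host, "http://")
		host = strings.TrimPrefix(host, "https://")
		host = strings.TrimPrefix(host, "www.")
		return strings.TrimSuffix(host, "/")
	}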

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
primal
2026-01-22 16:29:00 -05:00
parent 0dd612b7e1
commit 219b49352e
9 changed files with 1574 additions and 642 deletions
domain.go (+227)
@@ -0,0 +1,227 @@
package main

import (
	"bufio"
	"compress/gzip"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"strings"
	"time"

	"github.com/cockroachdb/pebble"
)

// Domain represents a host to be crawled for feeds.
type Domain struct {
	Host          string    `json:"host"`   // Normalized hostname (no scheme, no www.)
	Status        string    `json:"status"` // "uncrawled", "crawled", "error"
	DiscoveredAt  time.Time `json:"discovered_at"`
	LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
	FeedsFound    int       `json:"feeds_found,omitempty"`
	LastError     string    `json:"last_error,omitempty"`
	TLD           string    `json:"tld,omitempty"`
}
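
// Note: encoding/json's omitempty has no effect on struct-typed fields such
// as time.Time, so a never-crawled domain still serializes last_crawled_at
// as the zero time "0001-01-01T00:00:00Z".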

// saveDomain stores a domain in PebbleDB under the key "domain:<host>".
func (c *Crawler) saveDomain(domain *Domain) error {
	data, err := json.Marshal(domain)
	if err != nil {
		return fmt.Errorf("failed to marshal domain: %v", err)
	}
	key := []byte("domain:" + domain.Host)
	return c.db.Set(key, data, pebble.Sync)
}

// getDomain retrieves a domain from PebbleDB; it returns (nil, nil) when
// the host is not in the database.
func (c *Crawler) getDomain(host string) (*Domain, error) {
	key := []byte("domain:" + normalizeHost(host))
	data, closer, err := c.db.Get(key)
	if err != nil {
		if err == pebble.ErrNotFound {
			return nil, nil
		}
		return nil, err
	}
	defer closer.Close()

	var domain Domain
	if err := json.Unmarshal(data, &domain); err != nil {
		return nil, fmt.Errorf("failed to unmarshal domain: %v", err)
	}
	return &domain, nil
}

// domainExists checks if a domain already exists in the database.
func (c *Crawler) domainExists(host string) bool {
	key := []byte("domain:" + normalizeHost(host))
	_, closer, err := c.db.Get(key)
	if err != nil {
		return false
	}
	closer.Close()
	return true
}

// GetUncrawledDomains returns all domains with status "uncrawled".
func (c *Crawler) GetUncrawledDomains() ([]*Domain, error) {
	var domains []*Domain
	// Scan the whole "domain:" prefix; 0xff sorts after every byte that
	// occurs in a hostname, so it serves as the exclusive upper bound.
	iter, err := c.db.NewIter(&pebble.IterOptions{
		LowerBound: []byte("domain:"),
		UpperBound: []byte("domain:\xff"),
	})
	if err != nil {
		return nil, err
	}
	defer iter.Close()

	for iter.First(); iter.Valid(); iter.Next() {
		var domain Domain
		if err := json.Unmarshal(iter.Value(), &domain); err != nil {
			continue
		}
		if domain.Status == "uncrawled" {
			domains = append(domains, &domain)
		}
	}
	if err := iter.Error(); err != nil {
		return nil, err
	}
	return domains, nil
}

// markDomainCrawled updates a domain's status after crawling.
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
	domain, err := c.getDomain(host)
	if err != nil {
		return err
	}
	if domain == nil {
		return fmt.Errorf("domain not found: %s", host)
	}
	domain.LastCrawledAt = time.Now()
	domain.FeedsFound = feedsFound
	if lastError != "" {
		domain.Status = "error"
		domain.LastError = lastError
	} else {
		domain.Status = "crawled"
		domain.LastError = ""
	}
	return c.saveDomain(domain)
}

// GetDomainCount returns the total number of domains in the database and
// how many of them are still uncrawled.
func (c *Crawler) GetDomainCount() (total int, uncrawled int, err error) {
	iter, err := c.db.NewIter(&pebble.IterOptions{
		LowerBound: []byte("domain:"),
		UpperBound: []byte("domain:\xff"),
	})
	if err != nil {
		return 0, 0, err
	}
	defer iter.Close()

	for iter.First(); iter.Valid(); iter.Next() {
		total++
		var domain Domain
		if err := json.Unmarshal(iter.Value(), &domain); err != nil {
			continue
		}
		if domain.Status == "uncrawled" {
			uncrawled++
		}
	}
	if err := iter.Error(); err != nil {
		return 0, 0, err
	}
	return total, uncrawled, nil
}

// ImportDomainsFromFile reads a vertices file (plain or gzip-compressed)
// and stores new domains as "uncrawled".
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
	file, err := os.Open(filename)
	if err != nil {
		return 0, 0, fmt.Errorf("failed to open file: %v", err)
	}
	defer file.Close()
	return c.parseAndStoreDomains(file, limit)
}

func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported int, skipped int, err error) {
	var bodyReader io.Reader
	bufReader := bufio.NewReader(reader)

	// Sniff the first two bytes: 0x1f 0x8b is the gzip magic number, so
	// transparently decompress when it is present.
	peekBytes, err := bufReader.Peek(2)
	if err != nil && err != io.EOF {
		return 0, 0, fmt.Errorf("failed to peek at file: %v", err)
	}
	if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b {
		gzReader, err := gzip.NewReader(bufReader)
		if err != nil {
			return 0, 0, fmt.Errorf("failed to create gzip reader: %v", err)
		}
		defer gzReader.Close()
		bodyReader = gzReader
	} else {
		bodyReader = bufReader
	}

	scanner := bufio.NewScanner(bodyReader)
	// Allow lines up to 1 MiB; bufio.Scanner's default 64 KiB token limit
	// can be too small for some inputs.
	buf := make([]byte, 0, 64*1024)
	scanner.Buffer(buf, 1024*1024)

	now := time.Now()
	count := 0
	for scanner.Scan() {
		if limit > 0 && count >= limit {
			break
		}
		// Lines are tab-separated; the second field holds the hostname in
		// reversed form (see reverseHost).
		line := scanner.Text()
		parts := strings.Split(line, "\t")
		if len(parts) >= 2 {
			reverseHostName := strings.TrimSpace(parts[1])
			if reverseHostName != "" {
				host := normalizeHost(reverseHost(reverseHostName))
				count++
				// Skip if the domain already exists.
				if c.domainExists(host) {
					skipped++
					continue
				}
				// Store the new domain as uncrawled.
				domain := &Domain{
					Host:         host,
					Status:       "uncrawled",
					DiscoveredAt: now,
					TLD:          getTLD(host),
				}
				if err := c.saveDomain(domain); err != nil {
					continue
				}
				imported++
			}
		}
	}
	if err := scanner.Err(); err != nil {
		return imported, skipped, fmt.Errorf("error reading file: %v", err)
	}
	return imported, skipped, nil
}
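
The importer also depends on two util.go helpers that this diff does not
show. A plausible sketch, assuming the vertices file stores hosts in
reversed-label order (e.g. "com.example.blog" for "blog.example.com"):

	// reverseHost converts a reversed-label hostname back to normal order.
	func reverseHost(reversed string) string {
		labels := strings.Split(reversed, ".")
		for i, j := 0, len(labels)-1; i < j; i, j = i+1, j-1 {
			labels[i], labels[j] = labels[j], labels[i]
		}
		return strings.Join(labels, ".")
	}

	// getTLD returns the final label of a hostname ("com", "org", ...).
	func getTLD(host string) string {
		if i := strings.LastIndex(host, "."); i >= 0 {
			return host[i+1:]
		}
		return ""
	}

A typical import call then looks like this (filename and limit are
hypothetical):

	imported, skipped, err := crawler.ImportDomainsFromFile("vertices.txt.gz", 100000)
	if err != nil {
		log.Fatalf("import failed: %v", err)
	}
	log.Printf("imported %d domains, skipped %d duplicates", imported, skipped)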