Files
crawler/main.go
primal 219b49352e Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization:
  crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 16:29:00 -05:00

31 lines · 620 B · Go

package main

import (
	"fmt"
	"os"
)

func main() {
	crawler, err := NewCrawler("feeds.db")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
		os.Exit(1)
	}
	defer crawler.Close()

	// Start dashboard in background
	go func() {
		if err := crawler.StartDashboard("0.0.0.0:4321"); err != nil {
			fmt.Fprintf(os.Stderr, "Dashboard error: %v\n", err)
		}
	}()

	// Import domains from vertices file (only adds new ones as "uncrawled")
	crawler.ImportDomainsFromFile("vertices.txt.gz", 0)

	// Crawl all uncrawled domains (runs continuously)
	for {
		crawler.CrawlUncrawledDomains()
	}
}
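
The commit message describes PebbleDB persistence for feeds (title, TTL, ETag, and so on) and for domains with an uncrawled/crawled/error status, but the files that implement it (feed.go, domain.go) are not shown on this page. Below is a minimal sketch of such a storage layer, assuming JSON values and prefixed keys; the struct fields, key scheme, and function names are assumptions, not the repository's actual code.

package main

import (
	"encoding/json"
	"time"

	"github.com/cockroachdb/pebble"
)

// Domain crawl states; the commit message lists these three values,
// the constant names are assumed.
const (
	StatusUncrawled = "uncrawled"
	StatusCrawled   = "crawled"
	StatusError     = "error"
)

// Feed holds the kind of metadata the commit message lists.
type Feed struct {
	URL        string    `json:"url"`
	Title      string    `json:"title"`
	Type       string    `json:"type"` // "rss" or "atom"
	TTLMinutes int       `json:"ttl_minutes"`
	ETag       string    `json:"etag"`
	FetchedAt  time.Time `json:"fetched_at"`
}

// saveFeed persists a feed record as JSON under a "feed/"-prefixed key.
func saveFeed(db *pebble.DB, f Feed) error {
	v, err := json.Marshal(f)
	if err != nil {
		return err
	}
	return db.Set([]byte("feed/"+f.URL), v, pebble.Sync)
}

// setDomainStatus records a domain's crawl status under a "domain/"-prefixed key.
func setDomainStatus(db *pebble.DB, domain, status string) error {
	return db.Set([]byte("domain/"+domain), []byte(status), pebble.Sync)
}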
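
The "normalize URLs by stripping scheme and www. prefix" step could look roughly like the helper below; the function name and exact rules are assumptions (the real code presumably lives in util.go).

package main

import (
	"net/url"
	"strings"
)

// normalizeDomain reduces a URL to a scheme-less, "www."-less key, e.g.
// "https://www.example.com/feed" -> "example.com/feed".
func normalizeDomain(raw string) string {
	s := strings.TrimSpace(raw)
	// Drop the scheme, if the input parses as an absolute URL.
	if u, err := url.Parse(s); err == nil && u.Host != "" {
		s = u.Host + u.Path
	}
	// Drop a leading "www." prefix.
	return strings.TrimPrefix(s, "www.")
}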
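
The dashboard on 0.0.0.0:4321 reports crawl progress, feed counts by type, and top TLDs/domains. A sketch of how such stats could be exposed with net/http follows; the Stats fields, the /stats route, and the snapshot callback are assumptions about an implementation that is not shown here.

package main

import (
	"encoding/json"
	"net/http"
)

// Stats mirrors the numbers the commit message says the dashboard shows.
type Stats struct {
	DomainsTotal   int            `json:"domains_total"`
	DomainsCrawled int            `json:"domains_crawled"`
	PercentDone    float64        `json:"percent_done"`
	FeedsByType    map[string]int `json:"feeds_by_type"` // e.g. {"rss": 12, "atom": 5}
	TopDomains     map[string]int `json:"top_domains"`
}

// startDashboard serves the current stats as JSON; an HTML page could render
// the same data as tables of top TLDs, top domains, and recent feeds.
func startDashboard(addr string, snapshot func() Stats) error {
	mux := http.NewServeMux()
	mux.HandleFunc("/stats", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(snapshot())
	})
	return http.ListenAndServe(addr, mux)
}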
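
Filtering out comment feeds could be a simple URL heuristic like the one below; the patterns are assumptions based on common blog-engine conventions (e.g. WordPress "/comments/feed/"), not the project's actual rules.

package main

import "strings"

// isCommentFeed reports whether a feed URL looks like a per-post or site-wide
// comment feed rather than a primary content feed.
func isCommentFeed(feedURL string) bool {
	u := strings.ToLower(feedURL)
	return strings.Contains(u, "/comments/feed") ||
		strings.Contains(u, "comments.xml") ||
		strings.Contains(u, "feed=comments")
}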