- Split main.go into separate files for better organization: crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix (sketched below)
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
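The URL normalization mentioned above is what lets two spellings of the same address collapse into one stored record. A minimal sketch of such a helper, assuming a hypothetical `normalizeURL` name (the real code presumably lives in util.go and may differ):

```go
package main

import (
	"fmt"
	"strings"
)

// normalizeURL strips the scheme and a leading "www." prefix so that
// equivalent URLs map to the same canonical key. Hypothetical helper,
// not necessarily the commit's actual implementation.
func normalizeURL(raw string) string {
	u := strings.TrimSpace(raw)
	// Drop the scheme ("http://", "https://", ...).
	if i := strings.Index(u, "://"); i != -1 {
		u = u[i+3:]
	}
	// Drop a leading "www." prefix.
	return strings.TrimPrefix(u, "www.")
}

func main() {
	fmt.Println(normalizeURL("https://www.example.com/feed.xml")) // example.com/feed.xml
	fmt.Println(normalizeURL("http://example.com/feed.xml"))      // example.com/feed.xml
}
```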
main.go · 31 lines · 620 B · Go
```go
package main

import (
	"fmt"
	"os"
)

func main() {
	crawler, err := NewCrawler("feeds.db")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error initializing crawler: %v\n", err)
		os.Exit(1)
	}
	defer crawler.Close()

	// Start dashboard in background
	go func() {
		if err := crawler.StartDashboard("0.0.0.0:4321"); err != nil {
			fmt.Fprintf(os.Stderr, "Dashboard error: %v\n", err)
		}
	}()

	// Import domains from vertices file (only adds new ones as "uncrawled")
	crawler.ImportDomainsFromFile("vertices.txt.gz", 0)

	// Crawl all uncrawled domains (runs continuously)
	for {
		crawler.CrawlUncrawledDomains()
	}
}
```
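main.go only wires things together; `NewCrawler`, `Close`, and the rest live in the files the commit split out. A minimal sketch of what the PebbleDB-backed core could look like, assuming a `domain/` key prefix and plain-string statuses (both assumptions, not the commit's actual schema):

```go
package main

import (
	"github.com/cockroachdb/pebble"
)

// Crawler wraps the PebbleDB handle that persists feeds and domains.
type Crawler struct {
	db *pebble.DB
}

// NewCrawler opens (or creates) the database at path.
func NewCrawler(path string) (*Crawler, error) {
	db, err := pebble.Open(path, &pebble.Options{})
	if err != nil {
		return nil, err
	}
	return &Crawler{db: db}, nil
}

// Close flushes and closes the underlying database.
func (c *Crawler) Close() error {
	return c.db.Close()
}

// markDomain records a crawl status ("uncrawled", "crawled", "error")
// under a "domain/" key prefix. Prefix and status strings are
// illustrative only.
func (c *Crawler) markDomain(domain, status string) error {
	return c.db.Set([]byte("domain/"+domain), []byte(status), pebble.Sync)
}
```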
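Continuing the sketch above, `ImportDomainsFromFile` as called in main could stream the gzipped vertices file line by line, adding only unseen domains as "uncrawled". Treating the second argument as a line cap with 0 meaning unlimited is a guess from the call site, not documented behavior:

```go
package main

import (
	"bufio"
	"compress/gzip"
	"os"

	"github.com/cockroachdb/pebble"
)

// ImportDomainsFromFile streams a gzipped list of domains (one per
// line) and records each unseen one as "uncrawled". limit caps how
// many lines are read; 0 is taken to mean no cap. Illustrative
// reading of the API, reusing normalizeURL and markDomain from the
// sketches above.
func (c *Crawler) ImportDomainsFromFile(path string, limit int) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	gz, err := gzip.NewReader(f)
	if err != nil {
		return err
	}
	defer gz.Close()

	scanner := bufio.NewScanner(gz)
	n := 0
	for scanner.Scan() {
		if limit > 0 && n >= limit {
			break
		}
		n++
		domain := normalizeURL(scanner.Text())
		if domain == "" {
			continue
		}
		// Only add new domains; existing statuses stay untouched.
		_, closer, err := c.db.Get([]byte("domain/" + domain))
		switch {
		case err == pebble.ErrNotFound:
			if err := c.markDomain(domain, "uncrawled"); err != nil {
				return err
			}
		case err != nil:
			return err
		default:
			closer.Close() // already known
		}
	}
	return scanner.Err()
}
```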
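Finally, `StartDashboard` blocks in `http.ListenAndServe`, which is why main runs it in a goroutine and only ever sees an error from it. A hypothetical JSON stats endpoint, reusing the `Crawler` type from the sketch above (the field set and route are invented, and the real dashboard also renders HTML tables for top TLDs, top domains, and recent feeds):

```go
package main

import (
	"encoding/json"
	"net/http"
)

// stats mirrors the dashboard's headline numbers: crawl progress and
// feed counts by type. Field names are a guess at the real schema.
type stats struct {
	DomainsTotal   int     `json:"domains_total"`
	DomainsCrawled int     `json:"domains_crawled"`
	PercentDone    float64 `json:"percent_done"`
	FeedsRSS       int     `json:"feeds_rss"`
	FeedsAtom      int     `json:"feeds_atom"`
}

// currentStats would aggregate counts from PebbleDB; stubbed here so
// the sketch compiles on its own.
func (c *Crawler) currentStats() stats { return stats{} }

// StartDashboard serves the stats as JSON on addr. It blocks until
// the server fails.
func (c *Crawler) StartDashboard(addr string) error {
	mux := http.NewServeMux()
	mux.HandleFunc("/stats", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(c.currentStats())
	})
	return http.ListenAndServe(addr, mux)
}
```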