Add PebbleDB storage, domain tracking, and web dashboard
- Split main.go into separate files for better organization: crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping scheme and www. prefix
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -0,0 +1,227 @@
package main

import (
	"bufio"
	"compress/gzip"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"strings"
	"time"

	"github.com/cockroachdb/pebble"
)
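
// The methods in this file hang off the Crawler type, which lives in
// crawler.go and is not part of this diff. A minimal sketch of what this
// file assumes about it (the db field and pebble.Open are real; the
// constructor and remaining fields are hypothetical):
//
//	type Crawler struct {
//		db *pebble.DB
//		// ... HTTP client, worker state, etc. (crawler.go)
//	}
//
//	func NewCrawler(dir string) (*Crawler, error) {
//		db, err := pebble.Open(dir, &pebble.Options{})
//		if err != nil {
//			return nil, err
//		}
//		return &Crawler{db: db}, nil
//	}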

// Domain represents a host to be crawled for feeds
type Domain struct {
	Host          string    `json:"host"`   // Normalized hostname (no scheme, no www.)
	Status        string    `json:"status"` // "uncrawled", "crawled", "error"
	DiscoveredAt  time.Time `json:"discovered_at"`
	LastCrawledAt time.Time `json:"last_crawled_at,omitempty"`
	FeedsFound    int       `json:"feeds_found,omitempty"`
	LastError     string    `json:"last_error,omitempty"`
	TLD           string    `json:"tld,omitempty"`
}

// saveDomain stores a domain in PebbleDB
func (c *Crawler) saveDomain(domain *Domain) error {
	data, err := json.Marshal(domain)
	if err != nil {
		return fmt.Errorf("failed to marshal domain: %v", err)
	}

	key := []byte("domain:" + domain.Host)
	return c.db.Set(key, data, pebble.Sync)
}
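
// For illustration, a record as stored (key -> JSON value; field values
// hypothetical):
//
//	domain:example.com -> {"host":"example.com","status":"uncrawled",
//	    "discovered_at":"2024-01-01T00:00:00Z",
//	    "last_crawled_at":"0001-01-01T00:00:00Z","tld":"com"}
//
// One caveat worth knowing: `json:",omitempty"` has no effect on struct
// types such as time.Time, so last_crawled_at is always serialized, as
// the zero timestamp for hosts that have never been crawled.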

// getDomain retrieves a domain from PebbleDB
func (c *Crawler) getDomain(host string) (*Domain, error) {
	key := []byte("domain:" + normalizeHost(host))
	data, closer, err := c.db.Get(key)
	if err != nil {
		if err == pebble.ErrNotFound {
			return nil, nil
		}
		return nil, err
	}
	defer closer.Close()

	var domain Domain
	if err := json.Unmarshal(data, &domain); err != nil {
		return nil, fmt.Errorf("failed to unmarshal domain: %v", err)
	}
	return &domain, nil
}
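
// lookupDomainExample shows the calling convention: a missing host comes
// back as (nil, nil), distinct from a storage error. Hypothetical helper,
// for illustration only.
func lookupDomainExample(c *Crawler, host string) {
	d, err := c.getDomain(host)
	if err != nil {
		fmt.Println("storage error:", err)
		return
	}
	if d == nil {
		fmt.Println("host not tracked yet:", host)
		return
	}
	fmt.Printf("%s is %s (%d feeds found)\n", d.Host, d.Status, d.FeedsFound)
}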

// domainExists checks if a domain already exists in the database
func (c *Crawler) domainExists(host string) bool {
	key := []byte("domain:" + normalizeHost(host))
	_, closer, err := c.db.Get(key)
	if err != nil {
		return false
	}
	closer.Close()
	return true
}
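
// Note: domainExists reports false on any Get error, not just
// pebble.ErrNotFound. For the import path below that is harmless (a
// transient storage error only means a host may be written again), but it
// should not be relied on anywhere a false negative matters.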

// GetUncrawledDomains returns all domains with status "uncrawled"
func (c *Crawler) GetUncrawledDomains() ([]*Domain, error) {
	var domains []*Domain

	iter, err := c.db.NewIter(&pebble.IterOptions{
		LowerBound: []byte("domain:"),
		UpperBound: []byte("domain:\xff"),
	})
	if err != nil {
		return nil, err
	}
	defer iter.Close()

	for iter.First(); iter.Valid(); iter.Next() {
		var domain Domain
		if err := json.Unmarshal(iter.Value(), &domain); err != nil {
			continue
		}
		if domain.Status == "uncrawled" {
			domains = append(domains, &domain)
		}
	}

	if err := iter.Error(); err != nil {
		return nil, err
	}

	return domains, nil
}
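
// A note on the bounds above: "domain:\xff" works as an exclusive upper
// bound for the prefix scan because hostnames, once normalized here, are
// ASCII, so every "domain:..." key sorts below it. Keys containing
// arbitrary bytes would need a proper prefix-successor computation.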

// markDomainCrawled updates a domain's status after crawling
func (c *Crawler) markDomainCrawled(host string, feedsFound int, lastError string) error {
	domain, err := c.getDomain(host)
	if err != nil {
		return err
	}
	if domain == nil {
		return fmt.Errorf("domain not found: %s", host)
	}

	domain.LastCrawledAt = time.Now()
	domain.FeedsFound = feedsFound
	if lastError != "" {
		domain.Status = "error"
		domain.LastError = lastError
	} else {
		domain.Status = "crawled"
		domain.LastError = ""
	}

	return c.saveDomain(domain)
}
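
// crawlPendingExample sketches how a crawl loop is expected to combine
// GetUncrawledDomains with markDomainCrawled. The real per-host crawl
// lives in crawler.go (not in this diff), so it is a parameter here;
// hypothetical helper, for illustration only.
func crawlPendingExample(c *Crawler, crawl func(host string) (feeds int, err error)) error {
	domains, err := c.GetUncrawledDomains()
	if err != nil {
		return err
	}
	for _, d := range domains {
		feeds, crawlErr := crawl(d.Host)
		errMsg := ""
		if crawlErr != nil {
			errMsg = crawlErr.Error()
		}
		if err := c.markDomainCrawled(d.Host, feeds, errMsg); err != nil {
			return err
		}
	}
	return nil
}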

// GetDomainCount returns the total number of domains in the database and
// how many of them are still uncrawled
func (c *Crawler) GetDomainCount() (total int, uncrawled int, err error) {
	iter, err := c.db.NewIter(&pebble.IterOptions{
		LowerBound: []byte("domain:"),
		UpperBound: []byte("domain:\xff"),
	})
	if err != nil {
		return 0, 0, err
	}
	defer iter.Close()

	for iter.First(); iter.Valid(); iter.Next() {
		total++
		var domain Domain
		if err := json.Unmarshal(iter.Value(), &domain); err != nil {
			continue
		}
		if domain.Status == "uncrawled" {
			uncrawled++
		}
	}

	if err := iter.Error(); err != nil {
		return 0, 0, err
	}

	return total, uncrawled, nil
}
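
// progressExample shows how the dashboard's completion percentage can be
// derived from GetDomainCount; the actual dashboard handler (port 4321)
// is in another file. Hypothetical helper, for illustration only.
func progressExample(c *Crawler) (float64, error) {
	total, uncrawled, err := c.GetDomainCount()
	if err != nil || total == 0 {
		return 0, err
	}
	return 100 * float64(total-uncrawled) / float64(total), nil
}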

// ImportDomainsFromFile reads a vertices file and stores new domains as "uncrawled"
func (c *Crawler) ImportDomainsFromFile(filename string, limit int) (imported int, skipped int, err error) {
	file, err := os.Open(filename)
	if err != nil {
		return 0, 0, fmt.Errorf("failed to open file: %v", err)
	}
	defer file.Close()

	return c.parseAndStoreDomains(file, limit)
}
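
// The vertices input is one "<id>\t<reversed host>" record per line,
// optionally gzip-compressed (the parser below sniffs the 0x1f 0x8b magic
// bytes), in the style of host-level webgraph vertex dumps. A sample
// line, values hypothetical:
//
//	42	com.example.blog
//
// which reverseHost turns back into "blog.example.com".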

func (c *Crawler) parseAndStoreDomains(reader io.Reader, limit int) (imported int, skipped int, err error) {
	var bodyReader io.Reader

	bufReader := bufio.NewReader(reader)
	peekBytes, err := bufReader.Peek(2)
	if err != nil && err != io.EOF {
		return 0, 0, fmt.Errorf("failed to peek at file: %v", err)
	}

	if len(peekBytes) >= 2 && peekBytes[0] == 0x1f && peekBytes[1] == 0x8b {
		gzReader, err := gzip.NewReader(bufReader)
		if err != nil {
			return 0, 0, fmt.Errorf("failed to create gzip reader: %v", err)
		}
		defer gzReader.Close()
		bodyReader = gzReader
	} else {
		bodyReader = bufReader
	}

	scanner := bufio.NewScanner(bodyReader)
	buf := make([]byte, 0, 64*1024)
	scanner.Buffer(buf, 1024*1024)

	now := time.Now()
	count := 0

	for scanner.Scan() {
		if limit > 0 && count >= limit {
			break
		}

		line := scanner.Text()
		parts := strings.Split(line, "\t")
		if len(parts) >= 2 {
			reverseHostName := strings.TrimSpace(parts[1])
			if reverseHostName != "" {
				host := normalizeHost(reverseHost(reverseHostName))
				count++

				// Skip if domain already exists
				if c.domainExists(host) {
					skipped++
					continue
				}

				// Store new domain as uncrawled
				domain := &Domain{
					Host:         host,
					Status:       "uncrawled",
					DiscoveredAt: now,
					TLD:          getTLD(host),
				}
				if err := c.saveDomain(domain); err != nil {
					continue
				}
				imported++
			}
		}
	}

	if err := scanner.Err(); err != nil {
		return imported, skipped, fmt.Errorf("error reading file: %v", err)
	}

	return imported, skipped, nil
}
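
// normalizeHost, reverseHost, and getTLD live in util.go in this commit
// and are not part of this diff. Minimal sketches of the behavior this
// file relies on (assumptions, not the actual implementations):
//
//	// normalizeHost strips the scheme and a leading "www." prefix.
//	func normalizeHost(host string) string {
//		host = strings.ToLower(strings.TrimSpace(host))
//		host = strings.TrimPrefix(host, "http://")
//		host = strings.TrimPrefix(host, "https://")
//		host = strings.TrimPrefix(host, "www.")
//		return strings.TrimSuffix(host, "/")
//	}
//
//	// reverseHost flips "com.example.blog" into "blog.example.com".
//	func reverseHost(reversed string) string {
//		parts := strings.Split(reversed, ".")
//		for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
//			parts[i], parts[j] = parts[j], parts[i]
//		}
//		return strings.Join(parts, ".")
//	}
//
//	// getTLD returns the last label of a host ("com" for "example.com").
//	func getTLD(host string) string {
//		if i := strings.LastIndex(host, "."); i >= 0 {
//			return host[i+1:]
//		}
//		return host
//	}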