Add PebbleDB storage, domain tracking, and web dashboard

- Split main.go into separate files for better organization:
  crawler.go, domain.go, feed.go, parser.go, html.go, util.go
- Add PebbleDB for persistent storage of feeds and domains
- Store feeds with metadata: title, TTL, update frequency, ETag, etc.
- Track domains with crawl status (uncrawled/crawled/error)
- Normalize URLs by stripping the scheme and any leading www. prefix
  (see the sketch after this list)
- Add web dashboard on port 4321 with real-time stats:
  - Crawl progress with completion percentage
  - Feed counts by type (RSS/Atom)
  - Top TLDs and domains by feed count
  - Recent feeds table
- Filter out comment feeds from results
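
A minimal sketch of that normalization step, assuming a helper named
normalizeURL (the committed name and edge-case handling in util.go may
differ):

// normalizeURL reduces a URL to a canonical key so that
// http://www.example.com/feed and https://example.com/feed
// map to the same stored entry. Hypothetical helper; the real
// util.go may differ. Requires only the standard library ("strings").
func normalizeURL(raw string) string {
	u := strings.TrimPrefix(raw, "https://")
	u = strings.TrimPrefix(u, "http://")
	return strings.TrimPrefix(u, "www.")
}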

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
primal committed 2026-01-22 16:29:00 -05:00
parent 0dd612b7e1
commit 219b49352e
9 changed files with 1574 additions and 642 deletions
html.go (+122)
@@ -0,0 +1,122 @@
package main

import (
	"regexp"
	"strings"

	"golang.org/x/net/html"
)

// simpleFeed is a lightweight feed reference used during HTML extraction.
type simpleFeed struct {
	URL  string
	Type string
}
// isFeedContent reports whether a response looks like an RSS or Atom
// feed, first by Content-Type and then by sniffing the body.
func (c *Crawler) isFeedContent(body, contentType string) bool {
	// Explicit feed media types, plus the generic XML types many
	// servers use when serving feeds.
	if strings.Contains(contentType, "application/rss+xml") ||
		strings.Contains(contentType, "application/atom+xml") ||
		strings.Contains(contentType, "application/xml") ||
		strings.Contains(contentType, "text/xml") {
		return true
	}
	// Fall back to sniffing: an XML prolog followed by an <rss> or
	// <feed> root element.
	body = strings.TrimSpace(body)
	if strings.HasPrefix(body, "<?xml") {
		if strings.Contains(body, "<rss") || strings.Contains(body, "<feed") {
			return true
		}
	}
	return false
}
// detectFeedType distinguishes RSS from Atom by their root elements.
func (c *Crawler) detectFeedType(body string) string {
	if strings.Contains(body, "<rss") {
		return "rss"
	}
	if strings.Contains(body, "<feed") {
		return "atom"
	}
	return "unknown"
}
// extractFeedLinks walks the parsed HTML tree and collects feed
// references declared via <link rel="alternate" type="application/...">.
func (c *Crawler) extractFeedLinks(n *html.Node, baseURL string) []simpleFeed {
	feeds := make([]simpleFeed, 0)
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "link" {
			var rel, href, typeAttr string
			for _, attr := range n.Attr {
				switch attr.Key {
				case "rel":
					rel = attr.Val
				case "href":
					href = attr.Val
				case "type":
					typeAttr = attr.Val
				}
			}
			if rel == "alternate" && (typeAttr == "application/rss+xml" || typeAttr == "application/atom+xml") {
				// makeAbsoluteURL (util.go) resolves href against baseURL.
				absURL := makeAbsoluteURL(href, baseURL)
				feedType := "rss"
				if typeAttr == "application/atom+xml" {
					feedType = "atom"
				}
				feeds = append(feeds, simpleFeed{URL: absURL, Type: feedType})
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return feeds
}
// feedPattern matches hrefs that look like feed URLs. Compiled once at
// package init rather than on every extractAnchorFeeds call.
var feedPattern = regexp.MustCompile(`(?i)(rss|atom|feed)`)

// extractAnchorFeeds scans anchor hrefs for feed-looking URLs, as a
// fallback when the page declares no <link rel="alternate"> tags.
func (c *Crawler) extractAnchorFeeds(n *html.Node, baseURL string) []simpleFeed {
	feeds := make([]simpleFeed, 0)
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" && feedPattern.MatchString(attr.Val) {
					absURL := makeAbsoluteURL(attr.Val, baseURL)
					feeds = append(feeds, simpleFeed{URL: absURL, Type: "unknown"})
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return feeds
}
// extractLinks collects every anchor href, resolved against baseURL,
// for the crawl queue.
func (c *Crawler) extractLinks(n *html.Node, baseURL string) []string {
	links := make([]string, 0)
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					links = append(links, makeAbsoluteURL(attr.Val, baseURL))
				}
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			f(child)
		}
	}
	f(n)
	return links
}
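
For context, a sketch of how these helpers might be wired together by
the fetch path in crawler.go, which is not part of this hunk; the
discoverFeeds name, signature, and fallback order here are assumptions:

// discoverFeeds is a hypothetical call site; the real crawler.go in
// this commit may differ. html.Parse and strings are already imported
// by this file.
func (c *Crawler) discoverFeeds(pageURL, body string) ([]simpleFeed, []string, error) {
	doc, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return nil, nil, err
	}
	// Prefer feeds the page declares explicitly in its <head>.
	feeds := c.extractFeedLinks(doc, pageURL)
	if len(feeds) == 0 {
		// Fall back to anchor scanning when no <link> tags declare feeds.
		feeds = c.extractAnchorFeeds(doc, pageURL)
	}
	// Every other link on the page goes back into the crawl frontier.
	links := c.extractLinks(doc, pageURL)
	return feeds, links, nil
}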